In [7]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas.plotting import lag_plot
import sklearn.linear_model
import time
%matplotlib inline

# Downloading data directly from Google Drive

In [None]:
'''
Follow directions here to create an API to access the data hosted on 
the drive: https://pythonhosted.org/PyDrive/quickstart.html

Once the client_secrets.json file is in ./Marin Workspace/ run the code 
below just once
'''
# Create the PyDrive authentication object. The name `gauth` is reused by the
# download cell that follows, so this cell has to run first.
gauth = GoogleAuth()
# Start the authentication flow described in the quickstart linked above.
# NOTE(review): presumably this needs client_secrets.json to be discoverable
# from the notebook's working directory - confirm against the PyDrive docs.
gauth.LocalWebserverAuth()

In [None]:
'''
Run the following code to download the data using the file's ID
'''
# Google Drive file ID of the sector-filled S&P 500 CSV.
SP500_sectors_filled = '1S6lRlfRRVJT2pH_fLBgX9ZbWjroD-DSZ'

# `gauth` must already hold valid credentials (see the authentication cell).
drive = GoogleDrive(gauth)

# Look the file up by the ID defined above. The previous version passed
# `all_stocks_5yr`, a name that is not defined in this notebook, so the cell
# failed with a NameError before anything was downloaded.
data = drive.CreateFile({'id': SP500_sectors_filled})

# Save the file's content next to the notebook, where the load cell expects it.
data.GetContentFile('SP500_sectors_filled.csv')

# Load stock data and sector data

In [15]:
# Read the four input CSVs from the working directory.
#   stock_data  - per-ticker daily prices; the 'date' column is parsed to datetime
#   sector_data - sector lookup keyed by a 'Symbol' column (see combine_dataframes)
#   sp_filled   - stock data that already carries per_change and Sector columns
#   sp_metrics  - NOTE(review): contents are not used anywhere in the visible cells
#
# NOTE(review): the saved output of this cell is
#   UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 0-1
# so at least one of these files is not UTF-8. Which one is not visible here;
# TODO confirm and pass an explicit `encoding=` to that read_csv call.
stock_data = pd.read_csv('all_stocks_5yr.csv', parse_dates = ['date'])
sector_data = pd.read_csv('sectors.csv')
sp_filled = pd.read_csv('SP500_sectors_filled.csv')
sp_metrics = pd.read_csv('sp_metrics.csv')

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 0-1: invalid continuation byte

In [11]:
sp_filled.head(5)

Unnamed: 0,date,open,high,low,close,volume,Name,per_change,Sector
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL,-0.021234,Industrials
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL,-0.028878,Industrials
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL,-0.012457,Industrials
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL,0.025175,Industrials
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL,-0.063588,Industrials


In [12]:
sp_filled.isnull().sum()

date           0
open          11
high           8
low            8
close          0
volume         0
Name           0
per_change    11
Sector         0
dtype: int64

In [13]:
# Remove every row that has at least one missing value; the null counts shown
# above report gaps in open, high, low and per_change. This modifies sp_filled
# in place, so the unfiltered frame is only recoverable by re-reading the CSV.
sp_filled.dropna(inplace = True)

In [14]:
sp_filled.isnull().sum()

date          0
open          0
high          0
low           0
close         0
volume        0
Name          0
per_change    0
Sector        0
dtype: int64

In [5]:
def combine_dataframes(stock, sector):
    '''
    Match every stock row to its company name and sector.

    Parameters
    ----------
    stock : pandas.DataFrame
        Price data with a 'Name' column holding the ticker and a 'date' column.
    sector : pandas.DataFrame
        Sector lookup with a 'Symbol' column holding the ticker. If it has a
        'Name' column (the company name) it is returned as 'full_name'.

    Returns
    -------
    pandas.DataFrame
        One row per row of `stock`, in the same order, indexed by 'date'.
        The ticker column 'Name' comes first, followed by the remaining stock
        columns and then the sector columns. Tickers that do not appear in
        `sector` keep their row and get NaN in the sector columns.

    Notes
    -----
    Neither input frame is modified, so the function can safely be called
    more than once with the same arguments.
    '''
    # Work on derived frames only; the caller's frames keep their columns/index.
    sector_by_ticker = (
        sector
        .set_index('Symbol')
        .rename(columns={'Name': 'full_name'})
    )

    # Left join of the ticker column against the lookup's index: every stock
    # row is kept (and stays in place) even though tickers repeat once per day.
    df = stock.join(sector_by_ticker, on='Name').set_index('date')

    # Keep the historical column layout: ticker first, then everything else.
    ordered_columns = ['Name'] + [col for col in df.columns if col != 'Name']
    return df[ordered_columns]

In [6]:
# Build the merged stock/sector frame from the two frames read in the load
# cell (the S&P 500 and sector CSVs are assumed to be in the working directory).
# The bare `df` on the last line displays the result as the cell output.
df = combine_dataframes(stock_data, sector_data)
df

Unnamed: 0_level_0,Name,open,high,low,close,volume,full_name,Sector
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-02-08,AAL,15.07,15.1200,14.6300,14.75,8407500,American Airlines Group,Industrials
2013-02-11,AAL,14.89,15.0100,14.2600,14.46,8882000,American Airlines Group,Industrials
2013-02-12,AAL,14.45,14.5100,14.1000,14.27,8126000,American Airlines Group,Industrials
2013-02-13,AAL,14.30,14.9400,14.2500,14.66,10259500,American Airlines Group,Industrials
2013-02-14,AAL,14.94,14.9600,13.1600,13.99,31879900,American Airlines Group,Industrials
2013-02-15,AAL,13.93,14.6100,13.9300,14.50,15628000,American Airlines Group,Industrials
2013-02-19,AAL,14.33,14.5600,14.0800,14.26,11354400,American Airlines Group,Industrials
2013-02-20,AAL,14.17,14.2600,13.1500,13.33,14725200,American Airlines Group,Industrials
2013-02-21,AAL,13.62,13.9500,12.9000,13.37,11922100,American Airlines Group,Industrials
2013-02-22,AAL,13.57,13.6000,13.2100,13.57,6071400,American Airlines Group,Industrials


In [None]:
stock_data.head(5)

In [None]:
sector_data.head(5)

In [None]:
#Check for empty entries
stock_data.isnull().sum()

In [None]:
#Check for empty entries
sector_data.isnull().sum()

In [None]:
# Drop every row of stock_data that has at least one missing value.
# The frame is modified in place, so later cells see only the complete rows.
stock_data.dropna(inplace = True) 

In [None]:
# Find the tickers shared by the two data sets and compare two ways of
# computing that intersection.
# Findings: there's missing data
stocks = set(stock_data.Name.unique())
sector = set(sector_data.Symbol.unique())

# Both measurements time only the computation itself: printing happens after
# the clock is stopped (the second measurement used to include a print call,
# which made the two numbers incomparable). perf_counter() is the clock meant
# for short intervals; time.time() can be too coarse at this scale.

# Approach 1: membership test per ticker in a list comprehension.
start = time.perf_counter()
intersection = [symbol for symbol in stocks if symbol in sector]
end = time.perf_counter()
print(len(intersection))
print(end-start)

# Approach 2: built-in set intersection (this is the value kept afterwards).
start = time.perf_counter()
intersection = stocks.intersection(sector)
end = time.perf_counter()
print(len(intersection))
print(end-start)

In [None]:
# Tickers that appear in exactly one of the two data sets (the symmetric
# difference), i.e. the ones that cannot be matched between stock_data and
# sector_data.
stocks.symmetric_difference(sector)

* CBG -> CBRE
* HCN -> WELL
* SNI -> ACQUIRED BY Discovery Communications (Drop)
* CHK -> REMOVED DUE TO MARKET CAP CHANGES (Drop) Energy Company
* PDCO -> REMOVED DUE TO MARKET CAP CHANGES (Drop)
* SIG -> REMOVED DUE TO MARKET CAP CHANGES (Drop)
* PCLN -> no data (?)

In [None]:
# Map tickers that were renamed (old symbol -> current symbol), following the
# list in the markdown notes above, and apply the mapping to stock_data.
replace_tickers = {'CBG': 'CBRE', 'HCN': 'WELL'}
# DataFrame.replace substitutes matching cell values in every column and leaves
# the index untouched, so this only renames tickers while 'Name' is a regular
# column of stock_data. The frame is modified in place.
stock_data.replace(replace_tickers, inplace = True)

In [None]:
# Fit a straight-line trend to AAPL's closing price:
# feature = days elapsed since the first AAPL row, target = closing price.
aapl_df = df[df['Name'] == 'AAPL']

# Number of leading rows used for training; the rest is held out for testing.
# NOTE(review): 629 is carried over unchanged - its rationale is not visible here.
n_train = 629

# Column vectors of shape (n_rows, 1), as expected by scikit-learn estimators.
days_since_start = (aapl_df.index - aapl_df.index[0]).days.values.reshape(-1,1)
close_prices = aapl_df['close'].values.reshape(-1,1)

X_train = days_since_start[:n_train]
y_train = close_prices[:n_train]

# Slice through to the end of the data. The earlier `[629:-1]` stopped one row
# short and silently left the most recent observation out of the test set.
X_test = days_since_start[n_train:]
y_test = close_prices[n_train:]

lin_reg_model = sklearn.linear_model.LinearRegression()

lin_reg_model.fit(X_train, y_train)

In [None]:
# Build the merged frame a second time from stock_data and sector_data.
# NOTE(review): presumably this is meant to pick up the ticker replacements
# applied to stock_data in the cell above - confirm.
df3 = combine_dataframes(stock_data, sector_data)

In [None]:
df3

In [None]:
#Testing tensorflow

In [None]:
import tensorflow as tf

In [None]:
# Define two variables (x = 3, y = 4) and the expression f = x*x*y + y + 2.
# The following cells evaluate f through a tf.Session, i.e. this notebook uses
# TensorFlow's graph/session style API; this cell only describes the computation.
x = tf.Variable(3, name = 'x')
y = tf.Variable(4, name = 'y')
f = x*x*y + y + 2

In [None]:
sess = tf.Session()

In [None]:
# Initialise each variable explicitly in the open session, then evaluate f.
# With x = 3 and y = 4 the expression 3*3*4 + 4 + 2 works out to 42.
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)

In [None]:
print(result)
sess.close()

In [None]:
init = tf.global_variables_initializer()

In [None]:
# Same evaluation as before, but through an InteractiveSession: init.run() and
# f.eval() are called without passing a session explicitly.
sess = tf.InteractiveSession()
init.run()
result = f.eval()
print(result)
# Release the session. It used to be left open (the close call was commented
# out) although tf.reset_default_graph() is called further down, and
# TensorFlow documents resetting the default graph while a session is still
# active as undefined behaviour.
sess.close()

In [None]:
x1 = tf.Variable(1)

In [None]:
x1.graph is tf.get_default_graph()

In [None]:
# Create a second, independent graph and add a variable to it. Inside the
# `with` block `graph` acts as the default graph, so x2 is created in it.
graph = tf.Graph()

with graph.as_default():
    x2 = tf.Variable(2)

# Checked outside the block: is x2's graph the current default graph?
# (Expected to be False, since x2 was created in `graph`; the next cell
# checks `x2.graph is graph`.)
x2.graph is tf.get_default_graph()

In [None]:
x2.graph is graph

In [None]:
tf.reset_default_graph()

In [None]:
from sklearn.datasets import fetch_california_housing

In [None]:
housing = fetch_california_housing()

In [None]:
m, n = housing.data.shape

In [None]:
housing_data_plus_bias = np.c_[np.ones((m,1)), housing.data]

In [None]:
# Closed-form linear regression (normal equation): theta = (X^T X)^-1 X^T y.
# X is the housing feature matrix with the leading column of ones added in the
# previous cell; y is the target reshaped to a single column. Both are stored
# as float32 constants.
# Note: `y` here rebinds the name used for the tf.Variable in the earlier
# f = x*x*y + y + 2 example.
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

In [None]:
# Evaluate theta inside a session opened with a `with` statement, so the
# session is cleaned up when the block exits. The result is kept in
# theta_value and displayed in the next cell.
with tf.Session() as sess:
    theta_value = theta.eval()

In [None]:
theta_value

In [None]:
print(X.shape)

In [None]:
print(y.shape)

In [None]:
print(housing.data.shape)