# Downloading data directly from Google

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [None]:
'''
Follow the directions here to create the Google API credentials needed to
access the data hosted on the drive:
https://pythonhosted.org/PyDrive/quickstart.html

Once the client_secrets.json file is in ./Marin Workspace/, run the code
below just once.
'''
# Authenticate with Google; `gauth` is reused by the download cell that
# builds the GoogleDrive client.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()

In [None]:
'''
Run the following code to download the data using the file's ID.
Requires the authenticated `gauth` object created by the authentication cell.
'''
# Google Drive file ID of the sector-labelled S&P 500 dataset
SP500_sectors_filled = '1S6lRlfRRVJT2pH_fLBgX9ZbWjroD-DSZ'
drive = GoogleDrive(gauth)
# Reference the ID defined above; the previous name (all_stocks_5yr) was never
# defined in this notebook and raised a NameError.
data = drive.CreateFile({'id': SP500_sectors_filled})
# Save the remote file's contents to the local CSV read later by pd.read_csv
data.GetContentFile('SP500_sectors_filled.csv')

# Load stock data and sector data

In [None]:
import time
import numpy as np
import pandas as pd
from pandas.plotting import lag_plot
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the local CSV inputs; the 'date' columns are parsed into datetimes on read
stock_data = pd.read_csv('all_stocks_5yr.csv', parse_dates=['date'])
sector_data = pd.read_csv('sectors.csv')
# Dataset used by the dense sequential model workflow further down
sp_filled = pd.read_csv('SP500_sectors_filled.csv', parse_dates=['date'])
# NOTE(review): beta is not referenced anywhere else in the visible cells
beta = pd.read_csv('PR_B.csv')

In [None]:
def combine_dataframes(stock_df, sector_df):
    '''
    Match every stock price row to the sector information of its ticker.

    First parameter must be the stock dataframe (ticker in a 'Name' column, plus a
    'date' column) and the second parameter must be the sector dataframe (ticker in
    a 'Symbol' column). Tickers that are missing from the sector dataframe keep
    their rows, with NaN in the sector columns.

    Returns a new dataframe indexed by 'date' that holds the ticker ('Name'), the
    stock columns and the sector columns. A 'Name' column in the sector dataframe
    is renamed to 'full_name' so that it does not clash with the ticker column.
    Neither input dataframe is modified.
    '''

    # Work on re-indexed copies so the caller's dataframes stay untouched and the
    # function can safely be called more than once on the same inputs.
    stocks = stock_df.set_index('Name')
    sectors = sector_df.set_index('Symbol').rename(columns={'Name': 'full_name'})

    # Left join on the ticker index: each sector row is repeated for every price
    # row of that ticker. This replaces pd.concat(..., join_axes=[...]), whose
    # join_axes argument was removed in pandas 1.0.
    df = stocks.join(sectors, how='left')

    # Pin the index name so reset_index() always yields a 'Name' column,
    # whatever name the join left on the index.
    df.index.name = 'Name'
    df = df.reset_index()

    return df.set_index('date')

# Dense Sequential Model - Helper Functions

In [None]:
import sklearn.linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint


In [None]:
def remove_nulls(df):
    '''
    Drops, in place, every row of the dataframe that contains at least one missing
    value and hands back that same (now mutated) dataframe.
    '''

    # axis=0 / how='any' spell out the dropna defaults: remove rows with any null
    df.dropna(axis=0, how='any', inplace=True)
    return df

In [None]:
def drop_columns(df, *args):
    '''
    Removes the named columns from the dataframe in place.
    The dataframe comes first, followed by each column name to remove as its own
    positional argument. Handy before feeding a model that cannot take datetime
    or label columns.
    Returns the same dataframe, mutated, without those columns.
    '''

    unwanted = list(args)
    df.drop(columns=unwanted, inplace=True)
    return df

In [None]:
def encode_sectors(df, sector_column):
    '''
    Replaces the values of the given sector column with integer labels, in place.
    Takes the dataframe and the name of the column that holds the sectors, and
    returns the array of classes learned by the encoder. Needed because Keras
    will not take in labels as strings.
    '''

    encoder = preprocessing.LabelEncoder()
    # Learn the classes first, then overwrite the column with their codes
    encoder.fit(df[sector_column])
    df[sector_column] = encoder.transform(df[sector_column])

    return encoder.classes_

In [None]:
def model_inputs(df, sector_column):
    '''
    Splits the dataframe into the feature matrix and the one-hot encoded targets
    for the keras model.
    Takes the dataframe and the name of the column that holds the (already
    encoded) sectors; every other column is used as a feature, so none of them may
    hold str or datetime values. Returns x as float32 and y as produced by
    to_categorical.
    '''

    # Features: everything except the sector column
    features = df.drop(columns=sector_column).values.astype(np.float32)

    # Targets: sector codes as a float32 column vector, then one-hot encoded
    labels = df[sector_column].values.astype(np.float32).reshape(-1, 1)

    return features, to_categorical(labels)

# Dense Sequential Model - Workflow

In [None]:
# Count the missing values in each column before cleaning
sp_filled.isna().sum()

In [None]:
# Drop the incomplete rows, then print the per-column null counts again
sp_filled = remove_nulls(sp_filled)
print(sp_filled.isna().sum())

In [None]:
# Remove the date and ticker columns, then turn the sector names into integer labels
sp_filled = drop_columns(sp_filled, 'date', 'Name')
sectors_encoded = encode_sectors(sp_filled, 'Sector')
X, y = model_inputs(sp_filled, 'Sector')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# L2-normalise the train and test features separately
X_train, X_test = (
    preprocessing.normalize(split, norm='l2') for split in (X_train, X_test)
)

In [None]:
# Sequential network: layers are stacked in the order listed.
# Two hidden layers of 5 ReLU units each (the first also declares the input
# dimensionality), followed by a softmax output with one unit per one-hot
# class in y (11 for 11 sectors).
model = Sequential([
    Dense(5, input_dim=X.shape[1], activation='relu'),
    Dense(5, activation='relu'),
    Dense(y.shape[1], activation='softmax'),
])

# Compile for multi-class classification
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit for 5 epochs; the test split is also used as the validation data
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Dense Sequential Model - Accuracy

In [None]:
# Reduce the predicted probabilities and the one-hot targets to class indices
model_pred = model.predict(X_test).argmax(axis=1)
y_test_model = y_test.argmax(axis=1)

# Accuracy of the predicted classes on the test split
accuracy_score = metrics.accuracy_score(y_test_model, model_pred)
print(accuracy_score)

In [None]:
# Confusion matrix of the true class indices against the predicted ones.
# The result is stored under its own name: assigning it to `confusion_matrix`
# would shadow the imported function and make this cell fail when re-run.
# Calling it through `metrics` also works if the name was already shadowed
# earlier in the session.
conf_matrix = metrics.confusion_matrix(y_test_model, model_pred)
print(conf_matrix)

In [None]:
#Score of 16% accuracy when using L2 normalization