In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw company fundamentals table from CSV.
df = pd.read_csv('fundamentals.csv')

In [None]:
# Discard every column whose name contains "Current" (not used as NN input),
# together with the leftover 'Unnamed: 0' column.
cols = [label for label in df.columns if 'Current' not in label]
df2 = df.loc[:, cols].drop(columns=['Unnamed: 0'])

In [None]:
#list(df2.columns)

In [None]:
# Remove every row that has at least one missing value, then preview the result.
df2 = df2.dropna(axis=0, how='any')
df2.head()

In [None]:
# S&P 500 price data with sector labels; the 'date' column is parsed as
# datetimes and used as the index.
df3 = pd.read_csv(
    'SP500_sectors_filled2.csv',
    parse_dates=["date"],
    index_col='date',
)

In [None]:
# Move the date index into a regular 'date' column (appended last) and make
# the 'Name' column the new index.
df3 = df3.assign(date=df3.index).set_index('Name')

In [None]:
# Preview the first rows of the S&P 500 price table.
df3.head()

In [None]:
# Collect the distinct tickers of each table so the two sets can be compared.
fundamental_names = df2.loc[:, 'Ticker Symbol'].unique()
sp500_names = df3.index.unique()

In [None]:
# Tickers that appear in exactly one of the two tables (symmetric difference).
set1, set2 = set(sp500_names), set(fundamental_names)
diff_names = set1 ^ set2
diff_names

In [None]:
# S&P 500 prices restricted to tickers that are not in diff_names, without the
# 'per_change' column.
# diff_names is a symmetric difference, so it may also contain tickers that
# exist only in the fundamentals table. DataFrame.drop raises KeyError for
# labels missing from the index, so rows are filtered with a membership mask,
# which simply ignores labels that are not present.
df4 = df3.loc[~df3.index.isin(diff_names)]
df4 = df4.drop(columns=['per_change'])

In [None]:
# Index the fundamentals by ticker. The 'Ticker Symbol' column is deliberately
# kept as a regular column too: the merge on ['Ticker Symbol', 'Year'] further
# down needs it. (A bare df2.drop(...) whose result is not assigned would not
# remove the column anyway.)
df2.index = df2['Ticker Symbol']
df2.head()

In [None]:
# The 'Period Ending' column is not used from here on.
df2 = df2.drop(columns=['Period Ending'])

In [None]:
# Display the S&P 500 price table after the ticker filtering above.
df4

In [None]:
# Keep only the rows of the Information Technology sector and count its
# distinct tickers.
is_info_tech = df4['Sector'] == 'Information Technology'
df5 = df4.loc[is_info_tech]
sec_only_name = df5.index.unique()
len(sec_only_name)

In [None]:
# Tickers that are in exactly one of: the Information Technology ticker set
# and the fundamentals index. These get dropped from the fundamentals next.
set3, set4 = set(sec_only_name), set(df2.index.unique())
diff_name_2 = set3 ^ set4
len(diff_name_2)

In [None]:
# Fundamentals without the rows whose ticker is listed in diff_name_2.
df6 = df2.drop(index=diff_name_2)

In [None]:
# Number of distinct tickers left in df6; expected to equal len(sec_only_name).
df6.index.unique().size

In [None]:
# Display the fundamentals that remain after dropping the tickers in diff_name_2.
df6

In [None]:
# Swap ticker and date: the ticker index becomes a 'name' column (appended
# last) and the 'date' column becomes the index.
# df5 was produced by boolean filtering, so writing a column into it in place
# triggers pandas' SettingWithCopyWarning; assign() builds a new frame instead.
df5 = df5.assign(name=df5.index).set_index('date')
df5

In [None]:
# Compute every ticker's price return (PR) over 2015 and over 2016, so that
# the fundamentals of 2014 and 2015 can be paired with the following year's
# return and we can see whether fundamentals affect next year's PR.
def _price_returns(year_prices, tickers):
    """Return {ticker: (last_close - first_close) / first_close} for one year.

    year_prices: price rows of a single year with 'name' and 'close' columns;
        the row order defines which close counts as "first" and "last".
    tickers: iterable of ticker symbols to evaluate.

    Tickers that have no row in year_prices are skipped instead of raising
    IndexError, so they are simply absent from the returned dict.
    """
    returns = {}
    for ticker in tickers:
        closes = year_prices.loc[year_prices['name'] == ticker, 'close']
        if closes.empty:
            continue
        # .iloc states the positional intent explicitly; plain [0] / [-1] on a
        # date-indexed Series relies on deprecated positional fallback.
        first_close = closes.iloc[0]
        last_close = closes.iloc[-1]
        returns[ticker] = (last_close - first_close) / first_close
    return returns


# Select the rows of one year with .loc and a partial date string; selecting
# rows with df['2015'] is not supported by current pandas versions.
for_2015 = df5.loc['2015']
for_2016 = df5.loc['2016']
PR_2015 = _price_returns(for_2015, sec_only_name)
PR_2016 = _price_returns(for_2016, sec_only_name)

In [None]:
# Distinct reporting years present in the fundamentals data.
dates = pd.unique(df6['For Year'])
dates

In [None]:
# df7: fundamentals with 'For Year' renamed to 'Year' (and string index
# labels), without the rows for 2012, 2013 and 2016.
df7 = df6.rename(index=str, columns={"For Year" : "Year"})
df7 = df7[~df7['Year'].isin([2012, 2013, 2016])]

In [None]:
# Display the fundamentals left after removing the 2012, 2013 and 2016 rows.
df7

In [None]:
# Turn the 2015 returns into a table. 'Year' is set to 2014 because these
# returns are matched with the fundamentals of the preceding year.
df_PR2015 = (
    pd.DataFrame(list(PR_2015.items()))
    .rename(index=str, columns={0 : "Ticker Symbol", 1 : "Percent_Return"})
    .assign(Year=2014.0)
)
df_PR2015

In [None]:
# Turn the 2016 returns into a table. 'Year' is set to 2015 because these
# returns are matched with the fundamentals of the preceding year.
df_PR2016 = (
    pd.DataFrame(list(PR_2016.items()))
    .rename(index=str, columns={0 : "Ticker Symbol", 1 : "Percent_Return"})
    .assign(Year=2015.0)
)
df_PR2016

In [None]:
# Stack the two yearly return tables: one row per (ticker, fundamentals year)
# holding the return of the following year in 'PR_NXT_YR'.
combined = (
    pd.concat([df_PR2015, df_PR2016], axis=0)
    .sort_values('Ticker Symbol')
    .rename(index=str, columns={"Percent_Return" : "PR_NXT_YR"})
)
# Label the rows by ticker but leave the index unnamed. If the index were
# named 'Ticker Symbol' while a column of that name exists, merging on
# 'Ticker Symbol' would be ambiguous and current pandas raises ValueError.
combined.index = combined['Ticker Symbol'].values

# Merge df7 into combined on key = Ticker Symbol and Year. df7 also carries
# 'Ticker Symbol' both as index name and as a column (set when the
# fundamentals were indexed by ticker), so its index is discarded for the
# merge; the column remains the join key.
df8 = pd.merge(
    combined,
    df7.reset_index(drop=True),
    how='left',
    on=['Ticker Symbol', 'Year'],
)
# Drop the (ticker, year) rows with missing values, e.g. no matching fundamentals.
df8 = df8.dropna().reset_index(drop=True)
df8

In [None]:
# df8 has no nulls left at this point (df8.isnull().sum() can confirm it).
# Set the next-year return aside as the source of the labels, then remove the
# key and return columns that are no longer necessary as features.
# The return was merged in before being dropped so that it stays row-aligned
# with the fundamentals.
PR_NXT_YR = df8['PR_NXT_YR']
df8 = df8.drop(columns=['Ticker Symbol', 'Year', 'PR_NXT_YR'])
#Yes I did all that then drop the 'PR_NXT_YR'? I did that to align all data correctly

In [None]:
# Binary target: 1 when the next-year price return is positive, 0 otherwise.
# A vectorised comparison yields exactly one label per row of PR_NXT_YR.
# Appending labels in an if > 0 / elif < 0 loop emits nothing for a return of
# exactly 0, which shifts every later label onto the wrong row and leaves a
# missing label at the end.
Pos_Or_Neg = (PR_NXT_YR > 0).astype(int).to_frame(name=0)
# Assign the column as a Series so pandas aligns the labels to df8 by index.
df8['Outcome'] = Pos_Or_Neg[0]
df8

In [None]:
# Number of columns in df8 (features plus the 'Outcome' label).
print(df8.shape[1])

`df8` is the final dataset we can use for our NN.
`df8` consists of:
- companies from the "Information Technology" sector only
- an `Outcome` label derived from each company's price return over 2015 and 2016: 1 for a positive return, 0 for a negative one
- fundamental data of those companies as of the end of 2014 and 2015, each paired with the return of the following year

# Now I will use Marin's NN implementation to test whether these fundamental data can predict a positive (1) or negative (0) return.

In [None]:
import sklearn.linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

In [None]:
def model_inputs(df, column):
    '''
    Split a DataFrame into network inputs and one-hot encoded targets.

    df: DataFrame holding the feature columns and the target column.
    column: label of the target column (in our case the 0/1 outcome).

    Returns (x, y): x is a float32 2d array of all columns except "column";
    y is the target column passed through to_categorical, i.e. one column
    per outcome class.
    '''
    features = df.drop(column, axis=1).values.astype(np.float32)
    labels = df[column].values.astype(np.float32).reshape(-1, 1)
    return features, to_categorical(labels)

In [None]:
# Build the feature matrix and one-hot labels, then hold out 25% as test data.
# random_state=42 keeps the split reproducible (42 is always the answer).
X, y = model_inputs(df8, 'Outcome')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Normalize the data; normalize() rescales each sample (row) to unit norm and
# uses the L2 norm by default.
X_train, X_test = (preprocessing.normalize(part) for part in (X_train, X_test))

In [None]:
# Sequential: a plain stack of layers in which each layer feeds the next one.
model = Sequential()

# hidden layer 1; input_dim is the number of feature columns
model.add(Dense(128, input_dim=X.shape[1], activation='relu'))

# hidden layer 2
model.add(Dense(8, activation='relu'))

# Output layer: one unit per class of the one-hot targets. softmax makes the
# outputs a probability distribution over the classes, which is what
# categorical cross-entropy expects; independent sigmoid units do not sum to 1.
model.add(Dense(y.shape[1], activation='softmax'))

# compile model; 'accuracy' compares the predicted class with the one-hot target
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit model
# NOTE(review): the test split doubles as validation data here. That only
# monitors training, but it must not be used for early stopping or model
# selection, otherwise the reported test accuracy becomes optimistic.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100)

In [None]:
# Convert the predicted class scores and the one-hot targets back to class
# indices, then measure the share of matching predictions on the test split.
model_pred = model.predict(X_test).argmax(axis=1)
y_test_model = y_test.argmax(axis=1)

accuracy_score = metrics.accuracy_score(y_test_model, model_pred)
print(accuracy_score)

#Max = .6154
#Mode = .5769
#Min = .5