# Preprocessing
## Using 2018 data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt


### Data cleaning

In [2]:
# Price data: index_col=0 makes the first CSV column the row index.
sp_df = pd.read_csv(filepath_or_buffer='s&p500_processed.csv', index_col=0)
# News sentiment: read with pandas' default integer row index.
news_df = pd.read_csv(filepath_or_buffer='news_sentiments.csv')


In [3]:
# Preview the first rows of the frame read from 's&p500_processed.csv'.
sp_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,up/down,true_value
0,1980-01-02,-1.157349,-1.064447,-1.065108,-1.066003,-0.960949,1,-1
1,1980-01-03,-1.157349,-1.066807,-1.067169,-1.066548,-0.955609,1,1
2,1980-01-04,-1.157349,-1.065803,-1.065311,-1.065236,-0.96175,1,1
3,1980-01-07,-1.157349,-1.06508,-1.06459,-1.064943,-0.958844,1,1
4,1980-01-08,-1.157349,-1.063584,-1.064093,-1.062783,-0.954035,1,1


In [4]:
# Preview the first rows of the frame read from 'news_sentiments.csv'.
news_df.head()

Unnamed: 0,date,News Sentiment
0,1980-01-01,-0.03926
1,1980-01-02,-0.109253
2,1980-01-03,-0.093304
3,1980-01-05,-0.07217
4,1980-01-06,-0.09514


In [5]:
# Parse the news dates with pandas' vectorized parser. Unlike a row-wise
# datetime.strptime (which only accepts str), pd.to_datetime also accepts
# values that are already datetimes, so re-running this cell does not raise.
news_df['date'] = pd.to_datetime(news_df['date'], format="%Y-%m-%d")
# Give both frames a common lowercase 'date' key and a shorter sentiment label.
# rename() ignores labels that are absent, so these are no-ops on a re-run.
sp_df = sp_df.rename(columns={'Date': 'date'})
news_df = news_df.rename(columns={'News Sentiment': 'Sentiment'})
# Cast the S&P dates to datetime64[ns] so both 'date' columns share a dtype.
sp_df['date'] = sp_df['date'].astype('datetime64[ns]')

In [6]:
# Check the column dtypes after the date conversion and rename.
news_df.dtypes

date         datetime64[ns]
Sentiment           float64
dtype: object

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize every non-date column of news_df with StandardScaler.
normalizer = StandardScaler()
df_dropped = news_df.drop(columns='date')
# fit_transform returns a bare ndarray, so reattach both the column labels and
# the original row index. DataFrame.insert aligns `value` on index; without the
# original index, dates would be misaligned whenever news_df is not indexed 0..n-1.
normalizer_news_df = pd.DataFrame(
    normalizer.fit_transform(df_dropped),
    columns=df_dropped.columns,
    index=df_dropped.index,
)
# Put the date back as the first column.
normalizer_news_df.insert(loc=0, column='date', value=news_df['date'])

In [8]:
# Re-display the S&P frame to confirm the 'Date' column is now named 'date'.
sp_df.head()

Unnamed: 0,date,Open,High,Low,Close,Volume,up/down,true_value
0,1980-01-02,-1.157349,-1.064447,-1.065108,-1.066003,-0.960949,1,-1
1,1980-01-03,-1.157349,-1.066807,-1.067169,-1.066548,-0.955609,1,1
2,1980-01-04,-1.157349,-1.065803,-1.065311,-1.065236,-0.96175,1,1
3,1980-01-07,-1.157349,-1.06508,-1.06459,-1.064943,-0.958844,1,1
4,1980-01-08,-1.157349,-1.063584,-1.064093,-1.062783,-0.954035,1,1


In [9]:
# Preview the standardized sentiment values next to their dates.
normalizer_news_df.head()

Unnamed: 0,date,Sentiment
0,1980-01-01,-0.259934
1,1980-01-02,-0.632982
2,1980-01-03,-0.547975
3,1980-01-05,-0.435337
4,1980-01-06,-0.557763


In [10]:
# Left-join the standardized sentiment onto the price rows by date, keep the
# columns in a fixed order with 'true_value' last, and discard any row that
# contains a NaN (e.g. a price date with no matching sentiment row).
column_order = ['date', 'Open', 'High', 'Low', 'Close', 'Volume', 'up/down', 'Sentiment', 'true_value']
df = (
    sp_df
    .merge(normalizer_news_df, how="left", on="date")
    .loc[:, column_order]
    .dropna()
)

In [11]:
# Display the merged frame (the rendered output elides the middle rows).
df

Unnamed: 0,date,Open,High,Low,Close,Volume,up/down,Sentiment,true_value
0,1980-01-02,-1.157349,-1.064447,-1.065108,-1.066003,-0.960949,1,-0.632982,-1
1,1980-01-03,-1.157349,-1.066807,-1.067169,-1.066548,-0.955609,1,-0.547975,1
3,1980-01-07,-1.157349,-1.065080,-1.064590,-1.064943,-0.958844,1,-0.766564,1
4,1980-01-08,-1.157349,-1.063584,-1.064093,-1.062783,-0.954035,1,-1.139702,1
5,1980-01-09,-1.157349,-1.061777,-1.061940,-1.062682,-0.947613,1,-1.267481,1
...,...,...,...,...,...,...,...,...,...
10689,2022-05-23,2.769766,2.824832,2.796829,2.837995,0.852602,1,-0.469044,-1
10690,2022-05-24,2.793333,2.798525,2.762400,2.805425,1.127906,-1,-0.533289,1
10691,2022-05-25,2.779956,2.842354,2.813063,2.843022,1.355428,1,-0.645402,1
10692,2022-05-26,2.835074,2.918473,2.873545,2.922868,1.160529,1,-0.683530,1


In [16]:
# Write the merged, NaN-free frame to disk. The row index is written as the
# first CSV column (to_csv default), so readers need index_col=0 to restore it.
df.to_csv(path_or_buf='cleaned_data.csv')

In [12]:
# y = df["true_value"]
# X = df.iloc[:, 1:-1]

In [13]:
# #split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# scores = []

# #run logistic model
# log = LogisticRegression(random_state=42).fit(X_train, y_train)
# scores = np.append(scores,log.score(X_test, y_test))

# #run random forest
# ran = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
# scores = np.append(scores,ran.score(X_test, y_test))

# #run decision tree
# tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
# scores = np.append(scores,tree.score(X_test, y_test))

# #run SVM 
# svm = SVC(gamma='auto').fit(X_train, y_train)
# scores = np.append(scores,svm.score(X_test, y_test))

# #run KNN
# neighbor = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# scores = np.append(scores,neighbor.score(X_test, y_test))


In [14]:
# plt.figure()
# plt.bar(range(5), scores)
# labels = ['Logistic','Random Forest','Decision Tree','SVM','KNN']
# plt.xticks(range(5),labels,rotation='vertical')
# plt.title('Models')
# # plt.xlabel('False Positive Rate')
# # plt.ylabel('True Positive Rate')

In [15]:
# from sklearn.metrics import roc_auc_score, roc_curve
# import matplotlib.pyplot as plt

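# NOTE: `clf` is not assigned in the cells shown above; set it to one of the
# fitted models (e.g. `clf = log`) before re-enabling this cell.
# prob = clf.predict_proba(X_test)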
# prob = prob[:,1]

# roc_auc = roc_auc_score(y_test, prob)
# fpr, tpr, thresholds = roc_curve(y_test, prob)

# # plotting the ROC curve
# plt.figure(dpi=100)
# plt.plot(fpr, tpr)
# plt.title('ROC curve')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')

# print('Area under the Receiver Operating Characteristic curve:', 
#       roc_auc)