In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb

# Loading Data and Training Models

1. Logistic Regression (classifier)
2. Random Forest Classifier
3. XGBoost Classifier
4. Ensemble of the three classifiers (majority vote)

In [2]:
# Single place to point the notebook at the interim data directory; edit this
# one constant when running on another machine.
DATA_DIR = Path('/Users/mackdelany/Documents/DSR Notebooks/NLP-stock-price-prediction/data/interim')

# CSV inputs: the text feature matrix and the prototype labels.
text_path = DATA_DIR / 'text_features.csv'
labels_path = DATA_DIR / 'prototype_labels.csv'

text = pd.read_csv(text_path)
labels = pd.read_csv(labels_path)

In [5]:
# Join the text features to the labels on their shared 'Date' column, then
# discard the join key and every row that still contains a missing value.
data = (
    text
    .merge(labels, how='inner', left_on='Date', right_on='Date')
    .drop(columns=['Date'])
    .dropna()  # dropped 77 NA rows
)
data.head()

Unnamed: 0,00,000,000bpd,000ft,000km,000new,000s,000sq,000th,000usd,...,zuckerman,zug,zulu,zuma,zumar,zumas,zurich,zuyevo,zweimal,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.030702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Feature matrix: every column except 'Label'. Target vector: the 'Label' column.
X = data.drop(columns=['Label']).to_numpy()
y = data['Label'].values

# Hold out 10% of the rows for testing. shuffle=False keeps the original row
# order, so the test rows are the last ones in the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    shuffle=False,
    random_state=0,
)

In [None]:
# Disabled experiment, parked inside a string literal so that it never executes.
# If enabled, it would pair each row of X with the label of the following row.
# NOTE(review): X_train/X_test/y_train/y_test are produced by train_test_split
# in an earlier cell, so re-enabling this here would not change them; the split
# would have to be re-run afterwards. Confirm intent before reviving.
"""## shift y from 'today' to 'tomorrow' 
y = y[1:]
X = X[:-1,:]"""

In [None]:
# The three classifiers that are trained and compared in the sections below.
lr = LogisticRegression(random_state=0)

# random_state pins the forest's random sampling so that Restart & Run All
# reproduces the same accuracies.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0, verbose=1, n_jobs=-1)

# 'binary:logistic' is XGBoost's objective for two-class classification; the
# previous 'reg:logistic' is the regression variant and is not meant for
# XGBClassifier. NOTE(review): assumes Label only takes the values 0 and 1.
xg_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,  # L1 regularisation; the scikit-learn wrapper's name for `alpha`
    n_estimators=100,
)

In [None]:
# Fit every model on the training split only. The logistic regression used to
# be fitted on the full X/y, which includes the held-out rows and therefore
# inflates its test accuracy relative to the other two models.
lr.fit(X_train, y_train)
rf_classifier.fit(X_train, y_train)
xg_classifier.fit(X_train, y_train)

In [None]:
# Sanity check: dimensions of the full feature matrix as (rows, columns).
X.shape

# 1. Logistic Classification

In [None]:
# Logistic-regression class predictions for the training rows and the held-out rows.
y_lr_pred_train, y_lr_pred_test = (
    lr.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the logistic regression on each split. end="\n\n" on the first
# print produces the blank separator line between the two reports.
print("Training Accuracy:", metrics.accuracy_score(y_train, y_lr_pred_train), end="\n\n")
print("Test Accuracy:", metrics.accuracy_score(y_test, y_lr_pred_test))

# 2. Random Forest Classification

In [None]:
# Random-forest class predictions for the training rows and the held-out rows.
y_rfc_pred_train, y_rfc_pred_test = (
    rf_classifier.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the random forest on each split. end="\n\n" on the first print
# produces the blank separator line between the two reports.
print("Training Accuracy:", metrics.accuracy_score(y_train, y_rfc_pred_train), end="\n\n")
print("Test Accuracy:", metrics.accuracy_score(y_test, y_rfc_pred_test))

# 3. XG Boost Classification

In [None]:
# XGBoost class predictions for the training rows and the held-out rows.
y_xgc_pred_train, y_xgc_pred_test = (
    xg_classifier.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the XGBoost classifier on each split. end="\n\n" on the first
# print produces the blank separator line between the two reports.
print("Training Accuracy:", metrics.accuracy_score(y_train, y_xgc_pred_train), end="\n\n")
print("Test Accuracy:", metrics.accuracy_score(y_test, y_xgc_pred_test))

# 4. Ensemble Classification (Mode)

In [None]:
def _majority_vote(*predictions):
    """Return, per sample, the label chosen by more than half of the models.

    Each positional argument is one model's array of predictions; all arrays
    must have the same length. Assumes labels are encoded as 0 and 1, so the
    sum of the arrays is the number of models voting for class 1. The result
    keeps the dtype of that sum.
    """
    votes = np.sum(predictions, axis=0)
    return (votes > len(predictions) / 2).astype(votes.dtype)


# Mode of the three classifiers. The earlier `(a + b + c) // 3` yielded 1 only
# when all three models agreed on 1, i.e. a unanimous vote rather than a
# majority; with three voters the majority threshold is two votes.
y_class_pred_ensemble_train = _majority_vote(y_lr_pred_train, y_rfc_pred_train, y_xgc_pred_train)
y_class_pred_ensemble_test = _majority_vote(y_lr_pred_test, y_rfc_pred_test, y_xgc_pred_test)

In [None]:
# Accuracy of the ensemble predictions on each split. end="\n\n" on the first
# print produces the blank separator line between the two reports.
print("Training Accuracy:", metrics.accuracy_score(y_train, y_class_pred_ensemble_train), end="\n\n")
print("Test Accuracy:", metrics.accuracy_score(y_test, y_class_pred_ensemble_test))