In [175]:
"""
This notebook looks for a trading strategy based on the AAII Sentiment Survey results.

We use machine learning to analyze weekly survey data from the AAII website.

Each week AAII asks its members how they feel about the stock market, and
for each week the survey reports the % bullish, % neutral, and % bearish.

"""
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler



%matplotlib inline

In [176]:
# Read in AAII Sentiment data from csv
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
csvpath = Path("C:/Users/costa/Desktop/byte/rice/ru-hou-fin-pt-03-2020-u-c/Project_2_dream_team/sentiment.csv")

# Path() takes only path segments. The read_csv-style options that used to be
# passed to it (index_col, parse_dates, infer_datetime_format) never reached
# pandas, and newer Python versions deprecate extra keyword arguments to Path().
# The file is therefore read with the default integer index, and 'Date' is
# parsed explicitly below; it stays a regular column until the cleaning step
# makes it the index.
stock_data_df = pd.read_csv(csvpath)
stock_data_df['Date'] = pd.to_datetime(stock_data_df['Date'])

stock_data_df.head(20)


Unnamed: 0,Date,Bullish,Neutral,Bearish,Bull 8 wk mov avg,Bull 16 wk mov avg,Bear 8 wk mov avg,Bear 16 wk mov avg,Neutral 8 wk mov avg,Neutral 16 wk mov avg,Bull-Bear Spread,Close,Close in 4 wks,Close in 8 wks
0,NaT,,,,,,,,,,,,,
1,1987-06-26,,,,,,,,,,,,,
2,1987-07-17,,,,,,,,,,,314.59,333.99,321.98
3,1987-07-24,0.36,0.5,0.14,,,,,,,0.22,309.27,335.9,314.86
4,1987-07-31,0.26,0.48,0.26,,,,,,,0.0,318.66,323.04,320.16
5,1987-08-07,0.56,0.15,0.29,,,,,,,0.27,323.0,316.7,328.07
6,1987-08-14,0.45,0.35,0.2,,,,,,,0.25,333.99,321.98,311.07
7,1987-08-21,0.66,0.28,0.06,,,,,,,0.6,335.9,314.86,282.7
8,1987-08-28,0.52,0.18,0.3,,,,,,,0.22,323.04,320.16,248.22
9,1987-09-04,0.42,0.17,0.41,,,,,,,0.01,316.7,328.07,251.79


In [177]:
#Clean data

def label_four_week_signal(pct_change, up_threshold=0.03):
    """Classify 4-week fractional price changes into UP / NEUTRAL / DOWN.

    Args:
        pct_change: Series or array of fractional changes (0.03 means +3%).
        up_threshold: a change strictly greater than this is labelled "UP".

    Returns:
        Array of labels, one per input value: "UP" when the change is above
        ``up_threshold``, "DOWN" when it is below zero, "NEUTRAL" otherwise
        (i.e. from 0 up to and including the threshold).
    """
    return np.select(
        [pct_change > up_threshold, pct_change < 0],
        ["UP", "DOWN"],
        default="NEUTRAL",
    )


# Sort earliest to latest. 'Date' is still a regular column here, so sort on it
# explicitly: sorting the default integer index would not order rows by date.
# A stable sort keeps the existing relative order of rows that share a date.
# Then drop rows with any null value and exact duplicate rows.
stock_data_df = (
    stock_data_df
    .sort_values('Date', ascending=True, kind='mergesort')
    .dropna()
    .drop_duplicates()
    .copy()
)

# Sanity check: nothing null should survive dropna().
assert not stock_data_df.isnull().values.any(), "unexpected nulls after dropna()"

#Create up,down,neutral signals in a new column

# Fractional change from this week's close to the close 4 weeks later.
four_week_change = (
    (stock_data_df['Close in 4 wks'] - stock_data_df['Close']) / stock_data_df['Close']
)

# Signal 4 wks is UP if the market is up more than 3%, NEUTRAL if it is between
# 0% and 3% (inclusive), DOWN when it is below zero.
stock_data_df['Signal 4 wks'] = label_four_week_signal(four_week_change)
stock_data_df['4 wk chg'] = four_week_change

stock_data_df = stock_data_df.set_index('Date')
stock_data_df.head(50)



Unnamed: 0_level_0,Bullish,Neutral,Bearish,Bull 8 wk mov avg,Bull 16 wk mov avg,Bear 8 wk mov avg,Bear 16 wk mov avg,Neutral 8 wk mov avg,Neutral 16 wk mov avg,Bull-Bear Spread,Close,Close in 4 wks,Close in 8 wks,Signal 4 wks,4 wk chg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1987-11-06,0.47,0.31,0.22,0.46,0.29,0.26,0.25,0.28,0.29,0.25,250.41,223.92,247.08,DOWN,-0.105787
1987-11-13,0.31,0.41,0.28,0.43,0.28,0.28,0.26,0.3,0.28,0.03,245.64,235.32,243.4,DOWN,-0.042013
1987-11-20,0.39,0.42,0.19,0.41,0.28,0.26,0.25,0.33,0.28,0.2,242.0,249.16,252.05,NEUTRAL,0.029587
1987-11-27,0.27,0.46,0.27,0.39,0.3,0.26,0.25,0.35,0.3,0.0,240.34,252.02,246.5,UP,0.048598
1987-12-04,0.25,0.33,0.42,0.36,0.3,0.29,0.27,0.35,0.3,-0.17,223.92,247.08,257.07,UP,0.10343
1987-12-11,0.23,0.45,0.32,0.34,0.31,0.29,0.28,0.36,0.31,-0.09,235.32,243.4,250.96,UP,0.034336
1987-12-18,0.33,0.4,0.27,0.34,0.32,0.29,0.28,0.38,0.32,0.06,249.16,252.05,257.63,NEUTRAL,0.011599
1987-12-24,0.28,0.52,0.2,0.32,0.34,0.27,0.27,0.41,0.34,0.08,252.02,246.5,261.61,DOWN,-0.021903
1987-12-31,0.25,0.49,0.26,0.29,0.36,0.28,0.27,0.44,0.36,-0.01,247.08,257.07,262.46,UP,0.040432
1988-01-08,0.34,0.46,0.2,0.29,0.37,0.27,0.27,0.44,0.37,0.14,243.4,250.96,267.3,UP,0.03106


In [178]:
# Create our features

# Target: the 4-week UP / NEUTRAL / DOWN label.
y = stock_data_df['Signal 4 wks']

# Features: everything except the target itself and the columns derived from
# future closes ('4 wk chg', 'Close in 4 wks', 'Close in 8 wks').
col = ['Signal 4 wks', '4 wk chg', 'Close in 8 wks', 'Close in 4 wks']
X = stock_data_df.drop(columns=col)

X.head()


Unnamed: 0_level_0,Bullish,Neutral,Bearish,Bull 8 wk mov avg,Bull 16 wk mov avg,Bear 8 wk mov avg,Bear 16 wk mov avg,Neutral 8 wk mov avg,Neutral 16 wk mov avg,Bull-Bear Spread,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1987-11-06,0.47,0.31,0.22,0.46,0.29,0.26,0.25,0.28,0.29,0.25,250.41
1987-11-13,0.31,0.41,0.28,0.43,0.28,0.28,0.26,0.3,0.28,0.03,245.64
1987-11-20,0.39,0.42,0.19,0.41,0.28,0.26,0.25,0.33,0.28,0.2,242.0
1987-11-27,0.27,0.46,0.27,0.39,0.3,0.26,0.25,0.35,0.3,0.0,240.34
1987-12-04,0.25,0.33,0.42,0.36,0.3,0.29,0.27,0.35,0.3,-0.17,223.92


In [179]:
# Summary statistics for each column of the feature matrix X
X.describe()


Unnamed: 0,Bullish,Neutral,Bearish,Bull 8 wk mov avg,Bull 16 wk mov avg,Bear 8 wk mov avg,Bear 16 wk mov avg,Neutral 8 wk mov avg,Neutral 16 wk mov avg,Bull-Bear Spread,Close
count,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0
mean,0.379231,0.315164,0.305657,0.379789,0.315258,0.305428,0.304853,0.315381,0.315258,0.07368,1230.434941
std,0.100484,0.084425,0.095443,0.077578,0.066789,0.073986,0.066769,0.070898,0.066789,0.176759,724.885964
min,0.12,0.08,0.07,0.18,0.17,0.15,0.17,0.16,0.17,-0.54,223.92
25%,0.31,0.25,0.24,0.32,0.27,0.25,0.26,0.26,0.27,-0.05,616.2375
50%,0.37,0.32,0.29,0.38,0.32,0.29,0.29,0.32,0.32,0.08,1164.32
75%,0.44,0.37,0.36,0.43,0.36,0.35,0.34,0.36,0.36,0.19,1473.3775
max,0.75,0.62,0.7,0.64,0.5,0.56,0.54,0.53,0.5,0.63,3386.15


In [180]:
# (rows, columns) of the feature matrix X
X.shape

(1704, 11)

In [181]:
# Number of rows per target label, to check how balanced the classes are
y.value_counts()

NEUTRAL    638
DOWN       637
UP         429
Name: Signal 4 wks, dtype: int64

In [182]:
# Split X and y into train/test partitions, stratified on the label so each
# partition keeps the same class mix.
# NOTE(review): rows are weekly and each label looks 4 weeks ahead, so a
# shuffled split can place overlapping, neighbouring weeks on both sides of the
# split. Consider a chronological split before trusting the scores below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Standardize the features. The scaler is fitted on the training rows only and
# then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [183]:
# Fit a 500-tree random forest on the scaled training data, then predict the
# label of every row in the scaled test set.
rf_model = RandomForestClassifier(random_state=78, n_estimators=500)
rf_model.fit(X_train_scaled, y_train)
predictions = rf_model.predict(X_test_scaled)

In [184]:
# Balanced accuracy and plain accuracy of the random forest on the test set.
# `predictions` already holds the random-forest test-set output, so reuse it
# for y_pred instead of running all 500 trees over the test set a second time.
y_pred = predictions
print(balanced_accuracy_score(y_test, predictions))
acc_score = accuracy_score(y_test, predictions)
print(acc_score)

0.4946248212151492
0.5117370892018779


In [185]:
# Build the confusion matrix as a DataFrame labelled with the real class names.
# The sorted union of true and predicted labels is passed explicitly so the
# row/column captions are guaranteed to line up with the matrix, however many
# classes there are.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

In [186]:
# Show the confusion matrix, the accuracy score, and the standard
# (not imbalanced) classification report for `predictions`.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,95,52,12
Actual 2,53,85,22
Actual 3,26,43,38


Accuracy Score : 0.5117370892018779
Classification Report
              precision    recall  f1-score   support

        DOWN       0.55      0.60      0.57       159
     NEUTRAL       0.47      0.53      0.50       160
          UP       0.53      0.36      0.42       107

    accuracy                           0.51       426
   macro avg       0.52      0.49      0.50       426
weighted avg       0.51      0.51      0.51       426



In [187]:
# Pair each random-forest feature importance with its column name and list the
# pairs from most to least important.
importances = rf_model.feature_importances_
importances_sorted = sorted(
    zip(importances, X.columns),
    reverse=True,
)
importances_sorted


[(0.154220621447714, 'Close'),
 (0.09330395948431254, 'Bull 8 wk mov avg'),
 (0.09197373246957069, 'Bull-Bear Spread'),
 (0.09166591511535763, 'Bear 16 wk mov avg'),
 (0.09032279947737716, 'Bear 8 wk mov avg'),
 (0.0873270133414753, 'Neutral'),
 (0.08673017857113459, 'Bearish'),
 (0.08579059678357699, 'Neutral 8 wk mov avg'),
 (0.08416236152404202, 'Bullish'),
 (0.06770778622729018, 'Neutral 16 wk mov avg'),
 (0.06679503555814909, 'Bull 16 wk mov avg')]

In [188]:
# Train an AdaBoost classifier on the scaled training data.
# This is scikit-learn's AdaBoostClassifier, not an Easy Ensemble model: no
# resampling of the training data happens here.
ada_model = AdaBoostClassifier(n_estimators=500, random_state=78)
ada_model.fit(X_train_scaled, y_train)

# NOTE: this rebinds `predictions`, which held the random-forest output above.
predictions = ada_model.predict(X_test_scaled)

In [189]:
# Balanced accuracy of the AdaBoost model on the test set.
# `predictions` already holds the AdaBoost test-set output, so alias it as
# y_pred rather than predicting again, and score that single set of predictions.
y_pred = predictions
print(balanced_accuracy_score(y_test, y_pred))

0.424750435941143


In [190]:
# Display the confusion matrix for y_pred (the AdaBoost test-set predictions).
# Rows are the actual classes and columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[82, 45, 32],
       [52, 78, 30],
       [35, 43, 29]], dtype=int64)

In [191]:
# Print imbalanced-learn's per-class classification report for y_pred
# (the AdaBoost test-set predictions).
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       DOWN       0.49      0.52      0.67      0.50      0.59      0.34       159
    NEUTRAL       0.47      0.49      0.67      0.48      0.57      0.32       160
         UP       0.32      0.27      0.81      0.29      0.47      0.21       107

avg / total       0.44      0.44      0.71      0.44      0.55      0.30       426

