In [1]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
import numpy as np
import pandas as pd
import psycopg2
from pathlib import Path
from collections import Counter
from config import password

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [11]:
from sqlalchemy import create_engine
engine = create_engine(f'postgresql://postgres:{password}@stock-sentiment.cx2wqgu4uxsm.us-east-1.rds.amazonaws.com:5432/Stock-data')

In [16]:
wsb_comments_df = pd.read_sql('select * from wsb_comments', con=engine)
wsb_comments_df.shape

(0, 5)

In [15]:
comments_df = pd.read_sql('select * from comments', con=engine)
comments_df.shape

(4295, 5)

In [17]:
comments_df.head()

Unnamed: 0,index,cmnt_date,user_name,subreddit,cmnt_text
0,0,2018-01-02,MartyMoho,wallstreetbets,"I hadn’t read AAPL was working on a fab, do yo..."
1,1,2018-01-02,MartyMoho,wallstreetbets,AAPL is not a semi company even though it does...
2,2,2018-01-02,tempedrew,wallstreetbets,"To clarify, BABA is my favorite Chinese compan..."
3,3,2018-01-02,TransgenderSunrise,wallstreetbets,$AAPL
4,4,2018-01-02,the_ultimate_trader,wallstreetbets,"I don't think this would ever happen, Apple ha..."


In [4]:
columns = [
    "date", "subreddit", "username", "ticker", "text", "sentiment"
   ]

target = ["sentiment"]

In [7]:
# Load the data
file_path = Path('Resources/dummy_nlp_data.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,date,subreddit,username,ticker,text,sentiment
0,2020-07-06,securityanalysis,user0,$FB,$FB is going up,0
1,2020-09-04,stockmarket,user1,$FB,$FB is going up,1
2,2020-05-27,robinhood,user2,$NFLX,$NFLX to the moon,1
3,2020-07-21,stockmarket,user3,$GOOG,$GOOG is amazing value,0
4,2020-09-21,wallstreetbets,user4,$FB,$FB is going down,1


In [8]:
# Create our features
X = df.drop(columns='sentiment')
X = pd.get_dummies(X)
# Create our target
y = df.loc[:, target].copy()

In [9]:
X.columns

Index(['date_2020-01-10', 'date_2020-01-15', 'date_2020-01-17',
       'date_2020-02-01', 'date_2020-02-09', 'date_2020-02-15',
       'date_2020-02-19', 'date_2020-02-25', 'date_2020-03-01',
       'date_2020-03-02',
       ...
       'text_$MSFT is going down', 'text_$MSFT is going up',
       'text_$MSFT is overpriced', 'text_$MSFT to 0',
       'text_$NFLX is amazing value', 'text_$NFLX is going down',
       'text_$NFLX is going up', 'text_$NFLX is overpriced', 'text_$NFLX to 0',
       'text_$NFLX to the moon'],
      dtype='object', length=129)

In [10]:
X.shape

(50, 129)

In [11]:
X.describe()

Unnamed: 0,date_2020-01-10,date_2020-01-15,date_2020-01-17,date_2020-02-01,date_2020-02-09,date_2020-02-15,date_2020-02-19,date_2020-02-25,date_2020-03-01,date_2020-03-02,...,text_$MSFT is going down,text_$MSFT is going up,text_$MSFT is overpriced,text_$MSFT to 0,text_$NFLX is amazing value,text_$NFLX is going down,text_$NFLX is going up,text_$NFLX is overpriced,text_$NFLX to 0,text_$NFLX to the moon
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,...,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.02,0.04,0.04,0.04,0.02,0.02,0.02,0.02,0.02,0.02,...,0.06,0.02,0.02,0.06,0.1,0.02,0.02,0.02,0.02,0.06
std,0.141421,0.197949,0.197949,0.197949,0.141421,0.141421,0.141421,0.141421,0.141421,0.141421,...,0.239898,0.141421,0.141421,0.239898,0.303046,0.141421,0.141421,0.141421,0.141421,0.239898
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Check the balance of our target values
y['sentiment'].value_counts()

0    31
1    19
Name: sentiment, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'sentiment': 1})

In [15]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.45

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[4, 4],
       [3, 2]], dtype=int64)

In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.57      0.50      0.40      0.53      0.45      0.20         8
          1       0.33      0.40      0.50      0.36      0.45      0.20         5

avg / total       0.48      0.46      0.44      0.47      0.45      0.20        13



In [19]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
from collections import Counter
Counter(y_resampled)

Counter({'sentiment': 1})

In [20]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [21]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4125

In [22]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[5, 3],
       [4, 1]], dtype=int64)

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.56      0.62      0.20      0.59      0.35      0.13         8
          1       0.25      0.20      0.62      0.22      0.35      0.12         5

avg / total       0.44      0.46      0.36      0.45      0.35      0.13        13



In [24]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
from collections import Counter
Counter(y_resampled)

Counter({'sentiment': 1})

In [25]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.55

In [27]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[4, 4],
       [2, 3]], dtype=int64)

In [28]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      0.50      0.60      0.57      0.55      0.30         8
          1       0.43      0.60      0.50      0.50      0.55      0.30         5

avg / total       0.58      0.54      0.56      0.54      0.55      0.30        13



In [29]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
from collections import Counter
Counter(y_resampled)

Counter({'sentiment': 1})

In [30]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [31]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5

In [32]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[8, 0],
       [5, 0]], dtype=int64)

In [33]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.62      1.00      0.00      0.76      0.00      0.00         8
          1       0.00      0.00      1.00      0.00      0.00      0.00         5

avg / total       0.38      0.62      0.38      0.47      0.00      0.00        13



In [34]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [35]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.65

In [36]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[4, 4],
       [1, 4]], dtype=int64)

In [37]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.50      0.80      0.62      0.63      0.39         8
          1       0.50      0.80      0.50      0.62      0.63      0.41         5

avg / total       0.68      0.62      0.68      0.62      0.63      0.40        13



In [38]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
cols = X.columns
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df.head()
feature_importances_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
99,ticker_$AAPL,0.032884
114,text_$GOOG is amazing value,0.025451
103,ticker_$NFLX,0.023444
100,ticker_$FB,0.022997
47,subreddit_stocks,0.021928
...,...,...
81,username_user38,0.000000
82,username_user39,0.000000
84,username_user40,0.000000
92,username_user48,0.000000


In [39]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [40]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.75

In [41]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[4, 4],
       [0, 5]], dtype=int64)

In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.50      1.00      0.67      0.71      0.48         8
          1       0.56      1.00      0.50      0.71      0.71      0.53         5

avg / total       0.83      0.69      0.81      0.68      0.71      0.49        13

