## Upload data

In [1]:
import numpy as np
import pandas as pd
import string
import re
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

sm_data = pd.read_csv('stock_data.csv')
sm_data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


## Create function to clean text (remove punctuation, tokenize, remove stopwords, and stem) 

In [2]:
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\w+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

## Vectorize with TfidfVectorizer and CountVectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(sm_data['Text'])
x_tfidf_df = pd.DataFrame(x_tfidf.toarray())
x_tfidf_df.columns = tfidf_vect.get_feature_names()

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
x_count = count_vect.fit_transform(sm_data['Text'])
x_count_df = pd.DataFrame(x_count.toarray())
x_count_df.columns = count_vect.get_feature_names()

## Split and train data

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_tfidf_df, sm_data['Sentiment'], test_size=0.2)

## Explore machine learning models

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [6]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3),
    round(pred_time, 3),
    round(precision, 3), 
    round(recall, 3), 
    round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.254 / Predict time: 0.108 --- Precision: 0.644 / Recall: 0.872 / Accuracy: 0.621


In [7]:
gb = GradientBoostingClassifier(n_estimators=50, max_depth=3, learning_rate = 0.1)

start = time.time()
gb_model = gb.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3),
    round(pred_time, 3),
    round(precision, 3), 
    round(recall, 3), 
    round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.185 / Predict time: 0.003 --- Precision: 0.645 / Recall: 0.954 / Accuracy: 0.645
