In [1]:
import pandas as pd
from ast import literal_eval

#NLTK Models
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics.scores import (precision, recall, f_measure)

#SKLearn models can be used in NLTK
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


In [2]:
df = pd.read_csv('Tweets_clean.csv')
df.head(1)

Unnamed: 0,Label,emoji,Cashtags,hashtag_list,clean_tweet
0,2,,"['$SPX', '$SPY', '$ES', '$IXIC', '$DJI', '$QQQ']","['Markets', 'Stocks']","['market', 'stock']"


# Pre-ML Processing

In [3]:
#some extra processing is required here to ensure the lexicon only uses the training data - not words only in the test data

In [4]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: literal_eval(x))

In [6]:
di = {1: "Positive", 2: "Neutral", 0: "Negative"}
df['Label_Map'] = df['Label'].map(di) # map numbers to labes which we can use to create the data format for algorithm

In [7]:
#function to create a dictionary of all words in the dictionary and true/false to check if there are in the tweet

def all_tokens_for_model(tokens,label,all_words):
    return [{word: (word in tokens) for word in all_words}, label]

In [8]:
len(df)

910

In [9]:
#randomise and split into training and test data

df = df.sample(frac=1).reset_index(drop=True) #shuffle dataset
df_train = df[:700]
df_test  = df[700:]

In [10]:
#we need to get all words in the training set to have the data in the format for the model

all_words = set(word for passage in df_train['clean_tweet'] for word in passage)

In [11]:
df_train['ML_tweet'] = df.apply(lambda x:  all_tokens_for_model(x.clean_tweet,x.Label_Map, all_words), axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
df_test['ML_tweet'] = df.apply(lambda x:  all_tokens_for_model(x.clean_tweet,x.Label_Map, all_words), axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Training Models

In [13]:
NB_clf = NaiveBayesClassifier.train(df_train['ML_tweet'])
classify.accuracy(NB_clf, df_test['ML_tweet'])

0.6

In [14]:
NB_clf.show_most_informative_features(5)

Most Informative Features
                   short = True           Negati : Neutra =     35.2 : 1.0
                 bearish = True           Negati : Neutra =     11.5 : 1.0
                   lower = True           Negati : Neutra =      9.4 : 1.0
                 billion = True           Negati : Neutra =      8.1 : 1.0
                   daily = True           Negati : Neutra =      7.3 : 1.0


In [15]:
MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(df_train['ML_tweet'])
classify.accuracy(MNB_clf, df_test['ML_tweet'])

0.6190476190476191

In [16]:
BNB_clf = SklearnClassifier(BernoulliNB())
BNB_clf.train(df_train['ML_tweet'])
classify.accuracy(BNB_clf, df_test['ML_tweet'])

0.5428571428571428

In [17]:
LogReg_clf = SklearnClassifier(LogisticRegression(multi_class='multinomial', solver = 'lbfgs'))
LogReg_clf.train(df_train['ML_tweet'])
classify.accuracy(LogReg_clf, df_test['ML_tweet'])

0.6476190476190476

In [18]:
SGD_clf = SklearnClassifier(SGDClassifier(max_iter =1000, tol=.1))
SGD_clf.train(df_train['ML_tweet'])
classify.accuracy(SGD_clf, df_test['ML_tweet'])

0.6428571428571429

In [19]:
SVC_clf = SklearnClassifier(SVC(gamma='auto'))
SVC_clf.train(df_train['ML_tweet'])
classify.accuracy(SVC_clf, df_test['ML_tweet'])

0.4666666666666667

In [20]:
DT_clf = SklearnClassifier(DecisionTreeClassifier())
DT_clf.train(df_train['ML_tweet'])
classify.accuracy(DT_clf, df_test['ML_tweet'])

0.6190476190476191

# Scoring Models

In [21]:
def classify_data(classifier,data): #this outputs just a result, negative or positive
    data = data[0] # the data is currently a list containing a dictionary and the label. This just gets the dictionary
    return classifier.classify(data)

In [22]:
#test all models on the data to compare results

algos = [NB_clf, MNB_clf, BNB_clf, LogReg_clf, SGD_clf, SVC_clf]
for i in algos:
    df_test[i] = df_test['ML_tweet'].apply(lambda x: classify_data(i, x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [23]:
#rename columns and save results
df_test.columns = ['Label', 'Emoji','Cashtags','Hashtags','Clean_Tweet','Label_Name','ML_tweet', 'NB', 'MNB', 'BNB','LogReg','SDG','SVC']
df_test.to_csv('testing.csv', index=False)

# Results dataframe

In [25]:
df_results = pd.DataFrame(columns=['Algorithm', 'Accuracy','Precision','Recall','F1_score'])

In [26]:
df_results['Algorithm'] = ['NB_clf', 'MNB_clf', 'BNB_clf', 'LogReg_clf', 'SGD_clf', 'SVC_clf']

In [27]:
a=[]
for i in algos:
    a.append(classify.accuracy(i, df_test['ML_tweet']))
df_results['Accuracy'] = a

In [28]:
Refset = set(df_test[df_test.Label_Name.eq('Positive')].index) #NLTK requires a 'set' for precision, recall and F1 scores

In [29]:
df_results['Precision'] =[
    precision(Refset,set(df_test[df_test.NB.eq('Positive')].index)),
    precision(Refset,set(df_test[df_test.MNB.eq('Positive')].index)),
    precision(Refset,set(df_test[df_test.BNB.eq('Positive')].index)),
    precision(Refset,set(df_test[df_test.LogReg.eq('Positive')].index)),
    precision(Refset,set(df_test[df_test.SDG.eq('Positive')].index)),
    precision(Refset,set(df_test[df_test.SVC.eq('Positive')].index))]

In [30]:
df_results['Recall'] =[
    recall(Refset,set(df_test[df_test.NB.eq('Positive')].index)),
    recall(Refset,set(df_test[df_test.MNB.eq('Positive')].index)),
    recall(Refset,set(df_test[df_test.BNB.eq('Positive')].index)),
    recall(Refset,set(df_test[df_test.LogReg.eq('Positive')].index)),
    recall(Refset,set(df_test[df_test.SDG.eq('Positive')].index)),
    recall(Refset,set(df_test[df_test.SVC.eq('Positive')].index))]

In [31]:
df_results['F1_score'] =[
    f_measure(Refset,set(df_test[df_test.NB.eq('Positive')].index), alpha=0.5),
    f_measure(Refset,set(df_test[df_test.MNB.eq('Positive')].index), alpha=0.5),
    f_measure(Refset,set(df_test[df_test.BNB.eq('Positive')].index), alpha=0.5),
    f_measure(Refset,set(df_test[df_test.LogReg.eq('Positive')].index), alpha=0.5),
    f_measure(Refset,set(df_test[df_test.SDG.eq('Positive')].index), alpha=0.5),
    f_measure(Refset,set(df_test[df_test.SVC.eq('Positive')].index), alpha=0.5)]

In [33]:
#output results
df_results 

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1_score
0,NB_clf,0.6,0.557143,0.493671,0.52349
1,MNB_clf,0.619048,0.561644,0.518987,0.539474
2,BNB_clf,0.542857,0.55814,0.303797,0.393443
3,LogReg_clf,0.647619,0.602564,0.594937,0.598726
4,SGD_clf,0.642857,0.626866,0.531646,0.575342
5,SVC_clf,0.466667,,0.0,


# Model for production

In [34]:
#Retrain best model with the full dataset

all_words = set(word for passage in df['clean_tweet'] for word in passage)
df['ML_tweet'] = df.apply(lambda x:  all_tokens_for_model(x.clean_tweet,x.Label_Map, all_words), axis=1) 

In [35]:
LogReg_clf = SklearnClassifier(LogisticRegression(multi_class='multinomial', solver = 'lbfgs'))
LogReg_clf.train(df['ML_tweet'])

<SklearnClassifier(LogisticRegression(multi_class='multinomial'))>

# Pickling models - 2 ways

In [37]:
import pickle
def pickler(model, name):
    pickle.dump(model, open(filepath+'\\'+name, 'wb'))
filepath = r'C:\Users\AdamShafi\Twitter Sentiment Analysis\Pickled Algos'
names = ('NB_clf.sav','MNB_clf.sav', 'BNB_clf.sav', 'LogReg_clf.sav', 'SGD_clf.sav', 'SVC_clf.sav')    

In [38]:
for i,j in zip(algos, names): #individual files
    pickler(i,j)

In [39]:
PIK = filepath+'\\'+'pickle.dat' #one pickle file

with open(PIK, "wb") as f:
    pickle.dump(algos, f)

In [41]:
#pickle corpus
with open('all_words.pkl', 'wb') as f:
    pickle.dump(all_words, f)

# References

In [42]:
#https://towardsdatascience.com/text-preprocessing-steps-and-universal-pipeline-94233cb6725a
#https://sebastianraschka.com/Articles/2014_multiprocessing.html
#https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
#https://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis