In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import re
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
worddict = set(nltk.corpus.words.words())
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/utkarshchandra/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
df = pd.read_csv('usbank.csv')

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,User,User_statuses_count,user_followers,fav_count,User_location,Tweets
0,0,#NeverOutOfTown,4253,563,0,"Any City, USA",RT @BOAStadiumWx: Bank of America Stadium at s...
1,1,DEE,22221,2627,0,Virginia! USA,"RT @CIG_KingJames: Dear God, \nPlease Save Ame..."
2,2,BigJayy üòé,126088,2300,0,2Ô∏è‚É£5Ô∏è‚É£2Ô∏è‚É£,RT @El_Liaison: Wonder how electric Bank Of Am...
3,3,Always a Trumpster!,92483,3706,0,,"RT @CIG_KingJames: Dear God, \nPlease Save Ame..."
4,4,Whitney Hakim,1818,161,0,,@Injustices4All @rossyrosay @BankofAmerica @of...


In [19]:
wn = nltk.WordNetLemmatizer()

In [20]:
def preprocessing(text):
    wordset_n = set(wn.lemmatize(w, wordnet.NOUN) for w in word_tokenize(text.lower().strip()))
    wordset_v = set(wn.lemmatize(w, wordnet.VERB) for w in wordset_n)
    wordset = set(wn.lemmatize(w, wordnet.ADJ) for w in wordset_v)
    wordset = wordset & worddict
    return ' '.join(list(wordset))

In [21]:
df['text']=df['Tweets'].apply(preprocessing)

In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,User,User_statuses_count,user_followers,fav_count,User_location,Tweets,text
0,0,#NeverOutOfTown,4253,563,0,"Any City, USA",RT @BOAStadiumWx: Bank of America Stadium at s...,at it bank and stadium sunset of
1,1,DEE,22221,2627,0,Virginia! USA,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...
2,2,BigJayy üòé,126088,2300,0,2Ô∏è‚É£5Ô∏è‚É£2Ô∏è‚É£,RT @El_Liaison: Wonder how electric Bank Of Am...,how happen be wonder moment when will bank sta...
3,3,Always a Trumpster!,92483,3706,0,,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...
4,4,Whitney Hakim,1818,161,0,,@Injustices4All @rossyrosay @BankofAmerica @of...,get end elsewhere i give up finance like


In [23]:
sid = SentimentIntensityAnalyzer()

In [24]:
df['scores'] = df['text'].apply(lambda text: sid.polarity_scores(text))
df.head()

Unnamed: 0.1,Unnamed: 0,User,User_statuses_count,user_followers,fav_count,User_location,Tweets,text,scores
0,0,#NeverOutOfTown,4253,563,0,"Any City, USA",RT @BOAStadiumWx: Bank of America Stadium at s...,at it bank and stadium sunset of,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,1,DEE,22221,2627,0,Virginia! USA,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...,"{'neg': 0.186, 'neu': 0.381, 'pos': 0.432, 'co..."
2,2,BigJayy üòé,126088,2300,0,2Ô∏è‚É£5Ô∏è‚É£2Ô∏è‚É£,RT @El_Liaison: Wonder how electric Bank Of Am...,how happen be wonder moment when will bank sta...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,3,Always a Trumpster!,92483,3706,0,,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...,"{'neg': 0.186, 'neu': 0.381, 'pos': 0.432, 'co..."
4,4,Whitney Hakim,1818,161,0,,@Injustices4All @rossyrosay @BankofAmerica @of...,get end elsewhere i give up finance like,"{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'comp..."


In [25]:
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment_type']=''
df.loc[df.compound>0,'sentiment_type']='POSITIVE'
df.loc[df.compound==0,'sentiment_type']='NEUTRAL'
df.loc[df.compound<0,'sentiment_type']='NEGATIVE'

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,User,User_statuses_count,user_followers,fav_count,User_location,Tweets,text,scores,compound,sentiment_type
0,0,#NeverOutOfTown,4253,563,0,"Any City, USA",RT @BOAStadiumWx: Bank of America Stadium at s...,at it bank and stadium sunset of,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
1,1,DEE,22221,2627,0,Virginia! USA,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...,"{'neg': 0.186, 'neu': 0.381, 'pos': 0.432, 'co...",0.5859,POSITIVE
2,2,BigJayy üòé,126088,2300,0,2Ô∏è‚É£5Ô∏è‚É£2Ô∏è‚É£,RT @El_Liaison: Wonder how electric Bank Of Am...,how happen be wonder moment when will bank sta...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
3,3,Always a Trumpster!,92483,3706,0,,"RT @CIG_KingJames: Dear God, \nPlease Save Ame...",save the get god central please bank dear worl...,"{'neg': 0.186, 'neu': 0.381, 'pos': 0.432, 'co...",0.5859,POSITIVE
4,4,Whitney Hakim,1818,161,0,,@Injustices4All @rossyrosay @BankofAmerica @of...,get end elsewhere i give up finance like,"{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'comp...",0.3612,POSITIVE


In [27]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment_type'], test_size=0.3, random_state=1)

In [28]:
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer(use_idf=False) # just use tf, no idf used

# convert the text list to tfidf form matrix
x_train_counts = count_vect.fit_transform(x_train)
x_train_tf = tf_transformer.fit_transform(x_train_counts)
y_train = np.array(y_train)

clf = MultinomialNB(1.0, True, None)
clf.fit(x_train_tf, y_train) # train the classifier

# convert list to matrix
x_pre_counts = count_vect.transform(x_test)
x_pre_tf = tf_transformer.transform(x_pre_counts)

predicted = clf.predict(x_pre_tf)



In [29]:
metrics.accuracy_score(list(y_test), predicted)

0.8011695906432749

In [30]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[ 34,  16,  21],
       [  0, 114,  16],
       [  0,  15, 126]])

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [32]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(x_train_tf, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.890399,0.0545,0.030655,0.003432,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.8125,0.71875,0.798742,0.811321,0.792453,0.786753,0.034834,1
11,0.753894,0.012022,0.020133,0.001543,,300,"{'max_depth': None, 'n_estimators': 300}",0.8375,0.7375,0.805031,0.767296,0.761006,0.781667,0.035348,2
4,0.416667,0.037072,0.015088,0.002497,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.8125,0.7125,0.811321,0.798742,0.767296,0.780472,0.037696,3
5,0.857295,0.052611,0.031306,0.003944,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.8125,0.725,0.786164,0.811321,0.767296,0.780456,0.032436,4
10,0.423771,0.009096,0.017005,0.006481,,150,"{'max_depth': None, 'n_estimators': 150}",0.8125,0.73125,0.792453,0.798742,0.767296,0.780448,0.028634,5


In [33]:
clf = RandomForestClassifier(n_estimators=150, max_depth=60, random_state=0)
clf.fit(x_train_tf, y_train)
predicted = clf.predict(x_pre_tf)
metrics.accuracy_score(list(y_test), predicted)

0.8421052631578947

In [34]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[ 47,  16,   8],
       [  1, 111,  18],
       [  0,  11, 130]])