In [11]:
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt

#Data preprocessing: replaces urls with a placeholder, strips @usernames and symbols, and makes all text lowercase
def preprocess_text(text):
    """Normalise one tweet into lowercase, letters-only text.

    Steps, in order:
      1. ``www.`` / ``http(s)://`` links are replaced by the placeholder ``URL``
         (it survives as the token ``url`` after lowercasing).
      2. ``@mentions`` are dropped.
      3. Text is lowercased and the Cyrillic letter "ё" is folded to "е".
      4. Every run of characters that is not a Latin or Cyrillic letter
         (digits, punctuation, emoji, ...) becomes a single space.

    Args:
        text: Raw tweet text. Missing values (``None`` or NaN, as produced by
            ``pandas.read_csv`` for empty cells) yield an empty string; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Raw strings keep the regex escapes (\. and \s) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = text.lower().replace("ё", "е")
    text = re.sub(r'[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()


#from google.colab import files   
#uploaded = files.upload()
#nltk.download('punkt')

#Global variable: number of rows per chunk handed to the batching helpers (batch_xy / batch_x);
#also the size of the training subset used for the grid searches further down.
batchsize = 100000 

For now I have only tried using the TextBlob-labelled dataset as input. I first tried a smaller 5k-row dataset and then 1M rows. Even for 1M rows I was able to train without partial_fit (batching), since we only use the word counts, which come out as a sparse matrix. Batching might be needed for the whole dataset, though. The dataset will have to be replaced with the actual voted labels from Li. 

In [12]:
#load data from csv/excel/json
# The two commented-out reads are the earlier TextBlob-labelled inputs, kept for reference.
#data=pd.read_csv('textblob_sample_5k.csv',encoding = 'unicode_escape')
#data=pd.read_csv('textblob_sentiment_1M.csv',encoding = "ISO-8859–1")
# Current input: the voted-label file. Later cells read its 'text' and 'Vote' columns.
data = pd.read_csv('vote_all_no_conflict.csv')

# Preview the first rows.
data.head()

Unnamed: 0,Tweet ID,text,SentiStrength,Vader,Textblob,Vote
0,1278320583718178816,Business Group Complains Trump H-1B Reform Boosting U.S. Graduates. Big tech is whining that they will have to hire American instead of cheap fore...,Negative,Negative,Negative,Negative
1,1278346691171586049,"That moron trump vows to veto the Defense Bill if it includes renaming bases. So once again, military salaries and defense preparedness are second...",Neutral,Negative,Negative,Negative
2,1278368975689220097,@JoeBiden Debate President Trump. PROVE you don’t have dementia. #DementiaJoeCantDebate #JoeBidenScaredToDebate,Neutral,Neutral,Neutral,Neutral
3,1278368976960184320,@PamelaStovall6 @ChuckGrassley @realDonaldTrump Democrats are always saving @GOP dinosaurs like grassley after they fvck up,Neutral,Positive,Neutral,Neutral
4,1278368971314597890,@Jorgensen4POTUS @RealSpikeCohen Just found out about you and so far I love your policies and what you have to say. Ofc I still need to read more ...,Positive,Positive,Positive,Positive


In [13]:
# Clean every tweet with preprocess_text and store the result in a new 'processed_text' column.
# (The commented line presumably targets the older dataset whose text column was named 'full_text'.)
#data["processed_text"] =  data['full_text'].apply(preprocess_text)
data["processed_text"] = data['text'].apply(preprocess_text)

In [14]:
# Widen the displayed column width so more of each tweet is visible in the preview below.
pd.options.display.max_colwidth = 150
data.head()


Unnamed: 0,Tweet ID,text,SentiStrength,Vader,Textblob,Vote,processed_text
0,1278320583718178816,Business Group Complains Trump H-1B Reform Boosting U.S. Graduates. Big tech is whining that they will have to hire American instead of cheap fore...,Negative,Negative,Negative,Negative,business group complains trump h b reform boosting u s graduates big tech is whining that they will have to hire american instead of cheap foreign...
1,1278346691171586049,"That moron trump vows to veto the Defense Bill if it includes renaming bases. So once again, military salaries and defense preparedness are second...",Neutral,Negative,Negative,Negative,that moron trump vows to veto the defense bill if it includes renaming bases so once again military salaries and defense preparedness are second i...
2,1278368975689220097,@JoeBiden Debate President Trump. PROVE you don’t have dementia. #DementiaJoeCantDebate #JoeBidenScaredToDebate,Neutral,Neutral,Neutral,Neutral,debate president trump prove you don t have dementia dementiajoecantdebate joebidenscaredtodebate
3,1278368976960184320,@PamelaStovall6 @ChuckGrassley @realDonaldTrump Democrats are always saving @GOP dinosaurs like grassley after they fvck up,Neutral,Positive,Neutral,Neutral,democrats are always saving dinosaurs like grassley after they fvck up
4,1278368971314597890,@Jorgensen4POTUS @RealSpikeCohen Just found out about you and so far I love your policies and what you have to say. Ofc I still need to read more ...,Positive,Positive,Positive,Positive,just found out about you and so far i love your policies and what you have to say ofc i still need to read more about your stances and plans but i...


In [15]:
#Converting the text into tokens and getting the counts of each token based on the ngrams specified. 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
# NOTE(review): `token` is only referenced by the commented-out vectorizer on the next line;
# the active vectorizer tokenizes with nltk.word_tokenize instead.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
# Unigram counts (ngram_range=(1,1)) with English stop words removed.
# NOTE(review): nltk.word_tokenize presumably needs the 'punkt' resource, whose download is
# commented out in the first cell — confirm it is installed before running.
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = nltk.word_tokenize)
#cv=CountVectorizer(stop_words='english')
# Fit the vocabulary and build the count matrix from the cleaned text column.
text_counts = cv.fit_transform(data['processed_text'])

In [16]:
# Display the count matrix produced by the CountVectorizer.
text_counts
# Leftovers from an earlier label-encoding experiment on a 'Sentiment' column, kept for reference:
#cv.vocabulary_
# Import LabelEncoder
#from sklearn import preprocessing
#creating labelEncoder
#le = preprocessing.LabelEncoder()
#label = le.fit_transform(data['Sentiment'])

<2475775x247849 sparse matrix of type '<class 'numpy.int64'>'
	with 25453762 stored elements in Compressed Sparse Row format>

The three Naive Bayes models with support for partial_fit are MNB, GNB and BNB, so I have tried implementing all three of them.
Multinomial Naive Bayes (MNB) is the one that's particularly good with word counts, so we can try to tune that the most and probably get better accuracy.  
I have tried three approaches:  
1. Using the text counts from CountVectorizer as input
2. Using the counts from the tf-idf vectorizer as input
3. Converting the counts from part 1 to tf-idf frequencies using a transformer and feeding them to the model  
I think approaches 2 and 3 are supposed to work equivalently, but I noticed a slight increase in accuracy for the MNB model with approach 3 while using the smaller dataset. (I couldn't try approach 3 for the 1M dataset as the memory required was too large.)  


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#X_train, X_test, Y_train, Y_test = train_test_split(text_counts,data['Sentiment'], test_size=0.25, random_state=5)
# Split the count matrix and the 'Vote' labels, holding out 25% for testing; random_state is fixed at 5.
X_train, X_test, Y_train, Y_test = train_test_split(text_counts,data['Vote'], test_size=0.25, random_state=5)

In [18]:
def batch_xy(input, output, batchsize):
  """Yield aligned (features, labels) chunks of at most ``batchsize`` rows.

  ``input`` is sliced along its first axis and ``output`` with the same
  row window, so the two halves of every yielded pair line up. The last
  chunk may be shorter than ``batchsize``.
  """
  n_rows = input.shape[0]
  for start in range(0, n_rows, batchsize):
    window = slice(start, start + batchsize)
    yield input[window, :], output[window]

def batch_x(input, batchsize):
  """Yield consecutive row chunks of ``input``, each at most ``batchsize`` rows long."""
  chunk_starts = range(0, input.shape[0], batchsize)
  for lo in chunk_starts:
    hi = lo + batchsize
    yield input[lo:hi, :]

def report(name, target, pred):
  """Print a bracketed heading for ``name`` followed by the classification report of ``pred`` against ``target``."""
  print("[" + name + "] classification report:")
  metrics_table = classification_report(target, pred)
  print(metrics_table)

def roc(name, model, input, target):
  """Plot one-vs-rest ROC curves (one per class) for a fitted classifier.

  The previous implementation called ``plot_roc_curve``, which only accepts
  binary classifiers (it raises ValueError for the three-class models used in
  this notebook) and was removed from scikit-learn in version 1.2. It also
  opened an extra, empty figure before plotting.

  Args:
    name: Label printed above the plot and used as the plot title.
    model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
    input: Feature matrix to score; must be in a format ``model`` accepts.
    target: True labels for the rows of ``input``.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  # Local import: keeps this helper independent of the file-level import list.
  from sklearn.metrics import auc, roc_curve

  label = "[" + name + "] roc curve:"
  print(label)

  scores = model.predict_proba(input)  # one probability column per entry of model.classes_
  y_true = np.asarray(target)

  fig, ax = plt.subplots()
  for col, cls in enumerate(model.classes_):
    # One-vs-rest: the current class is the positive label, every other class is negative.
    is_cls = (y_true == cls).astype(int)
    fpr, tpr, _ = roc_curve(is_cls, scores[:, col])
    ax.plot(fpr, tpr, label="{} (AUC = {:.2f})".format(cls, auc(fpr, tpr)))
  ax.plot([0, 1], [0, 1], linestyle="--", color="grey")  # chance diagonal
  ax.set_xlabel("False Positive Rate")
  ax.set_ylabel("True Positive Rate")
  ax.set_title(name)
  ax.legend(loc="lower right")
  plt.show()


In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Baseline: MultinomialNB with default hyper-parameters on the CountVectorizer counts.
# Training is incremental: every chunk from batch_xy goes through partial_fit, and the
# full label set is passed via `classes` on each call.
MNB = MultinomialNB()
for x, y in batch_xy(X_train, Y_train, batchsize):
  MNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
  
# Predict the test split chunk by chunk, appending each chunk's predictions to y_pred.
y_pred = np.array([])
for x in batch_x(X_test, batchsize):
  y_pred = np.append(y_pred, MNB.predict(x))

# Overall accuracy as a percentage, then the per-class report.
accuracy_score_mnb = metrics.accuracy_score(y_pred, Y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Initial MNB with text counts from Count Vectorizer", Y_test, y_pred)
#roc("Initial MNB with text counts from Count Vectorizer", MNB, X_test, Y_test)

accuracy_score_mnb = 81.01%
[Initial MNB with text counts from Count Vectorizer] classification report:
              precision    recall  f1-score   support

    Negative       0.78      0.84      0.81    192007
     Neutral       0.84      0.81      0.82    261925
    Positive       0.81      0.77      0.79    165012

    accuracy                           0.81    618944
   macro avg       0.81      0.81      0.81    618944
weighted avg       0.81      0.81      0.81    618944



In [22]:
#searching for the best alpha / fit_prior combination for MNB
# (alpha is the smoothing parameter of MultinomialNB, not a learning rate)
# NOTE(review): the saved output of this cell is a ValueError whose cause is not visible here —
# re-run on a fresh kernel and confirm before relying on `bestparams`.
MNB = MultinomialNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(MNB,parameters)
search.fit(X_train[0:0 + batchsize, :],Y_train[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
# `bestparams` is reused (overwritten) by the BNB grid-search cell further down.
bestparams =search.best_params_
bestparams

ValueError: ignored

In [None]:
# Retrain MNB on the count features with the grid-searched alpha / fit_prior.
# Depends on `bestparams` from the MNB grid-search cell; the BNB grid-search cell overwrites
# that name, so run this cell before the BNB search.
MNB_best = MultinomialNB(alpha=bestparams['alpha'],fit_prior=bestparams['fit_prior'])
#MNB_best.fit(X_train,Y_train)
#y_pred = MNB_best.predict(X_test)
for x, y in batch_xy(X_train, Y_train, batchsize):
  MNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
  
# Predict the test split chunk by chunk, appending each chunk's predictions to y_pred.
y_pred = np.array([])
for x in batch_x(X_test, batchsize):
  y_pred = np.append(y_pred, MNB_best.predict(x))

accuracy_score_mnb = metrics.accuracy_score(y_pred, Y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Best Params MNB with text counts from Count Vectorizer", Y_test, y_pred)
#roc("Best Params MNB with text counts from Count Vectorizer", MNB_best, X_test, Y_test)

My system doesn't have enough RAM to run GaussianNB for 1M entries, as it requires converting X_train to a dense array format, so I have commented it out for now. We can maybe try running it on some other compute resource. When I tried running GNB with a dataset of 5k elements it performed comparatively worse than MNB and BNB. But with the tf-idf vectorizer method GNB performed a little better than the other two. I think MNB is the popular model for sentiment analysis, so we will probably focus more on MNB and try to optimize that the most. 

In [None]:
from sklearn.naive_bayes import GaussianNB
# Baseline GaussianNB on the count features. Each sparse chunk is converted to a dense array
# before being passed to the model.
# NOTE(review): a dense chunk holds batchsize x vocabulary-size values, which is very large for
# the matrix shape shown earlier in the notebook — consider a much smaller batch size for GNB.
GNB = GaussianNB()

for x, y in batch_xy(X_train, Y_train, batchsize):
  # toarray() returns a plain ndarray; todense() returns np.matrix, which recent
  # scikit-learn releases refuse as estimator input.
  GNB.partial_fit(x.toarray(), y, classes=['Positive','Neutral','Negative'])
  
# Predict the test split chunk by chunk, appending each chunk's predictions to y_pred.
y_pred = np.array([])
for x in batch_x(X_test, batchsize):
  y_pred = np.append(y_pred, GNB.predict(x.toarray()))

#GNB.fit(X_train.todense(), Y_train)

accuracy_score_gnb = metrics.accuracy_score(y_pred, Y_test)
print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')
report("Initial GNB with text counts from Count Vectorizer", Y_test, y_pred)
#roc("Initial GNB with text counts from Count Vectorizer", GNB, X_test, Y_test)

#GNB does not have parameter search? 

In [20]:

from sklearn.naive_bayes import BernoulliNB
# Baseline: BernoulliNB with default hyper-parameters on the CountVectorizer counts.
# Relies on `metrics`, `batch_xy`, `batch_x` and `report` from earlier cells.
BNB = BernoulliNB()

for x, y in batch_xy(X_train, Y_train, batchsize):
  BNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
  
# Predict the test split chunk by chunk, appending each chunk's predictions to y_pred.
y_pred = np.array([])
for x in batch_x(X_test, batchsize):
  y_pred = np.append(y_pred, BNB.predict(x))

#BNB.fit(X_train, Y_train)
accuracy_score_bnb = metrics.accuracy_score(y_pred,Y_test)
print('BNB accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Initial BNB with text counts from Count Vectorizer", Y_test, y_pred)
#roc("Initial BNB with text counts from Count Vectorizer", BNB, X_test, Y_test)

BNB accuracy = 74.69%
[Initial BNB with text counts from Count Vectorizer] classification report:
              precision    recall  f1-score   support

    Negative       0.78      0.71      0.74    192007
     Neutral       0.72      0.83      0.77    261925
    Positive       0.77      0.65      0.70    165012

    accuracy                           0.75    618944
   macro avg       0.75      0.73      0.74    618944
weighted avg       0.75      0.75      0.74    618944

[Initial BNB with text counts from Count Vectorizer] roc curve:


ValueError: ignored

<Figure size 432x288 with 0 Axes>

In [None]:
#searching for the best alpha / fit_prior combination for BNB
# (alpha is the smoothing parameter of BernoulliNB, not a learning rate)
BNB = BernoulliNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(BNB,parameters)
search.fit(X_train[0:0 + batchsize, :],Y_train[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
# Overwrites the `bestparams` produced by the MNB grid-search cell.
bestparams =search.best_params_
bestparams

In [None]:
# Retrain BNB on the count features with the grid-searched alpha / fit_prior.
# Depends on `bestparams` as set by the BNB grid-search cell directly above.
BNB_best = BernoulliNB(alpha=bestparams['alpha'],fit_prior=bestparams['fit_prior'])
for x, y in batch_xy(X_train, Y_train, batchsize):
  BNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
  
# Predict the test split chunk by chunk, appending each chunk's predictions to y_pred.
y_pred = np.array([])
for x in batch_x(X_test, batchsize):
  y_pred = np.append(y_pred, BNB_best.predict(x))

accuracy_score_bnb = metrics.accuracy_score(y_pred,Y_test)
print('BNB accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Best Params BNB with text counts from Count Vectorizer", Y_test, y_pred)
#roc("Best Params BNB with text counts from Count Vectorizer", BNB_best, X_test, Y_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Approach 2: build tf-idf features (English stop words removed, L2 norm) directly from the cleaned text.
tfidf = TfidfVectorizer(stop_words="english",norm="l2")
text_count_2 = tfidf.fit_transform(data['processed_text'])

#splitting the data in test and training
#x_train, x_test, y_train, y_test = train_test_split(text_count_2, data['Sentiment'],test_size=0.25,random_state=5)
x_train, x_test, y_train, y_test = train_test_split(text_count_2, data['Vote'],test_size=0.25,random_state=5)

#Models - always start from fresh estimators: partial_fit continues from whatever an already-fitted model has learned
# --- MultinomialNB ---
MNB = MultinomialNB()
for x, y in batch_xy(x_train, y_train, batchsize):
  MNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred = np.array([])
for x in batch_x(x_test, batchsize):
  y_pred = np.append(y_pred, MNB.predict(x))
accuracy_score_mnb = metrics.accuracy_score(y_pred, y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Initial MNB with tfidf count", y_test, y_pred)
#roc("Initial MNB with tfidf count", MNB, x_test, y_test)

# --- BernoulliNB ---
BNB = BernoulliNB()
for x, y in batch_xy(x_train, y_train, batchsize):
  BNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred2 = np.array([])
for x in batch_x(x_test, batchsize):
  y_pred2 = np.append(y_pred2, BNB.predict(x))
accuracy_score_bnb = metrics.accuracy_score(y_pred2, y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Initial BNB with tfidf count", y_test, y_pred2)
#roc("Initial BNB with tfidf count", BNB, x_test, y_test)

# --- GaussianNB (GaussianNB is imported in the earlier GNB cell) ---
# Each sparse chunk is converted to a dense array first. toarray() returns a plain ndarray;
# todense() returns np.matrix, which recent scikit-learn releases refuse as estimator input.
# NOTE(review): a dense chunk of `batchsize` rows may not fit in memory — confirm or lower the batch size.
GNB = GaussianNB()
for x, y in batch_xy(x_train, y_train, batchsize):
  GNB.partial_fit(x.toarray(), y, classes=['Positive','Neutral','Negative'])
y_pred3 = np.array([])
for x in batch_x(x_test, batchsize):
  y_pred3 = np.append(y_pred3, GNB.predict(x.toarray()))
accuracy_score_gnb = metrics.accuracy_score(y_pred3, y_test)
print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')
report("Initial GNB with tfidf count", y_test, y_pred3)
#roc("Initial GNB with tfidf count", GNB, x_test, y_test)

In [None]:
#searching for best params
# Grid-search alpha / fit_prior for MNB and BNB on the TfidfVectorizer features (x_train / y_train).
MNB = MultinomialNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(MNB,parameters)
search.fit(x_train[0:0 + batchsize, :],y_train[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
bestparams_mnb =search.best_params_
print(bestparams_mnb)

# Same grid for BernoulliNB; results are kept under separate names so neither search overwrites the other.
BNB = BernoulliNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(BNB,parameters)
search.fit(x_train[0:0 + batchsize, :],y_train[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
bestparams_bnb =search.best_params_
print(bestparams_bnb)

In [None]:
# Retrain MNB and BNB on the TfidfVectorizer features with the grid-searched parameters
# (bestparams_mnb / bestparams_bnb from the preceding cell).
MNB_best = MultinomialNB(alpha=bestparams_mnb['alpha'],fit_prior=bestparams_mnb['fit_prior'])
for x, y in batch_xy(x_train, y_train, batchsize):
  MNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
# Chunked prediction on the test split.
y_pred = np.array([])
for x in batch_x(x_test, batchsize):
  y_pred = np.append(y_pred, MNB_best.predict(x))
accuracy_score_mnb = metrics.accuracy_score(y_pred, y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Best params MNB with tfidf count", y_test, y_pred)
#roc("Best params MNB with tfidf count", MNB_best, x_test, y_test)

# Same procedure for BernoulliNB; its predictions go to y_pred2.
BNB_best = BernoulliNB(alpha=bestparams_bnb['alpha'],fit_prior=bestparams_bnb['fit_prior'])
for x, y in batch_xy(x_train, y_train, batchsize):
  BNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred2 = np.array([])
for x in batch_x(x_test, batchsize):
  y_pred2 = np.append(y_pred2, BNB_best.predict(x))
accuracy_score_bnb = metrics.accuracy_score(y_pred2, y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Best params BNB with tfidf count", y_test, y_pred2)
#roc("Best params BNB with tfidf count", BNB_best, x_test, y_test)

In [None]:

# Approach 3: convert the CountVectorizer counts to tf-idf with TfidfTransformer.
# The former leading `text_counts.toarray()` call was removed: it built a dense copy of the whole
# count matrix only to discard it, and TfidfTransformer works on the sparse matrix directly.
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
fooTfmer = TfidfTransformer()

# Again, fit and transform
docs_tfidf = fooTfmer.fit_transform(text_counts)

#splitting the data in test and training
#from sklearn.model_selection() import train_test_split()
#x_train, x_test, y_train, y_test = train_test_split(docs_tfidf, data['Sentiment'],test_size=0.25,random_state=5)
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(docs_tfidf, data['Vote'],test_size=0.25,random_state=5)

# --- MultinomialNB ---
MNB = MultinomialNB()
for x, y in batch_xy(x_train_tf, y_train_tf, batchsize):
  MNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred = np.array([])
for x in batch_x(x_test_tf, batchsize):
  y_pred = np.append(y_pred, MNB.predict(x))
accuracy_score_mnb = metrics.accuracy_score(y_pred, y_test_tf)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Initial MNB with tfidf frequency", y_test_tf, y_pred)
#roc("Initial MNB with tfidf frequency", MNB, x_test_tf, y_test_tf)

# --- BernoulliNB ---
BNB = BernoulliNB()
for x, y in batch_xy(x_train_tf, y_train_tf, batchsize):
  BNB.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred2 = np.array([])
for x in batch_x(x_test_tf, batchsize):
  y_pred2 = np.append(y_pred2, BNB.predict(x))
accuracy_score_bnb = metrics.accuracy_score(y_pred2, y_test_tf)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Initial BNB with tfidf frequency", y_test_tf, y_pred2)
#roc("Initial BNB with tfidf frequency", BNB, x_test_tf, y_test_tf)

# --- GaussianNB ---
# Each sparse chunk is converted to a dense array first. toarray() returns a plain ndarray;
# todense() returns np.matrix, which recent scikit-learn releases refuse as estimator input.
# NOTE(review): a dense chunk of `batchsize` rows may not fit in memory — confirm or lower the batch size.
GNB = GaussianNB()
for x, y in batch_xy(x_train_tf, y_train_tf, batchsize):
  GNB.partial_fit(x.toarray(), y, classes=['Positive','Neutral','Negative'])
y_pred3 = np.array([])
for x in batch_x(x_test_tf, batchsize):
  y_pred3 = np.append(y_pred3, GNB.predict(x.toarray()))
accuracy_score_gnb = metrics.accuracy_score(y_pred3, y_test_tf)
print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')
report("Initial GNB with tfidf frequency", y_test_tf, y_pred3)
#roc("Initial GNB with tfidf frequency", GNB, x_test_tf, y_test_tf)


In [None]:
#searching for best params
# Grid-search alpha / fit_prior for MNB and BNB on the TfidfTransformer features (x_train_tf / y_train_tf).
# NOTE: bestparams_mnb / bestparams_bnb are also assigned by the TfidfVectorizer grid-search cell;
# this cell overwrites them.
MNB = MultinomialNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(MNB,parameters)
search.fit(x_train_tf[0:0 + batchsize, :],y_train_tf[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
bestparams_mnb =search.best_params_
print(bestparams_mnb)

# Same grid for BernoulliNB.
BNB = BernoulliNB()
parameters = {'alpha':[1,0.5,0.3,0.1,0.01],'fit_prior':(True,False)}
search =  GridSearchCV(BNB,parameters)
search.fit(x_train_tf[0:0 + batchsize, :],y_train_tf[0:0 + batchsize]) #Only run through batchsize to determine the best params (whole dataset takes too much resource)
bestparams_bnb =search.best_params_
print(bestparams_bnb)

In [None]:
# Retrain MNB and BNB on the TfidfTransformer features (x_train_tf / x_test_tf) with the
# grid-searched parameters in bestparams_mnb / bestparams_bnb.
# The report labels say "tfidf frequency" to match the other cells that use these features;
# they previously read "tfidf count", the label used for the TfidfVectorizer runs.
MNB_best = MultinomialNB(alpha=bestparams_mnb['alpha'],fit_prior=bestparams_mnb['fit_prior'])
for x, y in batch_xy(x_train_tf, y_train_tf, batchsize):
  MNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
# Chunked prediction on the test split.
y_pred = np.array([])
for x in batch_x(x_test_tf, batchsize):
  y_pred = np.append(y_pred, MNB_best.predict(x))
accuracy_score_mnb = metrics.accuracy_score(y_pred, y_test_tf)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')
report("Best params MNB with tfidf frequency", y_test_tf, y_pred)
#roc("Best params MNB with tfidf frequency", MNB_best, x_test_tf, y_test_tf)

# Same procedure for BernoulliNB; its predictions go to y_pred2.
BNB_best = BernoulliNB(alpha=bestparams_bnb['alpha'],fit_prior=bestparams_bnb['fit_prior'])
for x, y in batch_xy(x_train_tf, y_train_tf, batchsize):
  BNB_best.partial_fit(x, y, classes=['Positive','Neutral','Negative'])
y_pred2 = np.array([])
for x in batch_x(x_test_tf, batchsize):
  y_pred2 = np.append(y_pred2, BNB_best.predict(x))
accuracy_score_bnb = metrics.accuracy_score(y_pred2, y_test_tf)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')
report("Best params BNB with tfidf frequency", y_test_tf, y_pred2)
#roc("Best params BNB with tfidf frequency", BNB_best, x_test_tf, y_test_tf)

Things to do:  
1. Run on the whole voted dataset on a larger compute resource (suggested by the TA). Depending on need, we can make use of the partial_fit function to do batching. 
2. Try to form a pipeline of CountVectorizer, TfidfTransformer and MNB, and then tune the parameters (ngram range, tf-idf norm, MNB alpha, etc.) for the whole pipeline using GridSearchCV.
3. Try out other optimizations and tuning, if any exist.  
4. Analyze the performance of each of the three models (MNB, GNB, BNB) using metrics like the classification report. 
