In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob

#Data preprocessing: replaces urls with a placeholder token, removes @usernames and symbols, and makes all text lowercase
def preprocess_text(text):
    """Normalize a raw tweet into lowercase, letters-only text.

    Steps, in order:
      1. Replace every URL (``www.…`` or ``http(s)://…``) with the token
         ``URL``; it ends up as ``url`` after the lowercasing step.
      2. Drop ``@username`` mentions.
      3. Lowercase the text and fold the Cyrillic letter "ё" into "е".
      4. Replace every run of characters that are not Latin or Cyrillic
         letters with a single space.
      5. Collapse repeated spaces and trim both ends.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas uses for a missing cell, or None) is treated as empty text
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string; it may be empty.
    """
    if not isinstance(text, str):
        return ''
    # Raw strings keep the regex escapes (\. and \s) from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', '', text)
    # "ё" lies outside the а-я range used below, so fold it into "е" first.
    text = text.lower().replace("ё", "е")
    text = re.sub(r'[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

So far I have only used the TextBlob-labelled dataset as input. I tried a smaller 5k-row dataset first and then 1M rows. Even for 1M rows I could train without partial_fit (batching), because the features are word counts stored as a sparse matrix. Batching might still be needed for the whole dataset, though. The dataset will have to be replaced with the actual voted labels from Li.

In [2]:
#load data from csv/excel/json
#data=pd.read_csv('textblob_sample_5k.csv',encoding = 'unicode_escape')
# The codec name is written with plain ASCII hyphens. The previous spelling
# contained an en dash (U+2013), which is not part of the registered codec
# name and only resolved through lenient name normalisation.
# NOTE(review): the previews in this notebook show sequences such as "donât",
# which usually means UTF-8 bytes were decoded as Latin-1 — confirm the real
# encoding of the CSV before relying on any non-ASCII text.
data=pd.read_csv('textblob_sentiment_1M.csv',encoding = "ISO-8859-1")

data.head()

Unnamed: 0,id_str,full_text,Sentiment
0,1278320583718178816,Business Group Complains Trump H-1B Reform Boo...,Negative
1,1278310669885026305,"""Who's the absent candidate now?...'Sleepy Joe...",Positive
2,1278346691171586049,That moron trump vows to veto the Defense Bill...,Negative
3,1278368973948694528,@ClareTyne @mesainy @HKrassenstein @realDonald...,Neutral
4,1278303504071905282,"1. Funny how Biden barely criticizes Putin, an...",Positive


In [3]:
# Clean every tweet with preprocess_text and store the result next to the raw text.
data["processed_text"] = data["full_text"].map(preprocess_text)

In [4]:
# Show up to 150 characters per column so the raw and cleaned text can be compared.
pd.set_option("display.max_colwidth", 150)
data.head()


Unnamed: 0,id_str,full_text,Sentiment,processed_text
0,1278320583718178816,Business Group Complains Trump H-1B Reform Boosting U.S. Graduates. Big tech is whining that they will have to hire American instead of cheap fore...,Negative,business group complains trump h b reform boosting u s graduates big tech is whining that they will have to hire american instead of cheap foreign...
1,1278310669885026305,"""Who's the absent candidate now?...'Sleepy Joe' is signaling that he's very much awake -- and dialed into a moment where Trump's leadership is rip...",Positive,who s the absent candidate now sleepy joe is signaling that he s very much awake and dialed into a moment where trump s leadership is ripe for que...
2,1278346691171586049,"That moron trump vows to veto the Defense Bill if it includes renaming bases. So once again, military salaries and defense preparedness are second...",Negative,that moron trump vows to veto the defense bill if it includes renaming bases so once again military salaries and defense preparedness are second i...
3,1278368973948694528,@ClareTyne @mesainy @HKrassenstein @realDonaldTrump @NYCMayor @JoeBiden Yep. Torturing that guy because the Dems donât care.,Neutral,yep torturing that guy because the dems don t care
4,1278303504071905282,"1. Funny how Biden barely criticizes Putin, and on the China virus doesnât criticize Xi.Â Instead, he smears Trump.Â The reason is the Democra...",Positive,funny how biden barely criticizes putin and on the china virus doesn t criticize xi instead he smears trump the reason is the democrats don t real...


In [5]:
# Tokenize the cleaned text and count each token, using the n-gram range given below.
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Alphanumeric-only tokenizer; only the commented-out alternative below refers to it.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#cv=CountVectorizer(stop_words='english')
cv = CountVectorizer(
    tokenizer=nltk.word_tokenize,
    ngram_range=(1, 1),
    stop_words='english',
)
# Fit on the processed text and produce the document-term count matrix.
text_counts = cv.fit_transform(data['processed_text'])

In [6]:
# Display the summary of the count matrix produced by the vectorizer.
text_counts
# Kept for reference (disabled): inspect the learned vocabulary.
#cv.vocabulary_
# Kept for reference (disabled): encode the Sentiment labels with LabelEncoder.
#from sklearn import preprocessing
#creating labelEncoder
#le = preprocessing.LabelEncoder()
#label = le.fit_transform(data['Sentiment'])

<1048575x158002 sparse matrix of type '<class 'numpy.int64'>'
	with 12277549 stored elements in Compressed Sparse Row format>

The three Naive Bayes models with support for partial_fit are MNB, GNB and BNB, so I have tried implementing all three of them.
Multinomial Naive Bayes (MNB) is the one that is particularly good with word counts, so we can tune that one the most and probably get better accuracy.  
I have tried three approaches:  
1. Using the text counts from CountVectorizer as input
2. Using the TF-IDF weights from TfidfVectorizer as input
3. Converting the counts from approach 1 into TF-IDF weights with TfidfTransformer and feeding those to the model  
I think approaches 2 and 3 are supposed to work equivalently, but I noticed a slight increase in accuracy for the MNB model with approach 3 while using the smaller dataset. One possible reason is that approach 3 inherits the nltk.word_tokenize tokenizer passed to the CountVectorizer, whereas the TfidfVectorizer in approach 2 uses its default tokenizer, so the two do not build the same vocabulary. (I couldn't try approach 3 on the 1M dataset because the memory required was too large.)  


In [7]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Hold out 25% of the count features and labels for testing; the seed is fixed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    data['Sentiment'],
    random_state=5,
    test_size=0.25,
)

In [8]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Baseline: Multinomial Naive Bayes with default parameters on the raw token counts.
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
y_pred = MNB.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the correct order matters as soon as an asymmetric
# metric (precision, recall, classification_report) is substituted here.
accuracy_score = metrics.accuracy_score(Y_test, y_pred)
accuracy_score


0.7616004943847656

In [9]:
# Grid-search MNB's smoothing parameter (alpha) and whether class priors are fitted (fit_prior).
MNB = MultinomialNB()
parameters = {
    'alpha': [1, 0.5, 0.3, 0.1, 0.01],
    'fit_prior': (True, False),
}
search = GridSearchCV(estimator=MNB, param_grid=parameters)
search.fit(X_train, Y_train)
# Keep the winning combination so a later cell can rebuild the model from it.
bestparams = search.best_params_
bestparams

{'alpha': 1, 'fit_prior': False}

In [10]:
# Retrain MNB with the parameters chosen by the grid search and score it on the held-out split.
MNB_best = MultinomialNB(alpha=bestparams['alpha'],fit_prior=bestparams['fit_prior'])
MNB_best.fit(X_train,Y_train)
y_pred = MNB_best.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the accuracy value
# itself is the same either way.
accuracy_score = metrics.accuracy_score(Y_test, y_pred)
accuracy_score


0.7676010131835938

My system doesn't have enough RAM to run GaussianNB on 1M entries, because it requires converting X_train to a dense array, so I have commented it out for now. We could maybe try running it on some other compute resource. When I ran GNB on a dataset of 5k elements, it performed comparatively worse than MNB and BNB, but with the TF-IDF vectorizer method GNB performed a little better than the other two. I think MNB is the popular model for sentiment analysis, so we will probably focus more on MNB and try to optimize it the most.

In [11]:
"""
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
GNB.fit(X_train.todense(), Y_train)
accuracy_score = metrics.accuracy_score(GNB.predict(X_test.todense()),Y_test)
accuracy_score
"""

'\nfrom sklearn.naive_bayes import GaussianNB\nGNB = GaussianNB()\nGNB.fit(X_train.todense(), Y_train)\naccuracy_score = metrics.accuracy_score(GNB.predict(X_test.todense()),Y_test)\naccuracy_score\n'

In [12]:

from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with default parameters on the same count features.
BNB = BernoulliNB()
BNB.fit(X_train, Y_train)
# Arguments follow the documented (y_true, y_pred) order; the accuracy value
# itself is the same either way.
accuracy_score_bnb = metrics.accuracy_score(Y_test, BNB.predict(X_test))
print(f'BNB accuracy = {accuracy_score_bnb*100:4.2f}%')

BNB accuracy = 75.05%


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build L2-normalised TF-IDF features directly from the processed text.
tfidf = TfidfVectorizer(stop_words="english",norm="l2")
text_count_2 = tfidf.fit_transform(data['processed_text'])

#splitting the data in test and training
x_train, x_test, y_train, y_test = train_test_split(text_count_2, data['Sentiment'],test_size=0.25,random_state=5)

#Models
# NOTE: MNB and BNB are the estimator objects created in earlier cells; fitting
# them here replaces whatever they were fitted on before.
# Metric calls use the documented (y_true, y_pred) order; the accuracy values
# are the same either way.
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(y_test, MNB.predict(x_test))
print(f'accuracy_score_mnb = {accuracy_score_mnb*100:4.2f}%')

BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(y_test, BNB.predict(x_test))
print(f'accuracy_score_bnb = {accuracy_score_bnb*100:4.2f}%')

#GNB.fit(x_train.todense(), y_train)
#accuracy_score_gnb = metrics.accuracy_score(GNB.predict(x_test.todense()), y_test)
#print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')

accuracy_score_mnb = 69.38%
accuracy_score_bnb = 75.51%


In [14]:
"""
text_counts.toarray()
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
fooTfmer = TfidfTransformer()

# Again, fit and transform
docs_tfidf = fooTfmer.fit_transform(text_counts)

#splitting the data in test and training
#from sklearn.model_selection() import train_test_split()
x_train, x_test, y_train, y_test = train_test_split(docs_tfidf, data['Sentiment'],test_size=0.25,random_state=5)

#defining the model
#compilimg the model -> we are going to use already used models GNB, MNB, CNB, BNB
#fitting the model
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

#CNB.fit(x_train, y_train)
#accuracy_score_cnb = metrics.accuracy_score(CNB.predict(x_test), y_test)
#print('accuracy_score_cnb = '+str('{:4.2f}'.format(accuracy_score_cnb*100))+'%')

#GNB.fit(x_train.todense(), y_train)
#accuracy_score_gnb = metrics.accuracy_score(GNB.predict(x_test.todense()), y_test)
#print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')

"""

"\ntext_counts.toarray()\n# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfooTfmer = TfidfTransformer()\n\n# Again, fit and transform\ndocs_tfidf = fooTfmer.fit_transform(text_counts)\n\n#splitting the data in test and training\n#from sklearn.model_selection() import train_test_split()\nx_train, x_test, y_train, y_test = train_test_split(docs_tfidf, data['Sentiment'],test_size=0.25,random_state=5)\n\n#defining the model\n#compilimg the model -> we are going to use already used models GNB, MNB, CNB, BNB\n#fitting the model\nMNB.fit(x_train, y_train)\naccuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)\nprint('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')\n\nBNB.fit(x_train, y_train)\naccuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)\nprint('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100

Things to do:  
1. Run on the whole voted dataset on a larger compute resource (suggested by the TA). If needed, use the partial_fit function to train in batches. 
2. Build a pipeline of CountVectorizer, TfidfTransformer and MNB, then tune the parameters of the whole pipeline (n-gram range, TF-IDF norm, MNB alpha, etc.) using GridSearchCV.
3. Try out other optimizations and tuning, if any exist.  
4. Analyze the performance of each of the three models (MNB, GNB, BNB) using metrics such as the classification report. 
