In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import words
from spellchecker import SpellChecker

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

from nltk.stem import WordNetLemmatizer

In [2]:
train = pd.read_csv('tweet_NLP_train.csv', encoding="ISO-8859-1")
test = pd.read_csv('tweet_NLP_test.csv', encoding="ISO-8859-1")
df = train.append(test, sort = False)

In [3]:
df.head(20)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative


In [4]:
df['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [5]:
mapping = {'Neutral' : 0, 'Positive' : 1, 'Extremely Negative' : -1, 'Negative' : -1,'Extremely Positive' : 1}
df['label'] = df['Sentiment'].map(mapping)

In [6]:
df.head(20)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,-1
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive,1
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive,1
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral,0
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive,1
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative,-1


In [7]:
columns_to_keep = ['OriginalTweet','label']
df = df[columns_to_keep]

In [8]:
df.head(20)

Unnamed: 0,OriginalTweet,label
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0
1,advice Talk to your neighbours family to excha...,1
2,Coronavirus Australia: Woolworths to give elde...,1
3,My food stock is not the only one which is emp...,1
4,"Me, ready to go at supermarket during the #COV...",-1
5,As news of the regionÂs first confirmed COVID...,1
6,Cashier at grocery store was sharing his insig...,1
7,Was at the supermarket today. Didn't buy toile...,0
8,Due to COVID-19 our retail store and classroom...,1
9,"For corona prevention,we should stop to buy th...",-1


In [9]:
df.dropna(inplace=True)

### Removing Urls

In [10]:
def url_cleaning(tweet):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'link', tweet)

df['OriginalTweet'] = df['OriginalTweet'].apply(url_cleaning)
display(df['OriginalTweet'].head(5))

0    @MeNyrbie @Phil_Gahan @Chrisitv link and link ...
1    advice Talk to your neighbours family to excha...
2    Coronavirus Australia: Woolworths to give elde...
3    My food stock is not the only one which is emp...
4    Me, ready to go at supermarket during the #COV...
Name: OriginalTweet, dtype: object

In [11]:

def text_cleaning_1(tweet):
    tweet = re.sub(r" usa ", " America ", tweet)
    tweet = re.sub(r" USA ", " America ", tweet)
    tweet = re.sub(r" u s ", " America ", tweet)
    tweet = re.sub(r" uk ", " England ", tweet)
    tweet = re.sub(r" UK ", " England ", tweet)
    tweet = re.sub(r"USAgov", "USA government", tweet)
    tweet = re.sub(r"the US", "America", tweet)
    tweet = re.sub(r"Coronavirus", " covid ", tweet)
    tweet = re.sub(r"Covid19", " covid ", tweet)
    tweet = re.sub(r"\W", " ", tweet)
    tweet = re.sub(r"_", " ", tweet)
    return str(tweet)

In [12]:
df['OriginalTweet'] = df['OriginalTweet'].apply(text_cleaning_1)
display(df['OriginalTweet'].head(5))

0     MeNyrbie  Phil Gahan  Chrisitv link and link ...
1    advice Talk to your neighbours family to excha...
2     covid  Australia  Woolworths to give elderly ...
3    My food stock is not the only one which is emp...
4    Me  ready to go at supermarket during the  COV...
Name: OriginalTweet, dtype: object

In [13]:
df.head(20)

Unnamed: 0,OriginalTweet,label
0,MeNyrbie Phil Gahan Chrisitv link and link ...,0
1,advice Talk to your neighbours family to excha...,1
2,covid Australia Woolworths to give elderly ...,1
3,My food stock is not the only one which is emp...,1
4,Me ready to go at supermarket during the COV...,-1
5,As news of the regionÂ s first confirmed COVID...,1
6,Cashier at grocery store was sharing his insig...,1
7,Was at the supermarket today Didn t buy toile...,0
8,Due to COVID 19 our retail store and classroom...,1
9,For corona prevention we should stop to buy th...,-1


### Lower Case

In [14]:
df['OriginalTweet'] = df['OriginalTweet'].str.lower()

### Removing Stop words

In [15]:
def stop_word(tweet): 
    stop_words = set(stopwords.words('english')) 

    word_tokens = word_tokenize(tweet) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    return ' '.join(filtered_sentence)

In [16]:
df['OriginalTweet'] = df['OriginalTweet'].apply(stop_word)

In [17]:
df['OriginalTweet'].head(20)

0           menyrbie phil gahan chrisitv link link link
1     advice talk neighbours family exchange phone n...
2     covid australia woolworths give elderly disabl...
3     food stock one empty please panic enough food ...
4     ready go supermarket covid19 outbreak paranoid...
5     news regionâ first confirmed covid 19 case cam...
6     cashier grocery store sharing insights covid 1...
7     supermarket today buy toilet paper rebel toile...
8     due covid 19 retail store classroom atlanta op...
9     corona prevention stop buy things cash use onl...
10    month crowding supermarkets restaurants howeve...
11    due covid 19 situation increased demand food p...
12    horningsea caring community letâ look less cap...
13    need stock food amazon deliver whatever need c...
14    adara releases covid 19 resource center travel...
15    lines grocery store unpredictable eating safe ...
16                                              13 link
17    eyeonthearctic 16mar20 russia consumer sur

In [18]:
spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    words = text.split()
    for word in words:
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        elif word not in misspelled_words:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [19]:
#df['OriginalTweet'] = df['OriginalTweet'].apply(correct_spellings)

### Tokenize and Lemmatizer

In [20]:
df['OriginalTweet'] = df['OriginalTweet'].apply(word_tokenize)
lem = WordNetLemmatizer()
def lemma_wordnet(input):
    return [lem.lemmatize(w) for w in input]
df['OriginalTweet'] = df['OriginalTweet'].apply(lemma_wordnet)

In [21]:
def combine_word(tweet):
    return " ".join(tweet)
df['OriginalTweet'] = df['OriginalTweet'].apply(combine_word)

#### Split data into Train and Test data sets 

In [22]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(df['OriginalTweet'], 
                                                    df['label'], 
                                                    random_state=0)

## Vectorization with CountVectorizer

In [23]:
vect = CountVectorizer(min_df=5, ngram_range=[1,4], analyzer='char_wb').fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

## Vectorization with TFID vectorizer

In [24]:
vect = TfidfVectorizer(min_df=3, ngram_range=[1,4]).fit(X_train)
X_train_vect_TFID = vect.transform(X_train)
X_test_vect_TFID = vect.transform(X_test)

## Applying ML

In [25]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

#### MultinomialNB

In [26]:
print("MultinomialNB with CountVectorizer\n")
alpha = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 100.0]
for value in alpha:
    model = MultinomialNB(alpha = value).fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With alpha set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

MultinomialNB with CountVectorizer

With alpha set to 0.01, AUC score of model is 0.7152582904015604 and Accuracy Score is 0.6274579588931399

With alpha set to 0.1, AUC score of model is 0.7169100234286234 and Accuracy Score is 0.6268351276803986

With alpha set to 1.0, AUC score of model is 0.7176860068233121 and Accuracy Score is 0.6264792241302607

With alpha set to 5.0, AUC score of model is 0.7131425010541182 and Accuracy Score is 0.6379571136222084

With alpha set to 10.0, AUC score of model is 0.6924747735030957 and Accuracy Score is 0.6364445235341223

With alpha set to 20.0, AUC score of model is 0.6576020460674498 and Accuracy Score is 0.6212296467657265

With alpha set to 100.0, AUC score of model is 0.5613603080773472 and Accuracy Score is 0.5115223774357149



In [27]:
print("MultinomialNB with Tfid Vectorizer\n")
alpha = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 100.0]
for value in alpha:
    model = MultinomialNB(alpha = value).fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With alpha set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

MultinomialNB with Tfid Vectorizer

With alpha set to 0.01, AUC score of model is 0.7005372663131967 and Accuracy Score is 0.6506806655396388

With alpha set to 0.1, AUC score of model is 0.7071882902911507 and Accuracy Score is 0.6623365068066553

With alpha set to 1.0, AUC score of model is 0.6649646527060235 and Accuracy Score is 0.6408043420233117

With alpha set to 5.0, AUC score of model is 0.6262331017236796 and Accuracy Score is 0.5994305543197793

With alpha set to 10.0, AUC score of model is 0.6040335787584765 and Accuracy Score is 0.5702464632084705

With alpha set to 20.0, AUC score of model is 0.5806023159440611 and Accuracy Score is 0.5386600231337307

With alpha set to 100.0, AUC score of model is 0.5271011690940559 and Accuracy Score is 0.46587774713052765



####  Decision Tree Classifier

In [28]:
print("DecisionTreeClassifier with CountVectorizer\n")
depth = [3,6,9,12,15]
for value in depth:
    model = DecisionTreeClassifier(max_depth = value).fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With max_depth set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

DecisionTreeClassifier with CountVectorizer

With max_depth set to 3, AUC score of model is 0.6119052993532488 and Accuracy Score is 0.5039594269952843

With max_depth set to 6, AUC score of model is 0.5832984382301798 and Accuracy Score is 0.5105436426728357

With max_depth set to 9, AUC score of model is 0.65078846172782 and Accuracy Score is 0.5451552629237477

With max_depth set to 12, AUC score of model is 0.6626382620137048 and Accuracy Score is 0.5665984518195569

With max_depth set to 15, AUC score of model is 0.652440245802373 and Accuracy Score is 0.5714921256339532



In [29]:
print("DecisionTreeClassifier with CountVectorizer\n")
depth = [3,6,9,12,15]
for value in depth:
    model = DecisionTreeClassifier(max_depth = value).fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With max_depth set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

DecisionTreeClassifier with CountVectorizer

With max_depth set to 3, AUC score of model is 0.5460822452617824 and Accuracy Score is 0.49070201975264705

With max_depth set to 6, AUC score of model is 0.5458518581024593 and Accuracy Score is 0.4899012367648367

With max_depth set to 9, AUC score of model is 0.6304185258529725 and Accuracy Score is 0.5179286413381974

With max_depth set to 12, AUC score of model is 0.6448126574609092 and Accuracy Score is 0.5405285167719548

With max_depth set to 15, AUC score of model is 0.649510993824582 and Accuracy Score is 0.5488922502001957



### Logistic Regression

In [30]:
print("Logistic Regression with CountVectorizer\n")
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = LogisticRegression(C = value,solver='lbfgs').fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With C set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

Logistic Regression with CountVectorizer



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 1, AUC score of model is 0.7595186879717106 and Accuracy Score is 0.7032654150725154



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 5, AUC score of model is 0.7477617730234246 and Accuracy Score is 0.6908087908176884



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 10, AUC score of model is 0.7539396935105875 and Accuracy Score is 0.7002402348963431



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 20, AUC score of model is 0.7516635389202643 and Accuracy Score is 0.6973040306077053



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 100, AUC score of model is 0.7537919641865741 and Accuracy Score is 0.6979268618204466

With C set to 1000, AUC score of model is 0.7556868151358375 and Accuracy Score is 0.7020197526470326



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [31]:
print("Logistic Regression with Tfid Vectorizer\n")
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = LogisticRegression(C = value,solver='lbfgs').fit(X_train_vect_TFID, y_train);
    y_predicted = model.predict(X_test_vect_TFID); 
    score = multiclass_roc_auc_score(y_test, y_predicted);
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With C set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n");

Logistic Regression with Tfid Vectorizer



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 1, AUC score of model is 0.8006730398519283 and Accuracy Score is 0.7693744995106326



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 5, AUC score of model is 0.8166481576763278 and Accuracy Score is 0.7808523890025802



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 10, AUC score of model is 0.8185219863882454 and Accuracy Score is 0.782098051428063



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 20, AUC score of model is 0.8130204230776438 and Accuracy Score is 0.7745351009876323



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


With C set to 100, AUC score of model is 0.815886284009923 and Accuracy Score is 0.7767594981759943

With C set to 1000, AUC score of model is 0.8118984612045869 and Accuracy Score is 0.7711540172613222



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Rigid

In [32]:
print("Rigid with Tfid Vectorizer\n")
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = RidgeClassifier(alpha = value).fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = accuracy_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With C set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n");

Rigid with Tfid Vectorizer

With C set to 1, AUC score of model is 0.7643918498087019 and Accuracy Score is 0.7643918498087019

With C set to 5, AUC score of model is 0.7565619717056677 and Accuracy Score is 0.7565619717056677

With C set to 10, AUC score of model is 0.7328943856214966 and Accuracy Score is 0.7328943856214966

With C set to 20, AUC score of model is 0.6970371029451019 and Accuracy Score is 0.6970371029451019

With C set to 100, AUC score of model is 0.6022777827208826 and Accuracy Score is 0.6022777827208826

With C set to 1000, AUC score of model is 0.43518106593113265 and Accuracy Score is 0.43518106593113265

