In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle
import joblib

## Loading the dataset

In [2]:
df_test = pd.read_csv('./Datasets/twitter_validation.csv')

df_test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


## Preprocessing

In [3]:
df_test.rename(columns={'3364':'ID','Facebook':'Locations','Irrelevant':'Sentiment',"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣":'Text'},inplace=True)

In [4]:
df_test.head()

Unnamed: 0,ID,Locations,Sentiment,Text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [5]:
df_test = df_test[df_test['Sentiment']!='Neutral']
df_test = df_test[df_test['Sentiment']!='Irrelevant']

df_test['Sentiment'].value_counts()

Sentiment
Positive    277
Negative    266
Name: count, dtype: int64

In [34]:
df_test.isna().sum()

ID           0
Locations    0
Sentiment    0
Text         0
dtype: int64

## Text Preprocessing

In [6]:
lemmatizer = WordNetLemmatizer()
sentences_test = df_test['Text'].values
corpus_test = []

for sent in tqdm(sentences_test):
    sent = sent.lower()
    sent = re.sub(r'[^a-z0-9 ]', ' ', sent)
    sent = sent.split()
    sent = [w for w in sent if w not in stopwords.words('english')]
    sent = ' '.join(sent)
    sent = sent.split()
    sent = [lemmatizer.lemmatize(word) for word in sent]
    sent = ' '.join(sent)
    corpus_test.append(sent)

corpus_test[:10]

100%|██████████| 543/543 [00:04<00:00, 125.69it/s]


['microsoft pay word function poorly samsungus chromebook',
 'csgo matchmaking full closet hacking truly awful game',
 'hi eahelp madeleine mccann cellar past 13 year little sneaky thing escaped whilst loading fifa point took card use paypal account working help resolve please',
 'thank eamaddennfl new te austin hooper orange brown brown austinhooper18 pic twitter com grg4xzfkon',
 'rocket league sea thief rainbow six siege love playing three stream best stream twitch rocketleague seaofthieves rainbowsixsiege follow',
 'as still knee deep assassin creed odyssey way anytime soon lmao',
 'fix jesus please fix world going playstation askplaystation playstationsup treyarch callofduty negative 345 silver wolf error code pic twitter com ziryhrf59q',
 'professional dota 2 scene fucking exploding completely welcome get garbage',
 'itching assassinate tccgif assassinscreedblackflag assassinscreed thecapturedcollective pic twitter com vv8mogtcjw',
 'fredtjoseph hey fred comcast cut cable verizon

## Loading TF-IDF model

In [7]:
tfidf = joblib.load('tfidf_sentiment_model.joblib')

In [8]:
encoded_text = tfidf.transform(corpus_test)

In [9]:
encoded_text.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
tfidf.get_feature_names_out()

array(['00', '000', '00011', ..., 'zzgi8xvk7t', 'zzvfsrhewg', 'zzz'],
      dtype=object)

## Model preparation

In [16]:
y_test = pd.get_dummies(df_test['Sentiment'])["Positive"].values

y_test = y_test.astype(int)

y_test

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

In [18]:
encoded_text.shape

(543, 17197)

In [19]:
y_test.shape

(543,)

In [11]:
sentiment_model = pickle.load(open('model.mdl', 'rb'))

In [20]:
y_pred = sentiment_model.predict(encoded_text)

y_pred

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

## Model Evaluation

In [35]:
accuracy_score(y_pred, y_test)*100

93.00184162062615