## Liberaries

In [51]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle
import joblib

## Loading the dataset

In [2]:
df = pd.read_csv('./Datasets/twitter_training.csv')

df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [37]:
df_test = pd.read_csv('./Datasets/twitter_validation.csv')

df_test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


## Preprocessing

Change the column names

In [3]:
df.rename(columns={'2401':'ID','Borderlands':'Locations','Positive':'Sentiment','im getting on borderlands and i will murder you all ,':'Text'},inplace=True)

In [38]:
df_test.rename(columns={'3364':'ID','Facebook':'Locations','Irrelevant':'Sentiment',"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣":'Text'},inplace=True)

In [4]:
df.head()

Unnamed: 0,ID,Locations,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [39]:
df_test.head()

Unnamed: 0,ID,Locations,Sentiment,Text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [5]:
df['Sentiment'].value_counts()

Sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [40]:
df_test['Sentiment'].value_counts()

Sentiment
Neutral       285
Positive      277
Negative      266
Irrelevant    171
Name: count, dtype: int64

remove the neutral and irrelevant sentiments

In [6]:
df = df[df['Sentiment']!='Neutral']
df = df[df['Sentiment']!='Irrelevant']

df['Sentiment'].value_counts()

Sentiment
Negative    22542
Positive    20831
Name: count, dtype: int64

In [41]:
df_test = df_test[df_test['Sentiment']!='Neutral']
df_test = df_test[df_test['Sentiment']!='Irrelevant']

df_test['Sentiment'].value_counts()

Sentiment
Positive    277
Negative    266
Name: count, dtype: int64

remove null values

In [7]:
df.isna().sum()

df.dropna(inplace=True)

In [8]:
df.isna().sum()

ID           0
Locations    0
Sentiment    0
Text         0
dtype: int64

In [42]:
df_test.isna().sum()

ID           0
Locations    0
Sentiment    0
Text         0
dtype: int64

## Text Preprocessing

In [9]:
from nltk.stem import WordNetLemmatizer
sentences = df['Text'].values
corpus = []
lemmatizer = WordNetLemmatizer()

for sent in tqdm(sentences):
    sent = sent.lower()
    sent = re.sub(r'[^a-z0-9 ]', ' ', sent)
    sent = sent.split()
    sent = [w for w in sent if w not in stopwords.words('english')]
    sent = ' '.join(sent)
    sent = sent.split()
    sent = [lemmatizer.lemmatize(word) for word in sent]
    sent = ' '.join(sent)
    corpus.append(sent)

corpus[:10]

100%|██████████| 43012/43012 [02:16<00:00, 315.29it/s]


['coming border kill',
 'im getting borderland kill',
 'im coming borderland murder',
 'im getting borderland 2 murder',
 'im getting borderland murder',
 'spent hour making something fun know huge borderland fan maya one favorite character decided make wallpaper pc original image versus creation made enjoy pic twitter com mlsi5wf9jg',
 'spent couple hour something fun know huge borderland fan maya one favorite character decided make wallpaper pc original picture compared creation made fun pic twitter com mlsi5wf9jg',
 'spent hour something fun know huge borderland fan maya one favorite character',
 'spent hour making something fun know huge rhandlerr fan maya one favorite character decided make wallpaper pc original image versus creation made enjoy pic twitter com mlsi5wf9jg',
 '2010 spent hour making something fun know huge rhandlerr fan maya one favorite character decided make wallpaper pc original image versus creation made enjoy pic twitter com mlsi5wf9jg']

In [43]:
sentences_test = df_test['Text'].values
corpus_test = []

for sent in tqdm(sentences_test):
    sent = sent.lower()
    sent = re.sub(r'[^a-z0-9 ]', ' ', sent)
    sent = sent.split()
    sent = [w for w in sent if w not in stopwords.words('english')]
    sent = ' '.join(sent)
    sent = sent.split()
    sent = [lemmatizer.lemmatize(word) for word in sent]
    sent = ' '.join(sent)
    corpus_test.append(sent)

corpus_test[:10]

100%|██████████| 543/543 [00:03<00:00, 168.92it/s]


['microsoft pay word function poorly samsungus chromebook',
 'csgo matchmaking full closet hacking truly awful game',
 'hi eahelp madeleine mccann cellar past 13 year little sneaky thing escaped whilst loading fifa point took card use paypal account working help resolve please',
 'thank eamaddennfl new te austin hooper orange brown brown austinhooper18 pic twitter com grg4xzfkon',
 'rocket league sea thief rainbow six siege love playing three stream best stream twitch rocketleague seaofthieves rainbowsixsiege follow',
 'as still knee deep assassin creed odyssey way anytime soon lmao',
 'fix jesus please fix world going playstation askplaystation playstationsup treyarch callofduty negative 345 silver wolf error code pic twitter com ziryhrf59q',
 'professional dota 2 scene fucking exploding completely welcome get garbage',
 'itching assassinate tccgif assassinscreedblackflag assassinscreed thecapturedcollective pic twitter com vv8mogtcjw',
 'fredtjoseph hey fred comcast cut cable verizon

In [10]:
tfidf = TfidfVectorizer()

encoded_text = tfidf.fit_transform(corpus)

print(tfidf.get_feature_names_out())
encoded_text.toarray()

['00' '000' '00011' ... 'zzgi8xvk7t' 'zzvfsrhewg' 'zzz']


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
encoded_text_test = tfidf.transform(corpus_test)


## Train Test Split

In [27]:
labels = pd.get_dummies(df['Sentiment'])["Positive"].values

labels = labels.astype(int)

labels[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [28]:
x_train = encoded_text.toarray()
y_train = labels

## Model Train

In [32]:
mdl = MultinomialNB()

mdl.fit(x_train, y_train)

In [33]:
y_pred = mdl.predict(x_train)

y_pred

array([1, 1, 1, ..., 1, 1, 0])

In [34]:
y_train

array([1, 1, 1, ..., 1, 1, 1])

In [36]:
accuracy_score(y_pred, y_train)

0.9097461173625965

## Model Save

In [48]:
# Pickle the model
pickled_model = pickle.dumps(mdl)

# Write the pickled model to the file
with open('model.mdl', 'wb') as f:
    f.write(pickled_model)

In [52]:
joblib.dump(tfidf,'tfidf_sentiment_model.joblib')

['tfidf_sentiment_model.joblib']