In [13]:
#Load data-preprocessing libraries
import pandas as pd
import numpy as np

#Text processing libraries
import re
import nltk
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction

#Load data-visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

#evaluation metrics
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

lemmatizer = WordNetLemmatizer() 

pd.pandas.set_option('display.max_columns',None)

[nltk_data] Downloading package stopwords to C:\Users\amulya
[nltk_data]     shetty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Read the dataset
df=pd.read_csv('../Twitter_cleaned.csv',index_col='Unnamed: 0')
df.head()

Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,Positive,11,43,im getting borderland murder,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,Positive,12,40,coming border kill,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,Positive,10,41,im getting borderland kill,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,Positive,10,42,im coming borderland murder,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,Positive,12,46,im getting borderland murder,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
df.shape

(73996, 35)

In [7]:
#check for null values
df.isnull().sum()

Sentiment                               0
Tweet_word_count                        0
Tweet_char_count                        0
Tweet_clean                          1643
ApexLegends                             0
AssassinsCreed                          0
Battlefield                             0
Borderlands                             0
CS-GO                                   0
CallOfDuty                              0
CallOfDutyBlackopsColdWar               0
Cyberpunk2077                           0
Dota2                                   0
FIFA                                    0
Facebook                                0
Fortnite                                0
Google                                  0
GrandTheftAuto(GTA)                     0
Hearthstone                             0
HomeDepot                               0
LeagueOfLegends                         0
MaddenNFL                               0
Microsoft                               0
NBA2K                             

In [8]:
df.dropna(axis=0,how='any',inplace=True)

In [9]:
df.isnull().sum()

Sentiment                            0
Tweet_word_count                     0
Tweet_char_count                     0
Tweet_clean                          0
ApexLegends                          0
AssassinsCreed                       0
Battlefield                          0
Borderlands                          0
CS-GO                                0
CallOfDuty                           0
CallOfDutyBlackopsColdWar            0
Cyberpunk2077                        0
Dota2                                0
FIFA                                 0
Facebook                             0
Fortnite                             0
Google                               0
GrandTheftAuto(GTA)                  0
Hearthstone                          0
HomeDepot                            0
LeagueOfLegends                      0
MaddenNFL                            0
Microsoft                            0
NBA2K                                0
Nvidia                               0
Overwatch                

In [10]:
#Seperate dependent and independent features
X=df.loc[:,df.columns!='Sentiment']
y=df['Sentiment']

In [11]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_valid.shape, y_valid.shape)

Train (50647, 34) (50647,)
Test (21706, 34) (21706,)


In [15]:
vectorizer = TfidfVectorizer(stop_words='english') 
#TfidfVectorizer is a tool from scikit-learn that takes care of computing TF-IDF values for the terms in the text data.
features_train= vectorizer.fit_transform(X_train['Tweet_clean'])
#features_train and features_valid are the resulting TF-IDF matrices for the training and validation sets, respectively.
features_valid= vectorizer.transform(X_valid['Tweet_clean'])
#These matrices are typically used as input features for machine learning models, where the model can learn patterns based on the importance of different terms in each document.
features_train.shape, features_valid.shape

((50647, 31197), (21706, 31197))

In [16]:
#Function to fit and apply a model
def model_apply(model):
    #train the model
    model.fit(features_train,y_train)
    #make predictions
    pred=model.predict(features_valid)
    #model evaluation
    print(model)
    print('Accuracy score: ',accuracy_score(pred,y_valid))
    print('Weighted F1 score: ',f1_score(y_pred=pred,y_true=y_valid,average='weighted'))
    print('Confusion Matrix: \n',confusion_matrix(pred,y_valid))

In [17]:
#Multinomial Naive Bayes
nb=MultinomialNB()
model_apply(nb)

MultinomialNB()
Accuracy score:  0.7232562425135907
Weighted F1 score:  0.7121340682505694
Confusion Matrix: 
 [[1532   24   39   21]
 [1020 5909 1089  816]
 [ 247  210 3207  219]
 [ 961  461  900 5051]]


In [21]:
#Logistic Regression
lr=LogisticRegression(random_state=1,max_iter=500)
model_apply(lr)

LogisticRegression(max_iter=500, random_state=1)
Accuracy score:  0.7767898276974109
Weighted F1 score:  0.7755428736534683
Confusion Matrix: 
 [[2499  188  209  218]
 [ 422 5543  549  499]
 [ 321  439 3877  448]
 [ 518  434  600 4942]]


In [26]:
#Decision Tree
dtc=DecisionTreeClassifier(random_state=1)
model_apply(dtc)

DecisionTreeClassifier(random_state=1)
Accuracy score:  0.7810743573205565
Weighted F1 score:  0.7806003392282307
Confusion Matrix: 
 [[2614  244  246  274]
 [ 319 5417  427  430]
 [ 319  407 3945  425]
 [ 508  536  617 4978]]


In [27]:
#Random Forest
rf=RandomForestClassifier(random_state=1)
model_apply(rf)

RandomForestClassifier(random_state=1)
Accuracy score:  0.9021008016216714
Weighted F1 score:  0.9018757980368215
Confusion Matrix: 
 [[3107   50   56   58]
 [ 176 6161  215  218]
 [ 162  145 4655  173]
 [ 315  248  309 5658]]
