In [57]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("twitter_training.csv")
print(df.shape)
df.head()

(74681, 4)


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


### Drop unwanted Columns

In [3]:
df.drop(columns = ["2401"],inplace=True)

In [4]:
df.head()

Unnamed: 0,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
df.Borderlands.value_counts()

Borderlands
Microsoft                            2400
MaddenNFL                            2400
TomClancysRainbowSix                 2400
LeagueOfLegends                      2394
CallOfDuty                           2394
Verizon                              2382
CallOfDutyBlackopsColdWar            2376
ApexLegends                          2376
Facebook                             2370
WorldOfCraft                         2364
Dota2                                2364
NBA2K                                2352
TomClancysGhostRecon                 2346
Battlefield                          2346
FIFA                                 2340
Overwatch                            2334
Xbox(Xseries)                        2334
johnson&johnson                      2328
Amazon                               2316
HomeDepot                            2310
PlayStation5(PS5)                    2310
CS-GO                                2304
Cyberpunk2077                        2304
GrandTheftAuto(GTA)   

In [6]:
df.Positive.value_counts()

Positive
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

### Renaming Columns

In [9]:
df.rename(columns = {'Borderlands':'Game_Name',
                     'Positive': 'Sentiment',
                     'im getting on borderlands and i will murder you all ,':'tweet'},inplace=True)

In [10]:
df.head()

Unnamed: 0,Game_Name,Sentiment,tweet
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


### Handling missing values

In [11]:
df.isna().sum()

Game_Name      0
Sentiment      0
tweet        686
dtype: int64

In [12]:
df = df.dropna()

In [13]:
df.isna().sum()

Game_Name    0
Sentiment    0
tweet        0
dtype: int64

### Create a new DataSet because 70000 sample is too much

In [14]:
df.Sentiment.value_counts()

Sentiment
Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [15]:
df_positive = df[df.Sentiment == 'Positive'].iloc[:2000]
df_negative = df[df.Sentiment == 'Negative'].iloc[:2000]
df_neutral = df[df.Sentiment == 'Neutral'].iloc[:2000]
df_irr = df[df.Sentiment == 'Irrelevant'].iloc[:2000]


In [16]:
df2 = pd.concat([df_positive,df_negative,df_neutral,df_irr],axis=0)

In [17]:
df2.shape

(8000, 3)

In [18]:
df2.head()

Unnamed: 0,Game_Name,Sentiment,tweet
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


In [19]:
df2.Sentiment.value_counts()

Sentiment
Positive      2000
Negative      2000
Neutral       2000
Irrelevant    2000
Name: count, dtype: int64

### Categorical to Numerical data transformation

In [20]:
df2.Game_Name.value_counts()

Game_Name
CallOfDutyBlackopsColdWar    2343
Borderlands                  2279
Amazon                       1940
Overwatch                    1100
Xbox(Xseries)                 338
Name: count, dtype: int64

In [21]:
game_df = (pd.get_dummies(df2['Game_Name'])).astype(int)

In [22]:
game_df

Unnamed: 0,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
10628,0,0,0,0,1
10629,0,0,0,0,1
10630,0,0,0,0,1
10631,0,0,0,0,1


In [23]:
df2.drop(columns = ['Game_Name'],inplace=True)

In [24]:
df2.head()

Unnamed: 0,Sentiment,tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [25]:
df2.shape,game_df.shape

((8000, 2), (8000, 5))

### Text Preprocessing

In [27]:
nlp = spacy.load('en_core_web_sm')

In [28]:
def lemmatization(text):
    doc = nlp(text)
    lemmaList=[word.lemma_ for word in doc]
    return ' '.join(lemmaList)

In [29]:
df2['Lemma'] = df2['tweet'].apply(lemmatization)

In [30]:
def remove_stopwords(text):
    doc = nlp(text)
    no_stopwords = [word.text for word in doc if not word.is_stop and not word.is_punct]
    return ' '.join(no_stopwords)

In [31]:
df2['final'] = df2['Lemma'].apply(remove_stopwords)

In [32]:
df2.head()

Unnamed: 0,Sentiment,tweet,Lemma,final
0,Positive,I am coming to the borders and I will kill you...,"I be come to the border and I will kill you all ,",come border kill
1,Positive,im getting on borderlands and i will kill you ...,"I m get on borderland and I will kill you all ,",m borderland kill
2,Positive,im coming on borderlands and i will murder you...,I m come on borderland and I will murder you a...,m come borderland murder
3,Positive,im getting on borderlands 2 and i will murder ...,I m get on borderland 2 and I will murder you ...,m borderland 2 murder
4,Positive,im getting into borderlands and i can murder y...,I m get into borderland and I can murder you a...,m borderland murder


In [33]:
df2.drop(columns = ['tweet','Lemma'],inplace=True)

In [34]:
df2.head()

Unnamed: 0,Sentiment,final
0,Positive,come border kill
1,Positive,m borderland kill
2,Positive,m come borderland murder
3,Positive,m borderland 2 murder
4,Positive,m borderland murder


In [35]:
sentiment_df = pd.concat([df2,game_df],axis=1)

In [36]:
sentiment_df.head()

Unnamed: 0,Sentiment,final,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,Positive,come border kill,0,1,0,0,0
1,Positive,m borderland kill,0,1,0,0,0
2,Positive,m come borderland murder,0,1,0,0,0
3,Positive,m borderland 2 murder,0,1,0,0,0
4,Positive,m borderland murder,0,1,0,0,0


In [37]:
sentiment_df.shape

(8000, 7)

In [38]:
x = sentiment_df.drop(columns = ['Sentiment'])
y = sentiment_df.Sentiment

In [39]:
x.head()

Unnamed: 0,final,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,come border kill,0,1,0,0,0
1,m borderland kill,0,1,0,0,0
2,m come borderland murder,0,1,0,0,0
3,m borderland 2 murder,0,1,0,0,0
4,m borderland murder,0,1,0,0,0


In [40]:
y.head()

0    Positive
1    Positive
2    Positive
3    Positive
4    Positive
Name: Sentiment, dtype: object

### TF-IDF vectorizer on the column 'final'

In [42]:
tfidf = TfidfVectorizer()

In [43]:
tfidf.fit(x)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [45]:
tfidf_matrix = tfidf.transform(x['final']).toarray()

In [52]:
tfidf_matrix
print(tfidf_matrix.shape)

(8000, 7)


In [48]:
vectorizer_df = pd.DataFrame(tfidf_matrix,columns = tfidf.get_feature_names_out())

In [50]:
vectorizer_df.head()

Unnamed: 0,amazon,borderlands,callofdutyblackopscoldwar,final,overwatch,xbox,xseries
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
vectorizer_df.shape

(8000, 7)

In [53]:
x.drop(columns = ['final'],inplace=True)

In [54]:
X = pd.concat([x,vectorizer_df],axis=1)

In [55]:
X.head()

Unnamed: 0,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries),amazon,borderlands,callofdutyblackopscoldwar,final,overwatch,xbox,xseries
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train test split

In [61]:
X = X.iloc[:8000]

In [62]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Building Model

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(n_jobs=1,
                               random_state=42,
                               n_estimators=200)

model.fit(x_train,y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [64]:
pred = model.predict(x_test)

In [65]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

  Irrelevant       0.86      0.41      0.55       399
    Negative       0.57      0.11      0.19       379
     Neutral       0.55      0.55      0.55       397
    Positive       0.43      0.94      0.59       425

    accuracy                           0.51      1600
   macro avg       0.60      0.50      0.47      1600
weighted avg       0.60      0.51      0.47      1600



In [66]:
pred[:3]

array(['Positive', 'Positive', 'Positive'], dtype=object)

In [67]:
y_test[:3]

1246    Negative
2892    Negative
4073    Positive
Name: Sentiment, dtype: object