## import libraries

In [1]:
import pandas as pd 
import numpy as np 
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df_train= pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv')
df_test= pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv')

## data preprocessing 

In [3]:
df_train 

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df_train.rename(columns={'2401': 'id',
                   'Borderlands':'country',
                  'Positive': 'Label',
                  'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)

In [5]:
df_test 

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [6]:
df_test.rename(columns={'3364' : 'id',
                        'Facebook':'country',
                         'Irrelevant' : 'Label',
                         'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)

In [7]:
df_test

Unnamed: 0,id,country,Label,text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [8]:
df_train.head(5)

Unnamed: 0,id,country,Label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [9]:
df_train.shape

(74681, 4)

In [10]:
df_train.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74681 non-null  int64 
 1   country  74681 non-null  object
 2   Label    74681 non-null  object
 3   text     73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [11]:
df_train.isnull().sum()

id           0
country      0
Label        0
text       686
dtype: int64

In [12]:
df_test.isnull().sum()

id         0
country    0
Label      0
text       0
dtype: int64

In [13]:
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

In [14]:
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

In [15]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [16]:
df_train['Preprocessed Text'] = df_train['text'].apply(preprocess) 


In [17]:
df_test['Preprocessed Text'] = df_test['text'].apply(preprocess) 


## Model Structure 

In [18]:
X_train = df_train['Preprocessed Text'] 
y_train = df_train.Label

X_test = df_test['Preprocessed Text']
y_test = df_test.Label

In [19]:
le_model = LabelEncoder()
df_train['Label'] = le_model.fit_transform(df_train['Label']) 

In [20]:
df_test['Label'] = le_model.fit_transform(df_test['Label']) 

In [21]:
v = TfidfVectorizer()

X_train_cv = v.fit_transform(X_train)
X_test_cv = v.transform(X_test)

In [22]:
RFC_model = RandomForestClassifier()

RFC_model.fit(X_train_cv, y_train)

In [23]:
y_pred = RFC_model.predict(X_test_cv)

In [24]:
print(accuracy_score(y_test, y_pred))


0.944944944944945


In [25]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  Irrelevant       0.97      0.90      0.94       171
    Negative       0.94      0.95      0.94       266
     Neutral       0.92      0.96      0.94       285
    Positive       0.96      0.95      0.96       277

    accuracy                           0.94       999
   macro avg       0.95      0.94      0.94       999
weighted avg       0.95      0.94      0.94       999



In [None]:
#made with ❤️ by Ayush Mishra