## Building a model to perform sentiment analysis on tweets 

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("tweets.csv")

In [3]:
df.head()

Unnamed: 0,ID,GAME,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
df.isnull().sum()

ID             0
GAME           0
Sentiment      0
Tweet        858
dtype: int64

In [5]:
df["Sentiment"].value_counts()

Sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [6]:
## filling the missing values
df = df.fillna("")

In [7]:
df.isnull().sum()

ID           0
GAME         0
Sentiment    0
Tweet        0
dtype: int64

In [8]:
df["Sentiment"] = df["Sentiment"].map({
    "Positive": 0,
    "Negative": 1,
    "Neutral": 2,
    "Irrelevant": 3
})

In [9]:
df.head()

Unnamed: 0,ID,GAME,Sentiment,Tweet
0,2401,Borderlands,0,im getting on borderlands and i will murder yo...
1,2401,Borderlands,0,I am coming to the borders and I will kill you...
2,2401,Borderlands,0,im getting on borderlands and i will kill you ...
3,2401,Borderlands,0,im coming on borderlands and i will murder you...
4,2401,Borderlands,0,im getting on borderlands 2 and i will murder ...


In [10]:
df = df.drop(['ID',"GAME"],axis = 1)

In [11]:
df.head()

Unnamed: 0,Sentiment,Tweet
0,0,im getting on borderlands and i will murder yo...
1,0,I am coming to the borders and I will kill you...
2,0,im getting on borderlands and i will kill you ...
3,0,im coming on borderlands and i will murder you...
4,0,im getting on borderlands 2 and i will murder ...


In [12]:
## Getting all the stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayushmalaviya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
port_stem = PorterStemmer()

In [15]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)## will only consider alphabets no numbers all numbers commas etc replaced by space
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]## performs stemming for the words not in stopwords
    stemmed_content = ' '.join(stemmed_content)##joins the words with a space in it
    return stemmed_content

In [16]:
df["Tweet"] = df["Tweet"].apply(stemming)


In [17]:
Y = df["Sentiment"]
np.unique(Y)

array([0, 1, 2, 3])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df["Tweet"])
X = vectorizer.transform(df["Tweet"])


In [20]:
X_train,X_test,Y_train,Y_test =  train_test_split(X,Y,test_size = 0.2,random_state = 3,stratify = Y)

In [21]:
from xgboost import XGBClassifier

In [22]:
model = XGBClassifier(n_estimators = 128,objective="multi:softmax",num_class = len(np.unique(Y)))

In [23]:
model.fit(X_train,Y_train)

In [24]:
pred_y_xg = model.predict(X_test)
print(accuracy_score(pred_y_xg,Y_test))

0.6693445805717346


In [25]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
model2 = RandomForestClassifier(n_estimators = 128)
model2.fit(X_train,Y_train)

In [31]:
pred_y_rf = model2.predict(X_test)
print(accuracy_score(pred_y_rf,Y_test))

0.9120974760661444


## Accuracy score for the training set is 91.20%

In [32]:
pred_y_rf[0:10]

array([1, 2, 2, 2, 3, 0, 1, 3, 0, 0])

In [33]:
Y_test[0:10]

25685    1
64883    1
24890    2
6990     2
48385    3
63407    0
31252    1
69546    3
27169    0
71916    0
Name: Sentiment, dtype: int64

In [34]:
pred_y_rf_train = model2.predict(X_train)
print(accuracy_score(pred_y_rf_train,Y_train))

0.9667587245794628
