In [1]:
# Twitter Sentiment Analysis

In [60]:
# Python Imports

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

In [11]:
# Load the dataset

df = pd.read_csv("Sentiment.csv")

In [12]:
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [13]:
# Lets only take text and sentiment

df = df[["text", "sentiment"]]

In [14]:
df.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [15]:
## Data Cleaning

In [17]:
df.text[0]

'RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate'

In [100]:

lemmatizer = WordNetLemmatizer()

def clean_text(text):

    # lower
    text = text.lower()

    # remove \n
    text = " ".join(text.split("\n"))

    # remove url
    text = re.sub(r'https?:\S*', '', text)

    # remove twitter mentions accounts (@users)
    text = re.sub(r"@\S*", '', text)

    # remove non alpha words to preserve the !?
    text = re.sub(r"[^A-Za-z!?]", ' ', text)

    # remove extra spaces
    text = re.sub(r"\s+", " ", text)

    # remove stopwords
    custom_preseve_stopwords = ["no", "not", "never"]
    tokens = [ x for x in text.split(" ") if (x not in stopwords.words("english") or x in custom_preseve_stopwords) and len(x)>=2]

    # apply lemmatization
    tokens = [lemmatizer.lemmatize(x) for x in tokens]

    return " ".join(tokens)

In [101]:
clean_text(df.text[0]+"  no not never")

'rt everyone feel climate change question last night? exactly gopdebate no not never'

In [102]:
df.shape

(13871, 3)

In [103]:
df["clean_text"] = df["text"].apply(clean_text)

In [104]:
df

Unnamed: 0,text,sentiment,clean_text
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral,rt everyone feel climate change question last ...
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive,rt catch full gopdebate last night scott best ...
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral,rt no mention tamir rice gopdebate held clevel...
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive,rt carly fiorina trending hour debate men comp...
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive,rt gopdebate delivered highest rating history ...
...,...,...,...
13866,RT @cappy_yarbrough: Love to see men who will ...,Negative,rt love see men never faced pregnancy talk bod...
13867,RT @georgehenryw: Who thought Huckabee exceede...,Positive,rt thought huckabee exceeded expectation gopde...
13868,"RT @Lrihendry: #TedCruz As President, I will a...",Positive,rt tedcruz president always tell truth said wo...
13869,RT @JRehling: #GOPDebate Donald Trump says tha...,Negative,rt gopdebate donald trump say time political c...


In [105]:
# Split the dataset into training and test set
X = df["clean_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [106]:
# Apply label encoder for sentiment.
encoder = LabelEncoder()


y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [107]:
# Apply tfidf vecotrier

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words='english')

X_train_ecoded =vectorizer.fit_transform(X_train)
X_test_encoded = vectorizer.transform(X_test)

In [108]:
vectorizer.get_feature_names_out()[:10]

array(['aaaand', 'abandon', 'abc', 'able', 'abortion',
       'abortion gopdebate', 'abraham', 'abraham lincoln', 'absolutely',
       'absolutely fearful'], dtype=object)

In [109]:
print("Vocabulary length:", len(vectorizer.get_feature_names_out()))

Vocabulary length: 5000


In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.metrics import accuracy_score, classification_report

In [115]:
model_params = {
    "LogisticRegression":LogisticRegression(),
    "SVC":SVC(),
    "MultiNomiial MNB": MultinomialNB(),
    "BernoulliNB":BernoulliNB()
}


In [116]:

for model_name, model in model_params.items():
    print("="*30)
    print("Name : ", model_name)

    model.fit(X_train_ecoded, y_train_encoded)
    y_pred = model.predict(X_test_encoded)

    ascore = accuracy_score(y_test_encoded, y_pred)
    print("Accuracy Score = ", ascore)
    print("\nClassification Report:\n")
    print(classification_report(y_test_encoded, y_pred, target_names=encoder.classes_))

Name :  LogisticRegression
Accuracy Score =  0.6954954954954955

Classification Report:

              precision    recall  f1-score   support

    Negative       0.74      0.90      0.81      1770
     Neutral       0.50      0.27      0.35       598
    Positive       0.61      0.44      0.51       407

    accuracy                           0.70      2775
   macro avg       0.62      0.54      0.56      2775
weighted avg       0.67      0.70      0.67      2775

Name :  SVC
Accuracy Score =  0.6969369369369369

Classification Report:

              precision    recall  f1-score   support

    Negative       0.72      0.92      0.81      1770
     Neutral       0.55      0.24      0.33       598
    Positive       0.66      0.40      0.50       407

    accuracy                           0.70      2775
   macro avg       0.64      0.52      0.55      2775
weighted avg       0.67      0.70      0.66      2775

Name :  MultiNomiial MNB
Accuracy Score =  0.6864864864864865

Classificati