# Sentiment Analysis
2017-2022

In [1]:
import pandas as pd 
import numpy as np
import re 
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS 
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# One tweets CSV per "YYYY_YY" period, listed in chronological order.
tweet_files = [
    "2016_17 tweets.csv",
    "2017_18 tweets.csv",
    "2018_19 tweets.csv",
    "2019_20 tweets.csv",
    "2020_21 tweets.csv",
    "2021_22 tweets.csv",
]
df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in tweet_files)

# Stack the six frames into one and renumber the rows from zero.
df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Parse the string timestamps and keep only the calendar date.
df["created_at"] = pd.to_datetime(df["created_at"]).dt.date
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr..."


In [3]:
# (rows, columns) of the combined frame
df.shape

(11979, 6)

In [4]:
# clean a tweet for sentiment scoring (stemming is not applied)
def preprocess_text(text):
    """Normalize one tweet: lowercase, drop URLs and punctuation, remove stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Digits are kept and
        no stemming is applied.
    """
    # get lowercase
    text = text.lower()
    # remove urls wherever they occur in the tweet. This must run before any
    # punctuation is stripped (a URL without its dots and slashes can no
    # longer be recognised), and it consumes only the URL itself rather than
    # everything that follows it on the line.
    text = re.sub(r'https?://\S+', '', text)
    # delete quotes, ampersands and periods outright (no space is inserted)
    for char in ('"', "'", "&", "."):
        text = text.replace(char, "")
    # underscores become spaces so joined_words split into separate tokens
    text = text.replace("_", " ")
    # remove all remaining punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # strip whitespace
    text = text.strip()
    # remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return " ".join(w for w in tokens if w not in stop_words)

In [5]:
# create a new column for processed tweets
# Clean every raw tweet and store the result alongside the original text
df['processed_tweet'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...


In [6]:
def get_polarity(tweet):
    """Return the TextBlob sentiment polarity of `tweet`."""
    return TextBlob(tweet).sentiment.polarity

In [7]:
# TextBlob polarity of each cleaned tweet
df['polarity'] = df['processed_tweet'].apply(get_polarity)
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet,polarity
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry,0.0
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...,-0.4
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers,0.0
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season,0.8
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...,0.0


In [8]:
# t = [1 if p >0 else 0 for p in df['polarity']]
# s = [abs(p)*100+1 for p in df['polarity']]
# plt.figure(figsize=(25, 6))
# plt.scatter('created_at', 'polarity', data=df, c = t, s = s, alpha=0.7, cmap='jet')
# plt.xticks(df['created_at'])
# plt.xlabel('Week')
# plt.ylabel('Polarity')
# plt.show()

## VADER (Valence Aware Dictionary and Sentiment Reasoner)

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
# create object
SIA = SentimentIntensityAnalyzer()

# add col to df
df['vaderPolarity'] = df.processed_tweet.apply(lambda x: SIA.polarity_scores(x)['compound'])

In [11]:
# Preview the frame, now including the vaderPolarity column
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet,polarity,vaderPolarity
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry,0.0,-0.4767
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...,-0.4,0.2023
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers,0.0,0.0
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season,0.8,0.5859
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...,0.0,0.0


## Flair pre-trained embedding-based model

In [12]:
from flair.models import TextClassifier
from flair.data import Sentence

In [13]:
# Load the pre-trained 'en-sentiment' classifier once; flair_prediction reads this global
flr = TextClassifier.load('en-sentiment')

def flair_prediction(x):
    """Score `x` with the global `flr` classifier and return a signed confidence.

    The score of the first predicted label is returned as-is, except when
    that label's value is "NEGATIVE", in which case it is negated.
    """
    sentence = Sentence(x)
    flr.predict(sentence)  # attaches the predicted labels to `sentence`
    top_label = sentence.labels[0]
    return -top_label.score if top_label.value == "NEGATIVE" else top_label.score

2022-11-18 13:21:53,993 loading file /home/bdnassif/.flair/models/sentiment-en-mix-distillbert_4.pt


In [14]:
%%time
#subset = df.iloc[50:75, :].copy()

# WARNING: takes about 37 minutes to run
# NOTE(review): the assignment below is commented out, so running this cell as-is
# does not create df['flair']; the comparison loop later in the notebook reads
# that column and needs it to exist.
#df['flair'] = df.processed_tweet.apply(lambda x: flair_prediction(x))

CPU times: user 4h 50min 3s, sys: 4min 46s, total: 4h 54min 50s
Wall time: 36min 52s


In [15]:
# Preview the frame; a 'flair' column is present only if the scoring line in the
# previous cell was run with its assignment enabled
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet,polarity,vaderPolarity,flair
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry,0.0,-0.4767,-0.989227
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...,-0.4,0.2023,-0.953636
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers,0.0,0.0,0.995221
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season,0.8,0.5859,0.999352
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...,0.0,0.0,-0.88653


## BERTweet-base sentiment model (from Hugging Face; results stored as `roberta`)
https://huggingface.co/blog/sentiment-analysis-python

In [4]:
from transformers import pipeline

In [5]:
# Build a Hugging Face pipeline from the model id alone; no task argument is passed here
model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

In [31]:
%%time

data = df.processed_tweet.tolist()

# WARNING: will take about 42 minutes to run
roberta = model(data)

CPU times: user 6h 40min 9s, sys: 4min 25s, total: 6h 44min 34s
Wall time: 50min 35s


In [32]:
# Keep only the predicted class name from each pipeline result
# (the recorded output shows the values NEG / NEU / POS).
rob_sentiment = [twt['label'] for twt in roberta]

# Assigning the list creates (or overwrites) the column directly, so no
# placeholder column has to be set up beforehand.
df['roberta'] = rob_sentiment

In [33]:
# NOTE(review): the recorded output lists an extra "(0, roberta)" column that no
# visible cell creates — presumably left over from earlier kernel state; confirm
# before relying on it.
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet,polarity,vaderPolarity,flair,roberta,"(0, roberta)"
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry,0.0,-0.4767,-0.989227,NEG,NEG
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...,-0.4,0.2023,-0.953636,NEG,NEG
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers,0.0,0.0,0.995221,NEU,NEG
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season,0.8,0.5859,0.999352,POS,NEG
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...,0.0,0.0,-0.88653,NEU,NEG



# Compare sentiment libraries

In [30]:
# Print a side-by-side readout of the four sentiment scores for each tweet.
# NOTE: this walks the entire frame, so the output has one readout per row of df.
divider = "-------"
for _, row in df.iterrows():
    readout = (
        divider,
        str(row['text']),
        f"TextBlob: {row['polarity']}",
        f"Vader: {row['vaderPolarity']}",
        f"Flair: {row['flair']}",
        f"Roberta: {row['roberta']}",
        divider,
    )
    print("\n".join(readout))

-------
If Russell Westbrook don't get MVP this season imma cry
TextBlob: 0.0
Vader: -0.4767
Flair: -0.9892271757125854
Roberta: NEG
-------
-------
Nothing about Russell Westbrook game changed. He has been playing this for years. Bandwagon'rs!!!
TextBlob: -0.4
Vader: 0.2023
Flair: -0.9536364674568176
Roberta: NEG
-------
-------
Russell Westbrook and Damian Lillard are ballers
TextBlob: 0.0
Vader: 0.0
Flair: 0.9952210783958435
Roberta: NEU
-------
-------
Will Russell Westbrook win MVP this season?
TextBlob: 0.8
Vader: 0.5859
Flair: 0.9993522763252258
Roberta: POS
-------
-------
Russell Westbrook for MVP, homecoming king, president, and whatever else
TextBlob: 0.0
Vader: 0.0
Flair: -0.8865303993225098
Roberta: NEU
-------
-------
After a historic start of the season for Russell Westbrook, was Kevin Durant the one holding him back this whole time?
TextBlob: 0.06666666666666667
Vader: 0.0
Flair: 0.9941884279251099
Roberta: NEU
-------
-------
Russell Westbrook = MVP ???
TextBlob: 0.0
V

## Export Data to Excel for Graphing in PowerPoint

In [7]:
# File name for the Excel export
exportName = "sentiment_analysis.xlsx"

# Export is disabled; uncomment to (re)write the workbook from the current df
#df.to_excel(exportName, index=False)

In [8]:
# pandas is imported again here — presumably so this cell can run from a fresh kernel; TODO confirm
import pandas as pd
# Replaces the in-memory df with the saved workbook. The to_excel call that writes
# this file is commented out in the export cell, so the file must already exist on disk.
df = pd.read_excel("sentiment_analysis.xlsx")

In [9]:
# Preview the frame as loaded back from the Excel workbook
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text,processed_tweet,polarity,vaderPolarity,flair,roberta
0,rxjjy96,,,False,2016-10-31,If Russell Westbrook don't get MVP this season...,russell westbrook dont get mvp season imma cry,0.0,-0.4767,-0.989227,NEG
1,duvaljr,,,False,2016-10-31,Nothing about Russell Westbrook game changed. ...,nothing russell westbrook game changed playing...,-0.4,0.2023,-0.953636,NEG
2,prince_nueve,,,False,2016-10-31,Russell Westbrook and Damian Lillard are ballers,russell westbrook damian lillard ballers,0.0,0.0,0.995221,NEU
3,MontezzAllen313,,Chicago,False,2016-10-31,Will Russell Westbrook win MVP this season?,russell westbrook win mvp season,0.8,0.5859,0.999352,POS
4,tjvrd13,,,False,2016-10-31,"Russell Westbrook for MVP, homecoming king, pr...",russell westbrook mvp homecoming king presiden...,0.0,0.0,-0.88653,NEU
