<a href="https://colab.research.google.com/github/2003Yash/twitter-sentiment/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create Environment

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy as tw

Get Twitter Tokens and Keys

In [None]:
# Credentials come from the Twitter developer account (Projects / Apps -> Keys and tokens).
# They are read from environment variables so real secrets are never saved inside the
# notebook; '-' stays as the placeholder value when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '-')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '-')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '-')

Authenticate with Twitter

In [None]:
# Build the OAuth handler from the app's consumer key and secret
auth = tw.OAuthHandler(consumer_key, consumer_secret)
# Attach the user's access token and secret to the handler
auth.set_access_token(access_token, access_token_secret)
# Create the API client used by the later cells
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait out rate limits
# instead of raising -- confirm against the installed tweepy version
api = tw.API(auth, wait_on_rate_limit=True)

Get Tweets From Twitter

In [None]:
# NOTE: Twitter has since removed this permission for free developer accounts,
# so the search below may be rejected depending on the account's access level.

hashtag = "#PresidentialDebate"
MAX_TWEETS = 1000  # upper bound on how many tweets are pulled for the hashtag

# Cursor pages through the search results; .items() caps the total at MAX_TWEETS
query = tw.Cursor(api.search_tweets, q=hashtag).items(MAX_TWEETS)
# Keep only the fields used downstream: the tweet text and its creation time
tweets = [{'Tweet': tweet.text, 'Timestamp': tweet.created_at} for tweet in query]

# Report the count and preview a handful of records instead of dumping every
# tweet into the cell output
print(f"Fetched {len(tweets)} tweets")
tweets[:5]

In [None]:
# Create a DataFrame from the list of tweet records.
# The columns are named explicitly so that 'Tweet' and 'Timestamp' exist even when
# the search returned no tweets; otherwise later cells fail with a KeyError.
df = pd.DataFrame(tweets, columns=['Tweet', 'Timestamp'])
df.head()

In [None]:
# (number of rows, number of columns) of the tweets DataFrame
df.shape

In [None]:
# Reference keyword lists: a tweet containing any of these strings is treated as
# being about the corresponding candidate
trump_handle = ["DonaldTrump", "Donald Trump", "Donald", "Trump", "Trump's"]
biden_handle = ["JoeBiden", "Joe Biden", "Joe", "Biden", "Biden's"]

Flag each tweet with two new indicator columns (`Trump` and `Biden`) showing which candidate it mentions


In [None]:
def identify_subject(tweet, refs):
    """Flag whether a tweet mentions any of the reference keywords.

    Parameters
    ----------
    tweet : str
        Text to search.
    refs : iterable of str
        Keywords to look for; matching is a case-sensitive substring test.

    Returns
    -------
    int
        1 if at least one keyword occurs in ``tweet``, otherwise 0.
    """
    return int(any(keyword in tweet for keyword in refs))

# Add one indicator column per candidate: each tweet is passed to identify_subject
# together with that candidate's keyword list, giving 1 (mentioned) or 0 (not mentioned)
df['Trump'] = df['Tweet'].apply(identify_subject, args=(trump_handle,))
df['Biden'] = df['Tweet'].apply(identify_subject, args=(biden_handle,))

In [None]:
# Preview the first 10 rows, now including the Trump / Biden indicator columns
df.head(10)

Pre-Process using NLTK

In [None]:
# Import stopwords
import nltk
from nltk.corpus import stopwords

# Import textblob
from textblob import Word, TextBlob #helps in sentiment analysis

In [None]:
# Download the NLTK resources this notebook relies on
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop words shipped with NLTK
stop_words = stopwords.words('english')
# Extra dataset-specific tokens that are also removed from the tweets
custom_stopwords = ['RT', '#PresidentialDebate']

In [None]:
def preprocess_tweets(tweet, custom_stopwords):
    """Clean a tweet: drop stop words, strip punctuation, lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    custom_stopwords : collection of str
        Extra tokens to remove in addition to the module-level ``stop_words``.
        Matching is exact and case-sensitive.

    Returns
    -------
    str
        The remaining words, lemmatized and joined by single spaces.
    """
    def _drop_stopwords(words):
        # Remove both the general stop words and the caller-supplied ones
        return [word for word in words
                if word not in stop_words and word not in custom_stopwords]

    # First pass on the raw tokens, so entries that contain punctuation themselves
    # (e.g. '#PresidentialDebate') still match before punctuation is stripped.
    tokens = _drop_stopwords(tweet.split())

    # Strip everything that is not a word character or whitespace. The previous
    # str.replace call treated the pattern as literal text and discarded its
    # result, so punctuation was never actually removed.
    stripped = re.sub(r'[^\w\s]', '', ' '.join(tokens))

    # Second pass: tokens such as "the," only equal a stop word once the
    # punctuation is gone.
    tokens = _drop_stopwords(stripped.split())

    # Lemmatization
    return ' '.join(Word(word).lemmatize() for word in tokens)

# Clean every tweet with preprocess_tweets and store the result in a new column
df['Processed Tweet'] = df['Tweet'].apply(preprocess_tweets, args=(custom_stopwords,))

In [None]:
# Preview the frame with the new 'Processed Tweet' column
df.head()

In [None]:
# Compare the first tweet (row label 0) before and after preprocessing
print('Base review\n', df.loc[0, 'Tweet'])
print('\n------------------------------------\n')
print('Cleaned and lemmatized review\n', df.loc[0, 'Processed Tweet'])

SENTIMENT ANALYSIS

In [None]:
# Score each processed tweet once with TextBlob and reuse the result for both columns
# (previously a TextBlob was built and scored twice per tweet).
sentiments = [TextBlob(text).sentiment for text in df['Processed Tweet']]

# sentiment[0] is the polarity: whether the text reads as positive, negative or neutral
df['polarity'] = [sentiment[0] for sentiment in sentiments]
# sentiment[1] is the subjectivity: how much of the text is personal opinion rather than
# factual information; a higher score means more opinion
df['subjectivity'] = [sentiment[1] for sentiment in sentiments]

# Show only the columns of interest
df[['Processed Tweet', 'Biden', 'Trump', 'polarity', 'subjectivity']].head()

In [None]:
# Sentiment statistics (polarity and subjectivity) for the tweets mentioning each candidate.
# Aggregations are given by name rather than as NumPy callables: pandas deprecates passing
# np.mean / np.max / ... to agg, and the names give stable column labels ('max', not 'amax').
SENTIMENT_STATS = ['mean', 'max', 'min', 'median']

for candidate in ('Trump', 'Biden'):
    # Keep only the tweets flagged for this candidate
    mentions = df.loc[df[candidate] == 1, [candidate, 'polarity', 'subjectivity']]
    display(mentions.groupby(candidate).agg(SENTIMENT_STATS))

VISUALIZE THE SENTIMENT

In [None]:
def _polarity_over_time(tweets_df, candidate, window=10, min_periods=3):
    """Return the time-ordered polarity of one candidate's tweets with a moving average.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Frame with 'Timestamp', 'polarity' and the ``candidate`` indicator column.
    candidate : str
        Name of the indicator column; rows where it equals 1 are kept.
    window : int
        Number of consecutive tweets averaged together.
    min_periods : int
        Minimum number of tweets in the window before a value is produced
        (earlier rows get NaN).

    Returns
    -------
    pandas.DataFrame
        Columns 'Timestamp', 'polarity' and 'MA Polarity', sorted by 'Timestamp'.
    """
    subset = (
        tweets_df.loc[tweets_df[candidate] == 1, ['Timestamp', 'polarity']]
        .sort_values(by='Timestamp', ascending=True)
    )
    moving_avg = subset['polarity'].rolling(window, min_periods=min_periods).mean()
    return subset.assign(**{'MA Polarity': moving_avg})


# Moving average of polarity over 10 tweets for each candidate
biden = _polarity_over_time(df, 'Biden')
trump = _polarity_over_time(df, 'Trump')

In [None]:
# Preview the Trump frame; the first rows of 'MA Polarity' are NaN until
# min_periods (3) tweets are available in the rolling window
trump.head()

In [None]:
# Line colours for the two candidates
repub = 'red'
demo = 'blue'
fig, axes = plt.subplots(2, 1, figsize=(13, 10))

# Plot the moving-average polarity (10 tweets) of each candidate in its own panel,
# using the colours declared above (they were previously defined but unused)
axes[0].plot(biden['Timestamp'], biden['MA Polarity'], color=demo)
axes[0].set_title("Biden Polarity")
axes[1].plot(trump['Timestamp'], trump['MA Polarity'], color=repub)
axes[1].set_title("Trump Polarity")

# Label the axes so each panel can be read on its own
for ax in axes:
    ax.set_xlabel("Timestamp")
    ax.set_ylabel("MA Polarity")

fig.suptitle("Presidential Debate Analysis", y=0.98)

plt.show()