# Analyzing Joe Biden's and Donald Trump's tweets during the coronavirus pandemic.

# Table of Contents

* [1. Importing Packages](#importing_packages)
* [2. Loading the Data](#load_data)
* [3. Data Cleaning and Preparation](#data_clean)
* [4. Exploring the Data: Aggregate and Frequency statistics](#explore_data)
* [5. Sentiment Analysis](#sentiment_analysis)
* [6. Topic Modeling](#topic_modeling)
* [7. Classification](#classification)

# 1. Importing packages <a class="anchor" id="importing_packages">

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import collections
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk import everygrams
from nltk import ngrams
from nltk import FreqDist
from nltk.tokenize.treebank import TreebankWordDetokenizer
from textblob import TextBlob
from wordcloud import WordCloud
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
pd.options.mode.chained_assignment = None
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 2. Load Data<a class="anchor" id="load_data">

In [9]:
trump_df = pd.read_csv("trump.csv")
biden_df = pd.read_csv("biden.csv")

In [10]:
trump_df

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,RT @USNoodlesA: 🔥This amazing American @TheLeo...,08-09-2020 02:30:54,12113,0,True,1292287207533936645
1,RT @drawandstrike: ADDENDUM:This is 100% corre...,08-09-2020 02:26:15,8666,0,True,1292286038518280192
2,RT @hale_razor: Just think of it as DACA for A...,08-09-2020 02:24:25,9198,0,True,1292285575454691330
3,RT @KurtSchlichter: Why is the DACA executive ...,08-09-2020 02:21:58,6692,0,True,1292284959533793282
4,RT @marklevinshow: 1. President Trump had no c...,08-09-2020 02:21:32,19090,0,True,1292284850125316096
...,...,...,...,...,...,...
7028,I will be signing our very large and comprehen...,12-31-2019 14:16:40,22730,95208,False,1212014713808273410
7029,President Putin of Russia called to thank me a...,12-31-2019 14:06:09,29025,141918,False,1212012065440894976
7030,Armed congregants quickly stopped a crazed chu...,12-31-2019 13:53:10,25719,114701,False,1212008798849814528
7031,Iran killed an American contractor wounding ma...,12-31-2019 12:02:47,37287,150318,False,1211981022084128768


In [11]:
biden_df

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,We can’t let Donald Trump destroy the U.S. Pos...,2020-08-09 19:15:00,10158,54082,False,1292539896465416192
1,It's been six years since Michael Brown's life...,2020-08-09 17:00:00,6281,34913,False,1292505925685633025
2,Our planet can’t take four more years of Donal...,2020-08-09 15:00:02,17283,102639,False,1292475731172261894
3,RT @TeamJoe: We seriously can’t wait to see wh...,2020-08-09 14:24:10,797,0,True,1292466709006344193
4,Giving the Trump administration another four y...,2020-08-09 13:00:00,10095,40664,False,1292445526005252096
...,...,...,...,...,...,...
3195,RT @TeamJoe: Let's clear up the confusion abou...,2019-07-31 13:22:52,248,0,True,1156555830222831617
3196,RT @TeamJoe: A medida que los ricos se hacen m...,2019-07-31 02:13:18,54,0,True,1156387329663258624
3197,"RT @TeamJoe: As the rich get richer, middle-cl...",2019-07-31 02:13:16,165,0,True,1156387322587488257
3198,RT @TeamJoe: President Trump doesn’t get the b...,2019-07-31 01:57:35,215,0,True,1156383375030214657


# 3. Data Cleaning and Preparation<a class="anchor" id="data_clean">

### Functions for text cleaning

In [12]:
# Stop-word list read by remove_stopwords(): NLTK's English list plus custom additions.
custom_stopwords = ['amp', 'youre', 'dont','wont', 'got']
stop_words = stopwords.words('english') + custom_stopwords

def detect_language(X):
    """Return the language code detected for ``X``, or ``"other"`` on failure.

    Args:
        X: Text to classify (a tweet).

    Returns:
        Whatever ``langdetect.detect`` returns for ``X`` (the caller compares it
        against ``'en'``), or the string ``"other"`` when detection raises.
    """
    from langdetect import detect
    try:
        return detect(X)
    except Exception:
        # Best effort: any detection error maps to "other". Catching Exception
        # rather than using a bare except lets Ctrl-C / SystemExit through.
        return "other"
    
# Compiled once at definition time; the function runs once per tweet.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_PUNCT_PATTERN = re.compile(r'[^\w\s]')


def remove_url_punctuation(X):
    """Strip URLs and punctuation from a text and lower-case it.

    Args:
        X: Value to clean. It is passed through ``str()`` first, so non-string
            values (for example NaN) become their string form.

    Returns:
        The lower-cased text with ``http://``, ``https://`` and ``www.`` URLs
        removed, every character that is neither a word character nor
        whitespace removed (this drops the ``#`` of hashtags), and newlines
        and tabs replaced by single spaces.
    """
    without_urls = _URL_PATTERN.sub('', str(X))
    no_punct = _PUNCT_PATTERN.sub('', without_urls).lower()
    return no_punct.replace('\n', ' ').replace('\t', ' ')

def split_words(X):
    """Split a cleaned tweet into a list of words.

    Splits on any run of whitespace, so repeated spaces (left behind when a
    URL is removed, for example) do not produce empty-string tokens.

    Args:
        X: The text to split.

    Returns:
        A list of the whitespace-separated words; empty for blank input.
    """
    return X.split()

def remove_stopwords(X):
    """Return the tokens of ``X`` that pass the stop-word filter.

    A token is kept when it is not in the module-level ``stop_words`` list,
    is longer than two characters, and is not the literal string 'nan'.
    """
    return [
        token
        for token in X
        if token not in stop_words and len(token) > 2 and token != 'nan'
    ]

def basic_processing(text):
    """Run the whole cleaning pipeline on one raw text.

    Chains remove_url_punctuation -> split_words -> remove_stopwords and
    returns the resulting token list.
    """
    return remove_stopwords(split_words(remove_url_punctuation(text)))

### Get Biden tweets since the emergence of [COVID19](https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200121-sitrep-1-2019-ncov.pdf). Trump dataset already filtered by start date. 

In [13]:
# ISO literal: '31-12-2019' only parsed correctly because pandas guessed day-first order.
START_DATE = pd.Timestamp('2019-12-31')
biden_df['created_at'] = pd.to_datetime(biden_df['created_at'])
# The Trump rows shown above use MM-DD-YYYY HH:MM:SS, so state the format
# explicitly rather than relying on inference.
# NOTE(review): assumes every row uses this format — confirm against the CSV.
trump_df['created_at'] = pd.to_datetime(trump_df['created_at'], format='%m-%d-%Y %H:%M:%S')
biden_df = biden_df.loc[biden_df['created_at'] >= START_DATE]

### Remove retweets

In [15]:
# The is_retweet column is not used: the trumptwitterarchive data is labelled
# incorrectly. Testing whether the text starts with 'RT ' is more reliable.
trump_is_rt = trump_df.text.str.startswith('RT ', na=False)
biden_is_rt = biden_df.text.str.startswith('RT ', na=False)
trump_retweets = trump_df[trump_is_rt]
biden_retweets = biden_df[biden_is_rt]

trump_rt_share = round(len(trump_retweets)/len(trump_df) * 100, 2)
biden_rt_share = round(len(biden_retweets)/len(biden_df) * 100,2)
print('Trump has', len(trump_retweets), 'retweets.', trump_rt_share, '% of his tweets are retweets.')
print('Biden has', len(biden_retweets), 'retweets.', biden_rt_share, '% of his tweets are retweets.')

Trump has 3843 retweets. 54.64 % of his tweets are retweets.
Biden has 73 retweets. 3.79 % of his tweets are retweets.


In [8]:
# Keep only original tweets. Rows with missing text are treated as retweets
# (na=True) and therefore dropped as well.
trump_df = trump_df[~trump_df.text.str.startswith('RT ', na=True)]
biden_df = biden_df[~biden_df.text.str.startswith('RT ', na=True)]

### Remove non-English tweets

In [None]:
# Store the detected language of every tweet in column 'en', then keep the
# rows whose detected language is 'en'.
trump_df['en'] = trump_df['text'].map(detect_language)
biden_df['en'] = biden_df['text'].map(detect_language)

trump_df = trump_df.loc[trump_df['en'].eq('en')]
biden_df = biden_df.loc[biden_df['en'].eq('en')]

### Remove punctuation, special characters, and hashtags in tweets

In [None]:
trump_df['clean_text'] = trump_df['text'].apply(remove_url_punctuation)
biden_df['clean_text'] = biden_df['text'].apply(remove_url_punctuation)
print(trump_df['text'].head())
print('-------------------------------------')
print(trump_df['clean_text'].head())

### Tokenize Tweets

In [None]:
trump_df['tokens'] = trump_df['clean_text'].apply(split_words)
biden_df['tokens'] = biden_df['clean_text'].apply(split_words)
print(trump_df['clean_text'].head())
print('-------------------------------------')
print(trump_df['tokens'].head())

### Remove Stop Words with NLTK

In [None]:
trump_df['tokens'] = trump_df['tokens'].apply(remove_stopwords)
biden_df['tokens'] = biden_df['tokens'].apply(remove_stopwords)
print(trump_df['tokens'].head())

In [None]:
trump_df

In [None]:
biden_df

# 4. Exploring the Data: Aggregate and Frequency Statistics<a class="anchor" id="explore_data">

In [None]:
trump_df.describe()

In [None]:
biden_df.describe()

## How engaging are their tweets?: Viewing the average number of retweets and favorites. 

#### Trump

In [None]:
# Select the column before averaging: DataFrame.mean() on the whole frame also
# tries to average the text/token columns, which raises on pandas >= 2.0.
trump_rt_avg = trump_df['retweet_count'].mean()
print("Trump retweet average:", trump_rt_avg)

In [None]:
# Select the column before averaging: DataFrame.mean() on the whole frame also
# tries to average the text/token columns, which raises on pandas >= 2.0.
trump_favorite_avg = trump_df['favorite_count'].mean()
print("Trump favorite average:", trump_favorite_avg)

In [None]:
trump_most_rts = trump_df.loc[trump_df['retweet_count'].idxmax()].text
print("Trump's most retweeted and favorited tweet:", trump_most_rts)

#### Biden

In [None]:
# Select the column before averaging: DataFrame.mean() on the whole frame also
# tries to average the text/token columns, which raises on pandas >= 2.0.
biden_rt_avg = biden_df['retweet_count'].mean()
print("Biden retweet average:", biden_rt_avg)

In [None]:
# Select the column before averaging: DataFrame.mean() on the whole frame also
# tries to average the text/token columns, which raises on pandas >= 2.0.
biden_favorite_avg = biden_df['favorite_count'].mean()
print("Biden favorite average:", biden_favorite_avg)

In [None]:
biden_most_rts = biden_df.loc[biden_df['retweet_count'].idxmax()].text
print("Biden's most retweeted and favorited tweet:", biden_most_rts)

In [None]:
# Calculate how much more retweets and favorites trump gets on average than Biden
print("Trump gets", round(trump_rt_avg / biden_rt_avg, 2), "times more retweets than Biden." )
print("That's an increase of", round(100 * (trump_rt_avg - biden_rt_avg) / biden_rt_avg, 2) ,"%")

In [None]:
print("Trump gets", round(trump_favorite_avg / biden_favorite_avg, 2), "times more favorites than Biden." )
print("That's an increase of", round(100 * (trump_favorite_avg - biden_favorite_avg) / biden_favorite_avg, 2) ,"%")

## It looks like Trump gets many more retweets and favorites than Biden. Let's use box plots to visualize the differences more clearly.

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed later; use whichever name this installation provides.
notebook_style = 'seaborn-v0_8-notebook' if 'seaborn-v0_8-notebook' in plt.style.available else 'seaborn-notebook'
plt.style.use(notebook_style)
plt.figure(figsize=(6, 10))
plt.boxplot([trump_df['retweet_count'], biden_df['retweet_count']], autorange=False)
# Boxes are drawn at positions 1 and 2. Labelling them through xticks avoids
# boxplot's `labels=` keyword, which newer Matplotlib deprecates.
plt.xticks([1, 2], ['Trump', 'Biden'])

plt.ylabel('Retweets')
plt.title('Trump vs Biden Retweets')

In [None]:
plt.figure(figsize=(6, 10))
plt.boxplot([trump_df['favorite_count'], biden_df['favorite_count']], autorange=False)
# Boxes are drawn at positions 1 and 2. Labelling them through xticks avoids
# boxplot's `labels=` keyword, which newer Matplotlib deprecates.
plt.xticks([1, 2], ['Trump', 'Biden'])
plt.ylabel('Favorites')
plt.title('Trump vs Biden Favorites')

The box plots clearly show that on average Biden has less engagement than Trump on Twitter, except for his one outlier tweet that went viral. 

## Which words and n-grams do they use the most?

### Unigrams

In [None]:
# Flatten the per-tweet token lists into one token per row. explode() emits NaN
# for a tweet whose token list is empty, so drop those placeholders; otherwise
# NaN is counted as a token in every n-gram below.
# NOTE(review): the flattened series has no tweet boundaries, so bigrams and
# longer n-grams built from it can span two adjacent tweets.
trump_tokens = trump_df['tokens'].explode().dropna()
biden_tokens = biden_df['tokens'].explode().dropna()

trump_unigrams = FreqDist(ngrams(trump_tokens, 1))
biden_unigrams = FreqDist(ngrams(biden_tokens, 1))

In [None]:
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and the
# old name was removed later; use whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
NUM_NGRAMS = 20

# Take the NUM_NGRAMS most frequent unigrams and sort them ascending so the
# largest bar ends up at the top of the horizontal bar chart.
pd.Series(ngrams(trump_tokens, 1)).value_counts()[:NUM_NGRAMS].sort_values().plot.barh(color='orangered', width=.7, figsize=(12,8))
plt.title("Trump's Top 20 Unigrams")
plt.ylabel('Unigram')
plt.xlabel('# of occurrences')
trump_unigrams.most_common(NUM_NGRAMS)

In [None]:
# Horizontal bar chart of Biden's most frequent unigrams, followed by the same
# ranking as a table.
biden_unigram_counts = pd.Series(ngrams(biden_tokens, 1)).value_counts()
biden_top_unigrams = biden_unigram_counts[:NUM_NGRAMS].sort_values()
biden_top_unigrams.plot.barh(color='dodgerblue', width=.7, figsize=(12,8))
plt.title("Biden's Top 20 Unigrams")
plt.xlabel('# of occurrences')
plt.ylabel('Unigram')
biden_unigrams.most_common(NUM_NGRAMS)

### Bigrams

In [None]:
trump_bigrams = FreqDist(ngrams(trump_tokens, 2))
biden_bigrams = FreqDist(ngrams(biden_tokens, 2))

In [None]:
# Horizontal bar chart of Trump's most frequent bigrams, followed by the same
# ranking as a table.
trump_bigram_counts = pd.Series(ngrams(trump_tokens, 2)).value_counts()
trump_top_bigrams = trump_bigram_counts[:NUM_NGRAMS].sort_values()
trump_top_bigrams.plot.barh(color='orangered', width=.7, figsize=(12,8))
plt.title("Trump's Top 20 Bigrams")
plt.xlabel('# of occurrences')
plt.ylabel('Bigram')
trump_bigrams.most_common(NUM_NGRAMS)

In [None]:
# Horizontal bar chart of Biden's most frequent bigrams, followed by the same
# ranking as a table.
pd.Series(ngrams(biden_tokens, 2)).value_counts()[:NUM_NGRAMS].sort_values().plot.barh(color='dodgerblue', width=.7, figsize=(12,8))
plt.title("Biden's Top 20 Bigrams")  # was misspelled "Bidens's"
plt.ylabel('Bigram')
plt.xlabel('# of occurrences')
biden_bigrams.most_common(NUM_NGRAMS)

### Trigrams

In [None]:
trump_trigrams = FreqDist(ngrams(trump_tokens, 3))
biden_trigrams = FreqDist(ngrams(biden_tokens, 3))

In [None]:
# Horizontal bar chart of Trump's most frequent trigrams, followed by the same
# ranking as a table.
trump_trigram_counts = pd.Series(ngrams(trump_tokens, 3)).value_counts()
trump_top_trigrams = trump_trigram_counts[:NUM_NGRAMS].sort_values()
trump_top_trigrams.plot.barh(color='orangered', width=.7, figsize=(12,8))
plt.title("Trump's Top 20 Trigrams")
plt.xlabel('# of occurrences')
plt.ylabel('Trigram')
trump_trigrams.most_common(NUM_NGRAMS)

In [None]:
# Horizontal bar chart of Biden's most frequent trigrams, followed by the same
# ranking as a table.
biden_trigram_counts = pd.Series(ngrams(biden_tokens, 3)).value_counts()
biden_top_trigrams = biden_trigram_counts[:NUM_NGRAMS].sort_values()
biden_top_trigrams.plot.barh(color='dodgerblue', width=.7, figsize=(12,8))
plt.title("Biden's Top 20 Trigrams")
plt.xlabel('# of occurrences')
plt.ylabel('Trigram')
biden_trigrams.most_common(NUM_NGRAMS)

### 4-5 grams

In [None]:
# Chart Trump's most frequent 4- and 5-grams, then list the same ranking.
trump_4_5_counts = pd.Series(everygrams(trump_tokens, min_len= 4, max_len=5)).value_counts()
trump_top_4_5 = trump_4_5_counts[:NUM_NGRAMS].sort_values()
trump_top_4_5.plot.barh(color='orangered', width=.7, figsize=(12,8))
plt.title("Trump's Top 20 4-5grams")
plt.xlabel('# of occurrences')
plt.ylabel('4-5grams')

trump_4_5_grams = FreqDist(everygrams(trump_tokens, min_len= 4, max_len=5))
trump_4_5_grams.most_common(NUM_NGRAMS)

In [None]:
# Chart Biden's most frequent 4- and 5-grams, then list the same ranking.
pd.Series(everygrams(biden_tokens, min_len= 4, max_len=5)).value_counts()[:NUM_NGRAMS].sort_values().plot.barh(color='dodgerblue', width=.7, figsize=(12,8))
plt.title("Biden's Top 20 4-5grams")
plt.ylabel('4-5grams')  # same axis label as the Trump chart
plt.xlabel('# of occurrences')

biden_4_5_grams = FreqDist(everygrams(biden_tokens, min_len= 4, max_len=5)).most_common(NUM_NGRAMS)
# End the cell with the list so it is displayed, as in the Trump cell.
biden_4_5_grams

From the above n-grams, it seems that Biden talks a lot about gun violence, healthcare, and climate change, while Trump talks a lot about fake news, White House conferences, endorsements, and Republican Party ratings. They also both mention each other a lot. The question is, how much?

## How many times do they mention each other?

In [None]:
# Select Trump tweets whose lower-cased text contains "joe" or "biden" and
# report how many there are and what share of his tweets that is.
mentions_biden = trump_df['text'].str.lower().str.contains("joe|biden")
trump_tweets_contain_biden = trump_df[mentions_biden]
biden_mention_count = len(trump_tweets_contain_biden)
print(biden_mention_count, "Trump tweets mention Joe Biden.")
trump_mention_biden_percent = round(100 * biden_mention_count/ len(trump_df), 2)
print("That means", trump_mention_biden_percent,  "% of Trump tweets mention Joe Biden")

In [None]:
# Select Biden tweets whose lower-cased text contains "trump" or "donald" and
# report how many there are and what share of his tweets that is.
mentions_trump = biden_df['text'].str.lower().str.contains("trump|donald")
biden_tweets_contain_trump = biden_df[mentions_trump]
trump_mention_count = len(biden_tweets_contain_trump)
print(trump_mention_count, "Biden tweets mention Donald Trump.")
biden_mention_trump_percent = round(100 * trump_mention_count/ len(biden_df), 2)
print("That means", biden_mention_trump_percent,  "% of Biden tweets mention Donald Trump")

## How many times do they mention the coronavirus? 

In [None]:
# Pattern matching any coronavirus-related keyword in lower-cased tweet text.
corona_regex = "covid|corona|virus|pandemic"
biden_corona_mask = biden_df['text'].str.lower().str.contains(corona_regex)
biden_tweets_relate_corona = biden_df[biden_corona_mask]
biden_corona_count = len(biden_tweets_relate_corona)
print(biden_corona_count)
biden_tweets_relate_corona_percent = round(100 * biden_corona_count/ len(biden_df), 2)
print("That means", biden_tweets_relate_corona_percent,  "% of Biden tweets mention coronavirus")

In [None]:
# Same keyword search as for Biden, applied to Trump's tweets.
trump_corona_mask = trump_df['text'].str.lower().str.contains(corona_regex)
trump_tweets_relate_corona = trump_df[trump_corona_mask]
trump_corona_count = len(trump_tweets_relate_corona)
print(trump_corona_count)
trump_tweets_relate_corona_percent = round(100 * trump_corona_count/ len(trump_df), 2)
print("That means", trump_tweets_relate_corona_percent,  "% of Trump tweets mention coronavirus")

Biden tweets about the coronavirus nearly twice as much as Trump. Also while only ~7% of Trump tweets mention Joe Biden, nearly 31% of Biden tweets mention Trump.

# How frequently do they tweet? Has the frequency of tweets changed as the pandemic evolved?

### Trump

In [None]:
# Count Trump's tweets per calendar day and plot the daily totals.
trump_tweet_days = trump_df['created_at'].dt.date
trump_tweet_by_date = trump_df.groupby(by=trump_tweet_days)['text'].count()
plt.plot(trump_tweet_by_date.index.values, trump_tweet_by_date, color='orangered')
plt.title('Number of Trump Tweets since December 31 2019')
plt.xlabel('Date')
plt.ylabel('Tweets')
plt.show()

### Biden

In [None]:
# Count Biden's tweets per calendar day and plot the daily totals.
biden_tweet_days = biden_df['created_at'].dt.date
biden_tweet_by_date = biden_df.groupby(by=biden_tweet_days)['text'].count()
plt.plot(biden_tweet_by_date.index.values, biden_tweet_by_date, color='dodgerblue')
plt.title('Number of Biden Tweets since December 31 2019')
plt.xlabel('Date')
plt.ylabel('Tweets')
plt.show()

### Both candidates on the same figure

In [None]:
# Overlay both candidates' daily tweet counts on a single set of axes.
plt.plot(trump_tweet_by_date.index.values, trump_tweet_by_date, color='orangered', label='Trump')
plt.plot(biden_tweet_by_date.index.values, biden_tweet_by_date, color='dodgerblue', label='Biden')
plt.title('Tweets since December 31 2019 ')
plt.xlabel('Date')
plt.ylabel('Tweets')
plt.legend()
plt.show()

The graphs above show clear peaks and valleys. Future work would be to investigate the dates with more tweets than normal to see whether they are linked to news events. Moreover, the variance in the number of tweets raises the question of whether they tweet more on certain days than on others.

## Do they prefer to tweet on certain days over others?

In [None]:
# Calendar order for the x axis; groupby alone would sort the day names alphabetically.
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

trump_day_names = trump_df['created_at'].dt.day_name()
trump_tweets_by_weekday = trump_df.groupby(by=trump_day_names)['text'].count().reindex(weekdays)
plt.bar(trump_tweets_by_weekday.index.values, trump_tweets_by_weekday, color='orangered')
plt.title('Trump Tweets by Weekday')
plt.xlabel('Day')
plt.ylabel('Tweets')
plt.show()

In [None]:
# Biden's tweet count per weekday, in the calendar order given by `weekdays`.
biden_day_names = biden_df['created_at'].dt.day_name()
biden_tweets_by_weekday = biden_df.groupby(by=biden_day_names)['text'].count().reindex(weekdays)
plt.bar(biden_tweets_by_weekday.index.values, biden_tweets_by_weekday, color='dodgerblue')
plt.title('Biden Tweets by Weekday')
plt.xlabel('Day')
plt.ylabel('Tweets')
plt.show()

It seems that Biden tends to tweet most during the middle of the week and tapers off on the weekend, whereas Trump tweets fairly consistently, with a tendency to tweet more on Thursdays and Fridays.

# 5. Sentiment Analysis<a class="anchor" id="sentiment_analysis">

## Get polarity and subjectivity scores for candidate tweets

In [None]:
def get_subjectivity(text):
    """Return TextBlob's subjectivity score for ``text``.

    Subjectivity lies in [0.0, 1.0]: 0.0 is very objective, 1.0 is very
    subjective.
    """
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def get_polarity(text):
    """Return TextBlob's polarity score for ``text``, used here as the sentiment measure.

    Polarity lies in [-1.0, 1.0]: -1 is negative and +1 is positive sentiment.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
trump_df['Subjectivity'] = trump_df['clean_text'].apply(get_subjectivity)
trump_df['Polarity'] = trump_df['clean_text'].apply(get_polarity)

biden_df['Subjectivity'] = biden_df['clean_text'].apply(get_subjectivity)
biden_df['Polarity'] = biden_df['clean_text'].apply(get_polarity)

In [None]:
print(trump_df['Polarity'])
print(trump_df['Subjectivity'])

## Assign positive, neutral, and negative labels to tweets based on polarity

In [None]:
def get_sentiment(polarity):
    """Turn a polarity score into a sentiment label.

    Returns 'Positive' for scores above zero, 'Neutral' for exactly zero and
    'Negative' for everything else.
    """
    if polarity == 0:
        return 'Neutral'
    return 'Positive' if polarity > 0 else 'Negative'

In [None]:
trump_df['Sentiment'] = trump_df['Polarity'].apply(get_sentiment)
biden_df['Sentiment'] = biden_df['Polarity'].apply(get_sentiment)

In [None]:
trump_df['Sentiment']

## Most Positive and Negative Tweets

### Trump

In [None]:
# The ten Trump tweets with the highest polarity score.
trump_most_positive = trump_df.nlargest(10, 'Polarity')
# Count the frame built in this cell; the original read trump_most_negative,
# which does not exist yet when the notebook runs top to bottom.
print("Trump's top", len(trump_most_positive), "most positive tweets" )

for idx, tweet in enumerate(trump_most_positive['text']):
    print(idx, tweet)
    print()

In [None]:
trump_most_negative = trump_df.nsmallest(10, 'Polarity')
print("Trump's top", len(trump_most_negative), "most negative tweets" )

for idx, tweet in enumerate(trump_most_negative['text']):
    print(idx, tweet)
    print()

### Biden

In [None]:
biden_most_positive = biden_df.nlargest(10, 'Polarity')
print("Biden's top", len(biden_most_positive), "most positive tweets" )

for idx, tweet in enumerate(biden_most_positive['text']):
    print(idx, tweet)
    print()

In [None]:
biden_most_negative = biden_df.nsmallest(10, 'Polarity')
print("Biden's top", len(biden_most_negative), "most negative tweets" )

for idx, tweet in enumerate(biden_most_negative['text']):
    print(idx, tweet)
    print()

## Visualize Sentiment Analysis

In [None]:
plt.figure(figsize=(8,6)) 
plt.scatter(trump_df["Polarity"], trump_df["Subjectivity"], color='orangered')            
plt.title('Trump Sentiment Analysis') 
plt.xlabel('Polarity') 
plt.ylabel('Subjectivity') 
plt.show()

In [None]:
plt.figure(figsize=(8,6)) 
plt.scatter(biden_df["Polarity"], biden_df["Subjectivity"], color='dodgerblue') 

plt.title('Biden Sentiment Analysis') 
plt.xlabel('Polarity') 
plt.ylabel('Subjectivity') 
plt.show()

### View the counts of positive, neutral, and negative tweets

In [None]:
trump_df['Sentiment'].value_counts()

In [None]:
plt.title('Trump Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Counts')
trump_df['Sentiment'].value_counts().plot(kind = 'bar', color='orangered')
plt.show()

In [None]:
biden_df['Sentiment'].value_counts()

In [None]:
plt.title('Biden Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Counts')
biden_df['Sentiment'].value_counts().plot(kind = 'bar', color='dodgerblue')
plt.show()

## Percentage of positive/negative tweets of each candidate

### Trump

In [None]:
positive_trump_tweets = trump_df[trump_df.Sentiment == 'Positive']
positive_trump_percent = round(len(positive_trump_tweets) / len(trump_df) * 100, 2)
print(positive_trump_percent, '% of trump tweets are positive.')

In [None]:
# Get % of trump tweets that are neutral
neutral_trump_tweets = trump_df[trump_df.Sentiment == 'Neutral']
neutral_trump_percent = round(len(neutral_trump_tweets) / len(trump_df) * 100, 2)
print(neutral_trump_percent,'% of trump tweets are neutral.')

In [None]:
negative_trump_tweets = trump_df[trump_df.Sentiment == 'Negative']
negative_trump_percent = round(len(negative_trump_tweets) / len(trump_df) * 100, 2)
print(negative_trump_percent, '% of trump tweets are negative.')

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
sentiments = ['positive', 'neutral', 'negative']
percentages = [positive_trump_percent, neutral_trump_percent, negative_trump_percent]
explode = [0.1, 0, 0]

fig1, ax1 = plt.subplots()
pie_colors = ['mediumspringgreen', 'gainsboro', 'salmon']
ax1.pie(percentages, explode=explode, labels=sentiments, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=pie_colors)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Trump Sentiments")
plt.show()

### Biden

In [None]:
positive_biden_tweets = biden_df[biden_df.Sentiment == 'Positive']
positive_biden_percent = round(len(positive_biden_tweets) / len(biden_df) * 100, 2)
print(positive_biden_percent, '% of biden tweets are positive.')

In [None]:
neutral_biden_tweets = biden_df[biden_df.Sentiment == 'Neutral']
neutral_biden_percent = round(len(neutral_biden_tweets) / len(biden_df) * 100, 2)
print(neutral_biden_percent, '% of biden tweets are neutral.')

In [None]:
negative_biden_tweets = biden_df[biden_df.Sentiment == 'Negative']
negative_biden_percent = round(len(negative_biden_tweets) / len(biden_df) * 100, 2)
print(negative_biden_percent, '% of biden tweets are negative.')

In [None]:
percentages = [positive_biden_percent, neutral_biden_percent, negative_biden_percent]

fig2, ax2 = plt.subplots()
ax2.pie(percentages, explode=explode, labels=sentiments, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=pie_colors)
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax2.set_title("Biden Sentiments")
plt.show()

As expected, Trump has a larger proportion of tweets that are perceived as positive (~60% positive compared to Biden's 54% positive). I suspect this is due to Trump's tendency to exaggerate with frequent use of words such as 'great', 'wonderful', etc. Additionally, a slightly larger portion of Trump's tweets are negative in comparison to Biden's.

## Did their sentiments change as the pandemic evolved?

### Trump

In [None]:
# Average polarity per calendar day. Select 'Polarity' before aggregating:
# averaging every column also hits the text/token columns, which raises on
# pandas >= 2.0.
trump_polarity_by_date = trump_df.groupby(by=trump_df['created_at'].dt.date)['Polarity'].mean()
plt.plot(trump_polarity_by_date.index.values, trump_polarity_by_date, color='orangered')
plt.xlabel('Date')
plt.ylabel('Polarity')
plt.title('Trump Average Polarity since December 31 2019')
plt.show()

### Biden

In [None]:
# Average polarity per calendar day. Select 'Polarity' before aggregating:
# averaging every column also hits the text/token columns, which raises on
# pandas >= 2.0.
biden_polarity_by_date = biden_df.groupby(by=biden_df['created_at'].dt.date)['Polarity'].mean()
plt.plot(biden_polarity_by_date.index.values, biden_polarity_by_date, color='dodgerblue')
plt.xlabel('Date')
plt.ylabel('Polarity')
plt.title('Biden Average Polarity since December 31 2019')
plt.show()

### Both on one plot

In [None]:
plt.plot(trump_polarity_by_date.index.values, trump_polarity_by_date, color='orangered')
plt.plot(biden_polarity_by_date.index.values, biden_polarity_by_date, color='dodgerblue')
plt.xlabel('Date')
plt.ylabel('Polarity')
plt.title('Average Polarity since December 31 2019')
plt.legend(['Trump', 'Biden'])
plt.show()

There doesn't seem to be any clear trend from the data. Future work could include investigating the dates and tweets with the largest peaks/valleys to see if those correlate with any coronavirus news events.

# 6. Topic Modeling with [LDA](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation)<a class="anchor" id="topic_modeling">

# Preprocess data for topic modelling
Remove words with fewer than three characters, perform lemmatization and stemming, include custom stopwords for each candidate to improve topic modelling. 

In [None]:
# Define functions for topic model preprocessing
custom_trump_stopwords = ['realdonaldtrump', 'great', 'pass']
# Built once and reused: stem_and_lemmatize() runs for every token of every
# tweet, so constructing both objects on each call is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = PorterStemmer()


def stem_and_lemmatize(text):
    """Lemmatize a single token with WordNet, then return the Porter stem of the result.

    Args:
        text: One token (a word), not a whole tweet.

    Returns:
        The stemmed form of the lemmatized token.
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text))


def lda_processing(tokens):
    """Prepare one tweet's tokens for topic modelling.

    Drops every token listed in ``custom_trump_stopwords`` and passes the rest
    through ``stem_and_lemmatize``. Returns a new list in the original order.
    """
    return [
        stem_and_lemmatize(token)
        for token in tokens
        if token not in custom_trump_stopwords
    ]

In [None]:
trump_df['processed_tokens'] = trump_df['tokens'].apply(lda_processing)
biden_df['processed_tokens'] = biden_df['tokens'].apply(lda_processing)

Before preprocessing:

In [None]:
trump_df['tokens'].head(20)

After preprocessing:

In [None]:
trump_df['processed_tokens'].head(20)

In [None]:
trump_processed_tokens = trump_df['processed_tokens']
biden_processed_tokens = biden_df['processed_tokens']

## Create bags of words on the datasets

In [None]:
trump_dictionary = gensim.corpora.Dictionary(trump_processed_tokens)
biden_dictionary = gensim.corpora.Dictionary(biden_processed_tokens)

In [None]:
print("Number of words in Trump dictionary:", len(trump_dictionary))
print("Number of words in Biden dictionary:", len(biden_dictionary))

In [None]:
# Filter out tokens that appear in less than 10 tweets or more than half the tweets
trump_dictionary.filter_extremes(no_below=10, no_above=0.5)
biden_dictionary.filter_extremes(no_below=10, no_above=0.5)

In [None]:
print("Number of words in Trump dictionary after filtering extremes:", len(trump_dictionary))
print("Number of words in Biden dictionary after filtering extremes:", len(biden_dictionary))

In [None]:
trump_bow_corpus = [trump_dictionary.doc2bow(tweet) for tweet in trump_processed_tokens]
biden_bow_corpus = [biden_dictionary.doc2bow(tweet) for tweet in biden_processed_tokens]

## Improve bag of words with TF-IDF
We can perform topic modelling directly with our bag of words, but we can further improve it first by incorporating [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). 

In [None]:
trump_tfidf = models.TfidfModel(trump_bow_corpus)
biden_tfidf = models.TfidfModel(biden_bow_corpus)

trump_tfidf_corpus = trump_tfidf[trump_bow_corpus]
biden_tfidf_corpus = biden_tfidf[biden_bow_corpus]

## Train the LDA models with different number of clusters

In [None]:
import os

# Training settings shared by every LDA model in this section.
NUM_WORKERS = 3
NUM_PASSES = 15
trump_lda_3 = gensim.models.LdaMulticore(trump_tfidf_corpus, num_topics=3, id2word=trump_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
# Make sure the output folder exists before the model is written into it;
# on a fresh checkout the save would otherwise fail.
os.makedirs('trump_lda', exist_ok=True)
trump_lda_3.save('trump_lda/trump_lda_3.gensim')

In [None]:
trump_lda_5 = gensim.models.LdaMulticore(trump_tfidf_corpus, num_topics=5, id2word=trump_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
trump_lda_5.save('trump_lda/trump_lda_5.gensim')

In [None]:
trump_lda_7 = gensim.models.LdaMulticore(trump_tfidf_corpus, num_topics=7, id2word=trump_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
trump_lda_7.save('trump_lda/trump_lda_7.gensim')

In [None]:
trump_lda_10 = gensim.models.LdaMulticore(trump_tfidf_corpus, num_topics=10, id2word=trump_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
trump_lda_10.save('trump_lda/trump_lda_10.gensim')

In [None]:
import os

biden_lda_3 = gensim.models.LdaMulticore(biden_tfidf_corpus, num_topics=3, id2word=biden_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
# Make sure the output folder exists before the model is written into it;
# on a fresh checkout the save would otherwise fail.
os.makedirs('biden_lda', exist_ok=True)
biden_lda_3.save('biden_lda/biden_lda_3.gensim')

In [None]:
biden_lda_5 = gensim.models.LdaMulticore(biden_tfidf_corpus, num_topics=5, id2word=biden_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
biden_lda_5.save('biden_lda/biden_lda_5.gensim')

In [None]:
biden_lda_7 = gensim.models.LdaMulticore(biden_tfidf_corpus, num_topics=7, id2word=biden_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
biden_lda_7.save('biden_lda/biden_lda_7.gensim')

In [None]:
biden_lda_10 = gensim.models.LdaMulticore(biden_tfidf_corpus, num_topics=10, id2word=biden_dictionary, passes=NUM_PASSES, workers=NUM_WORKERS)
biden_lda_10.save('biden_lda/biden_lda_10.gensim')

## Look at words in each topic with their relative weights

### Trump

In [None]:
for idx, topic in trump_lda_5.print_topics(num_words=7):
    print("Topic %d:" %idx, topic)

### Biden

In [None]:
for idx, topic in biden_lda_5.print_topics(num_words=7):
    print("Topic %d:" %idx, topic)

## Test models on a new unseen tweet

### Trump

In [None]:
unseen_trump_tweet = 'Big China Virus breakouts all over the World, including nations which were thought to have done a great job. The Fake News doesn’t report this. USA will be stronger than ever before, and soon!'
# Run the tweet through the same cleaning and stemming as the training data.
trump_bow_vector = trump_dictionary.doc2bow(lda_processing(basic_processing(unseen_trump_tweet)))
# The model was trained on TF-IDF-weighted vectors, so weight the query the
# same way instead of feeding it raw counts.
trump_tfidf_vector = trump_tfidf[trump_bow_vector]

# List the topics from most to least probable for this tweet.
for index, score in sorted(trump_lda_5[trump_tfidf_vector], key=lambda t: t[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, trump_lda_5.print_topic(index, topn=5)))

### Biden

In [None]:
unseen_biden_tweet = 'Our planet can’t take four more years of Donald Trump. We have to get him out of the White House so we can start treating the climate crisis like the urgent threat it is.'
# Run the tweet through the same cleaning and stemming as the training data.
biden_bow_vector = biden_dictionary.doc2bow(lda_processing(basic_processing(unseen_biden_tweet)))
# The model was trained on TF-IDF-weighted vectors, so weight the query the
# same way instead of feeding it raw counts.
biden_tfidf_vector = biden_tfidf[biden_bow_vector]

# List the topics from most to least probable for this tweet.
for index, score in sorted(biden_lda_5[biden_tfidf_vector], key=lambda t: t[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, biden_lda_5.print_topic(index, topn=5)))

## Visualize topics

### 5 Topics

#### Trump

In [None]:
pyLDAvis.enable_notebook()

trump_lda5_display = pyLDAvis.gensim.prepare(topic_model=trump_lda_5, corpus=trump_tfidf_corpus, dictionary=trump_dictionary)
trump_lda5_display

#### Biden

In [None]:
biden_lda5_display = pyLDAvis.gensim.prepare(topic_model=biden_lda_5, corpus=biden_tfidf_corpus, dictionary=biden_dictionary)
biden_lda5_display

### 7 Topics

#### Trump

In [None]:
trump_lda7_display = pyLDAvis.gensim.prepare(topic_model=trump_lda_7, corpus=trump_tfidf_corpus, dictionary=trump_dictionary)
trump_lda7_display


#### Biden

In [None]:
biden_lda7_display = pyLDAvis.gensim.prepare(topic_model=biden_lda_7, corpus=biden_tfidf_corpus, dictionary=biden_dictionary)
biden_lda7_display

### 10 topics

#### Trump

In [None]:
trump_lda10_display = pyLDAvis.gensim.prepare(topic_model=trump_lda_10, corpus=trump_tfidf_corpus, dictionary=trump_dictionary)
trump_lda10_display

#### Biden

In [None]:
biden_lda10_display = pyLDAvis.gensim.prepare(topic_model=biden_lda_10, corpus=biden_tfidf_corpus, dictionary=biden_dictionary)
biden_lda10_display

As we start to get to 10 topics and above, there begins to be an increasing amount of overlap between the topics.

In the literature, it seems that the biterm topic model (BTM) outperforms the traditional LDA algorithm for shorter text documents such as tweets [[source](https://www.cs.toronto.edu/~jstolee/projects/topic.pdf), [source](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf)].

Future work can include comparing the results of these two models to see whether we observe the suggested improvements. We could also improve the evaluation by using quantitative measures such as topic coherence rather than ad-hoc human interpretation of the topics.

# 7. Classification<a class="anchor" id="classification">

In [None]:
def print_classification_results(predictions, y_test, model_name):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        predictions: Predicted class labels, one per test sample.
        y_test: True class labels for the same samples.
        model_name: Text prefixed to each printed line.

    The heatmap axes are labelled 'Biden' / 'Trump' in that fixed order, which
    assumes the numeric encoding 0 = Biden, 1 = Trump.
    """
    # scikit-learn metrics take (y_true, y_pred); keep that order for all three calls.
    print(model_name, "Accuracy Score:", accuracy_score(y_test, predictions)*100)
    print(model_name, "Classification Report: \n", classification_report(y_test,predictions))

    # Pin the label order so rows/columns always line up with the hard-coded
    # tick labels, even if one class happens to be missing from this sample.
    conf_mat = confusion_matrix(y_test, predictions, labels=[0, 1])

    _, ax = plt.subplots(figsize=(10,10))
    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=['Biden', 'Trump'], yticklabels=['Biden','Trump'])
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Candidate')
    ax.set_ylabel('True Candidate')
    plt.show()

In [None]:
# Numeric class ids used as the classification target: Biden -> 0, Trump -> 1.
BIDEN_LABEL, TRUMP_LABEL = 0, 1
biden_df['label'] = BIDEN_LABEL
trump_df['label'] = TRUMP_LABEL

In [None]:
# Merge their two dataframes together into one training dataframe.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame's own row labels are carried over and may collide between the two
# candidates (NOTE(review): assumes nothing downstream relies on the original
# row labels — none of the classification cells do).
train_df = pd.concat([trump_df, biden_df], ignore_index=True)

In [None]:
# Features come from the plain tokens (no stemming / lemmatization): the
# stemmed and lemmatized tokens gave lower prediction accuracy.
# Each token list is joined back into a string and the classifiers are
# trained on the TF-IDF representation of that text.
#
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# cross-validation and the train/test split further down, so the vocabulary
# and IDF weights also see the evaluation tweets.

detokenizer = TreebankWordDetokenizer()
train_df['training_text'] = train_df['tokens'].apply(detokenizer.detokenize)

tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(train_df['training_text'])
y = train_df['label']

In [None]:
# Try a handful of classifiers that should be good for text classification.
# Every estimator that accepts a seed gets random_state=0 so the comparison
# is repeatable from run to run.

classifiers = [
    LogisticRegression(random_state=0),
    MultinomialNB(),
    svm.SVC(kernel='linear'),
    SGDClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    MLPClassifier(hidden_layer_sizes=(10,), random_state=0),
    RandomForestClassifier(n_estimators=150, random_state=0)
]

# Evaluate with 10-fold cross validation
CV = 10
rows = []  # one (classifier_name, fold, accuracy) tuple per classifier and fold

for clf in classifiers:
    print('Training', clf)
    clf_name = clf.__class__.__name__
    accuracies = cross_val_score(clf, X, y, scoring='accuracy', cv=CV, n_jobs=-1)
    rows.extend((clf_name, fold, accuracy) for fold, accuracy in enumerate(accuracies))

# Long-format results table; the bare name on the last line displays it.
classifiers_df = pd.DataFrame(rows, columns=['classifier_name', 'fold', 'accuracy'])
classifiers_df

In [None]:
# Mean cross-validated accuracy per classifier, best first.
(
    classifiers_df
    .groupby('classifier_name')[['accuracy']]
    .mean()
    .sort_values('accuracy', ascending=False)
)

We have multiple classifiers that score very highly. Let's use the top three together in a voting classifier to see if we can improve the accuracy even more. 

In [None]:
# Hold out 20% of the tweets for evaluation. The split is seeded so the
# reported numbers are reproducible, and stratified so both candidates keep
# the same share of tweets in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

clf1 = MLPClassifier(hidden_layer_sizes=(10,), random_state=0)
clf2 = MultinomialNB()
clf3 = svm.SVC(kernel='linear')

# Majority-vote ensemble over the three models (default voting='hard').
eclf = VotingClassifier(estimators=[('mlp', clf1), ('mnb', clf2), ('svm', clf3)], n_jobs=-1)

# VotingClassifier fits its own clones of the estimators, so the three
# standalone fits below do not feed the ensemble; they are kept so that
# clf1-clf3 remain available as fitted models in their own right.
clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)
clf3 = clf3.fit(X_train, y_train)
eclf = eclf.fit(X_train, y_train)

In [None]:
# Label the held-out tweets with the ensemble, then print the accuracy and
# per-class report and draw the confusion matrix.
y_pred = eclf.predict(X_test)
print_classification_results(
    predictions=y_pred,
    y_test=y_test,
    model_name="Voting Classifier",
)

We were able to slightly improve accuracy, from ~93% to ~95%, by combining the top three individual classifiers in a voting ensemble.

In [None]:
# An ROC curve needs a score that can be thresholded; the hard 0/1 labels
# from eclf.predict collapse it to a single operating point. With hard voting,
# eclf.transform returns each member's predicted label per sample, so the
# row mean is the share of members voting for class 1 and serves as the score.
# NOTE: this relies on `eclf` using voting='hard' and on the labels being 0/1.
vote_share = eclf.transform(X_test).mean(axis=1)

fpr, tpr, _ = metrics.roc_curve(y_test, vote_share)
plt.plot(fpr,tpr,linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.show()