# Final Project

## We use Donald Trump's and Hillary Clinton's tweets from before the 2016 presidential election and analyze the two datasets to find the words each candidate used most often.

In [1]:
#Packages to be used in the project.
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as skl
from nltk.corpus import stopwords
#nltk.download('all')
#nltk.download('corpus')


# Loading and Cleaning the Data

In [2]:
#Load the raw tweet exports into DataFrames.
df_trump = pd.read_csv('Trump_Tweets.csv', encoding='latin-1')

#The second export is narrowed to the rows whose handle is 'HillaryClinton'.
df_trump_hillary = pd.read_csv('Trump_Hillary_Tweets.csv')
df_hillary = df_trump_hillary.loc[df_trump_hillary['handle'].eq('HillaryClinton')]

In [3]:
#Clean the Trump tweets: drop the two unnamed columns.
#drop(..., errors='ignore') returns a new frame and skips labels that are
#already gone, so re-running this cell does not raise KeyError the way a
#second `del df_trump[...]` would.
df_trump = df_trump.drop(columns=['Unnamed: 10', 'Unnamed: 11'], errors='ignore')

In [4]:
#Sanity check that the Hillary subset exists; the trailing ';' suppresses the
#notebook's display of the frame, so this cell renders no output.
df_hillary;

In [5]:
#Preview: split the first Trump tweet (row label 0) on whitespace to see what
#the raw words look like before any tokenizing or filtering.
tweet = df_trump.loc[0, 'Tweet_Text'].split()
print(tweet)

['Today', 'we', 'express', 'our', 'deepest', 'gratitude', 'to', 'all', 'those', 'who', 'have', 'served', 'in', 'our', 'armed', 'forces.', '#ThankAVet', 'https://t.co/wPk7QWpK8Z']


# Methods

### Parsing the Trump Tweets.

In [6]:
#METHOD: Parse Trump tweets and tokenize them for frequencies.

#Combine every Trump tweet into one big string (each tweet prefixed by a space).
#Joining once avoids rebuilding the string on every iteration, and
#dropna()/astype(str) keep a missing or non-string cell from raising TypeError
#during concatenation.
trump_texts = df_trump['Tweet_Text'].dropna().astype(str)
whole_str = ''.join(' ' + text for text in trump_texts)

#Tokenize the big Trump string.
tokens = nltk.tokenize.word_tokenize(whole_str)

#Tokens to discard: NLTK's English stopwords, plus a hand-picked list
#(quote/ellipsis/dash tokens, URL schemes, 'amp', and the capitalized 'I'/'The').
english_stopwords = set(stopwords.words('english'))
trump_noise = {'``', '...', '--', "''", 'https', 'http', 'I', 'amp', 'The'}

#`w in string.punctuation` is a substring test against the punctuation string,
#so it removes single punctuation characters (and any run that happens to
#appear contiguously in that string).
#NOTE(review): all matching here is case-sensitive, so capitalized stopwords
#other than 'I'/'The' are still counted, and differently-cased spellings of the
#same word are tallied separately. Lowercasing tokens would fix that but changes
#the reported counts - confirm before changing.
fdist = nltk.FreqDist(
    w for w in tokens
    if w not in english_stopwords
    and w not in string.punctuation
    and w not in trump_noise
)

In [7]:
#Print the one-line FreqDist summary: number of distinct tokens (samples) and
#total tokens counted (outcomes).
print(fdist)

<FreqDist with 17658 samples and 83048 outcomes>


In [8]:
#Show the 50 most frequent remaining Trump tokens as (token, count) pairs,
#highest count first.
#NOTE(review): counts are case-sensitive - e.g. 'great', 'Great' and 'GREAT'
#appear as separate entries in the output.
fdist.most_common(50)

[('realDonaldTrump', 1507),
 ('Trump', 1003),
 ('Thank', 753),
 ('Trump2016', 596),
 ('great', 555),
 ('Hillary', 528),
 ('MakeAmericaGreatAgain', 508),
 ('RT', 448),
 ('people', 376),
 ('We', 344),
 ('Donald', 304),
 ('Great', 299),
 ('CNN', 298),
 ('Clinton', 294),
 ('FoxNews', 272),
 ('America', 269),
 ('like', 253),
 ('New', 252),
 ('get', 230),
 ('Will', 225),
 ('tonight', 222),
 ('Crooked', 215),
 ('AMERICA', 212),
 ('Cruz', 212),
 ('poll', 208),
 ('going', 206),
 ('A', 203),
 ('debate', 194),
 ('one', 193),
 ('GREAT', 191),
 ('would', 191),
 ('last', 188),
 ('country', 181),
 ('said', 180),
 ('time', 175),
 ('back', 174),
 ('big', 174),
 ('He', 174),
 ('Just', 168),
 ('You', 168),
 ('MAKE', 165),
 ('Iowa', 164),
 ('AGAIN', 162),
 ('much', 161),
 ('want', 157),
 ('vote', 156),
 ('President', 155),
 ('GOP', 155),
 ('TRUMP', 151),
 ('many', 150)]

### Parsing the Hillary Tweets.

In [9]:
#METHOD: Parse Hillary Tweets and tokenize them for frequencies.
#NOTE(review): this cell mirrors the Trump parsing cell step for step; a shared
#helper taking the tweet column and the extra exclusion list would remove the
#duplication.

#Combine every Hillary tweet into one big string (each tweet prefixed by a space).
#Joining once avoids rebuilding the string on every iteration, and
#dropna()/astype(str) keep a missing or non-string cell from raising TypeError
#during concatenation.
hillary_texts = df_hillary['text'].dropna().astype(str)
hillary_str = ''.join(' ' + text for text in hillary_texts)

#Tokenize the big Hillary string.
hillary_tokens = nltk.tokenize.word_tokenize(hillary_str)

#Tokens to discard: NLTK's English stopwords, plus a hand-picked list
#(quote/ellipsis/dash tokens, URL schemes, 'amp', and the capitalized 'I').
#NOTE(review): unlike the Trump cell, 'The' is not in this list, so 'The' is
#counted for Hillary but not for Trump - confirm whether that is intended.
english_stopwords_h = set(stopwords.words('english'))
hillary_noise = {'``', '...', '--', "''", 'https', 'http', 'I', 'amp'}

#`w in string.punctuation` is a substring test against the punctuation string;
#all matching is case-sensitive.
fdist_h = nltk.FreqDist(
    w for w in hillary_tokens
    if w not in english_stopwords_h
    and w not in string.punctuation
    and w not in hillary_noise
)

In [10]:
#Show the 20 most frequent remaining Hillary tokens as (token, count) pairs,
#highest count first.
#NOTE(review): the output still contains tokenizer fragments and dash tokens
#("'s", "n't", '—', '—Hillary') that the filters above do not remove.
fdist_h.most_common(20)

[('Trump', 865),
 ("'s", 807),
 ('Hillary', 424),
 ('Donald', 400),
 ('We', 366),
 ('—', 347),
 ('—Hillary', 308),
 ("n't", 237),
 ('president', 215),
 ('America', 190),
 ('people', 182),
 ('make', 170),
 ('The', 165),
 ('us', 162),
 ('one', 148),
 ('POTUS', 144),
 ('families', 131),
 ('need', 129),
 ('Americans', 125),
 ('would', 122)]

### Finding the ratio between favorites and retweets.

In [11]:
#METHOD: Compute the overall favorites-to-retweets ratio for the Trump tweets.

#Planned use: retweets are to be multiplied by this ratio so that they are
#weighed more heavily than favorites.

sum_fav = df_trump.loc[:, 'twt_favourites_IS_THIS_LIKE_QUESTION_MARK'].sum()
sum_rtwt = df_trump.loc[:, 'Retweets'].sum()

ratio = sum_fav / sum_rtwt

print('The ratio of favorites to retweet is: ', ratio)

The ratio of favorites to retweet is:  2.5952112676056336


### Create a tuple of retweets/favorites for each word.

In [12]:
#TODO: Build a dictionary keyed by word, whose value is a tuple of (retweets, favorites), unweighted.
#TODO: Then weigh retweets more heavily by multiplying them by the ratio.

# Data Visualizations

In [13]:
#TODO: Make a histogram (1-2 graphs).

# Analysis