# Keeping Trump on Topic: LIN353C Final Project

By Hannah Brinsko and Aditya Kharosekar

In [101]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import csv
import datetime
import gensim
from gensim.models.keyedvectors import KeyedVectors

## Preprocessing - Scraping tweets, and cleaning them up

Importing tweets from the CSV file - 

In [103]:
tweets_csv = pd.read_csv("tweets.csv")
trump_tweets = tweets_csv[tweets_csv['handle']=="realDonaldTrump"]
print(trump_tweets.shape)
print(trump_tweets['time'].head())

(3218, 28)
5     2016-09-27T22:13:24
8     2016-09-27T21:08:22
11    2016-09-27T20:31:14
12    2016-09-27T20:14:33
13    2016-09-27T20:06:25
Name: time, dtype: object


Looking at the CSV file, we see that it contains tweets only up to 09/27/2016. We need his more recent tweets as well

## Getting his most recent ~3200 tweets.

3200 is approximately the limit to how many tweets Tweepy allows us to scrape. As it turns out, this is more than enough for our use when combined with our CSV.

In [104]:
import tweepy
import json
from tweepy import OAuthHandler
import codecs

consumer_key = "i387QW7Eqgh12UHmK3VoQO9K5"
consumer_secret = "BQI8c5eKale4etdA21mawnFqOmAziDQpnThm679V7UtLjbWlMG"
access_token = "816857419338764288-S8Ay111O2Mo32QAs88tSnv5uKvmGCkF"
access_secret = "HVU19yLuV0klltJl1fsDibAi7Hiq1U4GwsEV9kozTAc1m"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

all_tweets = []

new_tweets = api.user_timeline(screen_name="realDonaldTrump", count=200)


all_tweets.extend(new_tweets)
oldest = all_tweets[-1].id-1

t = new_tweets[0];

while len(new_tweets) > 0:
   new_tweets = api.user_timeline(screen_name = "realDonaldTrump", count=200, max_id = oldest)
   all_tweets.extend(new_tweets)
   oldest = all_tweets[-1].id-1;

In [105]:
print(len(all_tweets))

3210


We now have his most recent tweets.

However, there is significant overlap between the tweets that we have scraped from his account and the tweets that are in the CSV file.

The latest tweet in the CSV file was posted on September 27, 2016 at 22:13:24. So, we need to keep any scraped tweets which were posted after this.

In [106]:
tweets = []
i = 0
while (all_tweets[i].created_at!=datetime.datetime(2016, 9, 27, 22, 13, 24)):
    tweets.append(all_tweets[i].text)
    i+=1

In [107]:
tweets1 = trump_tweets['text']
tweets1 = tweets1.tolist()
for t in tweets1:
    tweets.append(t)
print(type(tweets))
print("We have ", len(tweets), "tweets to work with")

<class 'list'>
We have  4660 tweets to work with


Making distributional vectors from each tweet

But to do that, we need to - 
1. Remove any twitter links and image links
2. Remove any stopwords
3. Make sure that we have a list of tweets where each tweet is a string
4. Then use CountVectorizer http://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

### Removing links

In [108]:
temp_tweets = []
for t in tweets:
    temp_tweets.append(t.lower().split())

print(temp_tweets[1])
for t in temp_tweets:
    for w in t:
        if "http" in w or "@" in w: #I've removed any instances where he tags anyone in his tweets. 
                                    #I thought the word vectors might be too sparse if I left those in.
            t.remove(w)
print(temp_tweets[1]) 

['it', 'was', 'a', 'great', 'honor', 'to', 'welcome', "atlanta's", 'heroic', 'first', 'responders', 'to', 'the', 'white', 'house', 'this', 'afternoon!', 'https://t.co/ztc14aj0xs']
['it', 'was', 'a', 'great', 'honor', 'to', 'welcome', "atlanta's", 'heroic', 'first', 'responders', 'to', 'the', 'white', 'house', 'this', 'afternoon!']


## NOTE: This link removal is not working properly. Have to fix later

### Removing stopwords

In [109]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

for t in temp_tweets:
    for w in t:
        if w in stop:
            t.remove(w)

Succesfully removed stopwords. At this point, each tweet is a list of words and temp_tweets is a list. What we need to use CountVectorizer is a list where each element is a string.

Therefore, we need to convert each tweet from a lists of words to a string.

In [110]:
tweets = []
for t in temp_tweets:
    tweets.append(' '.join(t))
type(tweets[0])

str

## Our methodology for classifying tweets

Step 0 - Download a pre-trained Word2Vec model. We tried training our own model, but we did not have enough data.

Step 1 - Hand tag some number of tweets (we ended up tagging about 280 tweets) and classify them into the following categories - 
1. Foreign Policy / International News
2. Domestic Policy / domestic news
3. Tweets about the media
4. Attack tweets
5. Other tweets
6. Tweets about the election

Step 2 - From our hand-tagged corpus, and for each category, create a list of words used.

Step 3 - Create a word vector for each category by summing up the individual word vectors

Step 4 - For each subsequent tweet, find cosine similarity between it and each category vector. Assign that tweet to the category it is most similar to

### Step 0 - Downloading a pre-trained Word2Vec model

In [111]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding = 'utf8')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    print("Done.",len(model)," words loaded")
    return model

glove_model = loadGloveModel('glove.txt')

Loading Glove Model
Done. 400000  words loaded


In [112]:
glove_model['hillary']

[0.14675,
 1.1692,
 0.69416,
 -0.061429,
 -0.13677,
 0.42015,
 -0.716,
 0.019014,
 -0.52896,
 -0.83643,
 -1.8561,
 -0.18324,
 0.057648,
 -0.31188,
 0.024997,
 0.045878,
 -0.098728,
 -0.21451,
 0.14298,
 -0.0080809,
 -0.14569,
 0.38326,
 0.63811,
 -0.46426,
 1.0953,
 -2.15,
 -0.18462,
 0.1738,
 -0.50607,
 0.00057719,
 0.52828,
 0.6685,
 -0.89692,
 -0.34346,
 -0.15456,
 -0.97313,
 -0.69441,
 0.59201,
 -1.2194,
 -1.3469,
 -0.25691,
 0.34537,
 -0.43824,
 -0.096233,
 0.29882,
 -0.29174,
 -0.47201,
 -0.32221,
 0.079279,
 0.59419]

In [113]:
google_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

In [114]:
google_model.similarity('King', 'Kong')

0.31701805595455718

google_model is our pre-trained model which we will be using.

### Step 1 - Hand tagging tweets

Using a .csv file of ~280 hand tagged tweets, we place the tweets into the preselected categories

In [147]:
foreign = []
domestic =[]
media =[]
attack = []
election = []
other = []

In [148]:
f = open('tagged_tweets.csv')
csv_f = csv.reader(f)

for row in csv_f:
    tweet = row[0]
    cats = row[1]
    if "1" in cats:
        foreign.append(tweet)
    elif "2" in cats:
        domestic.append(tweet)
    elif "3" in cats:
        media.append(tweet)
    elif "5" in cats:
        other.append(tweet)
    elif "6" in cats:
        election.append(tweet)



In [149]:
print("Domestic: ",len(domestic))
print("Foreign: ", len(foreign))
print("Media: ", len(media))
print("Other: ",len(other))
print("Election: ",len(election))

Domestic:  111
Foreign:  62
Media:  77
Other:  107
Election:  80


### Step 2 - Making a list of words used in each category

In [150]:
#Each tweet is a string right now.
#This function will split up the string into individual words, remove any words which start with @
#(i.e our generated tweets won't have any tags)

def clean_up(tweets):
    tweets1 = []
    for t in tweets:
        tweets1.append(t.split())
        
    tweets_words = []
    for t in tweets1:
        for w in t:
            tweets_words.append(w)
    tweets_words = tweets_words
    
    #removing '@' from any word which has it. The google_model does not have any words which start with @
    temp_words = []
    for word in tweets_words:
        if word[0]=='@':
            temp_words.append(word[1:])
        else:
            temp_words.append(word)
    return temp_words

In [151]:
domestic_words = clean_up(domestic)
foreign_words = clean_up(foreign)
media_words = clean_up(media)
election_words = clean_up(election)

### Step 3 - Create a category vector by adding up individual word vectors

In [152]:
def create_category_vector(words):
    vector = np.ones(300)
    for word in words:
        try:
            vector = vector + google_model[word]
        except KeyError: #some words are not in model. I don't want to pre-process everything so I'm just handling each exception
            pass
    return vector

In [153]:
domestic_vector = create_category_vector(domestic_words)
foreign_vector = create_category_vector(foreign_words)
media_vector = create_category_vector(media_words)
election_vector = create_category_vector(election_words)

In [154]:
tweets[56]

'great meeting a wonderful woman today, former secretary state, condoleezza rice! #usa🇺🇸'

In [155]:
def create_tweet_vector(tweet):
    vector = np.ones(300)
    for word in tweet:
        try:
            vector = vector + google_model[word]
        except KeyError:
            pass
    return vector

In [130]:
def calc_cosine_similarity(tweet_vector, category_vector):
    return cosine_similarity(tweet_vector, category_vector)

In [156]:
def calc_scores(tweet):
    score = calc_cosine_similarity(create_tweet_vector(tweet), domestic_vector)
    print("Domestic: ",score)
    score = calc_cosine_similarity(create_tweet_vector(tweet), foreign_vector)
    print("Foreign:", score)
    score = calc_cosine_similarity(create_tweet_vector(tweet), media_vector)
    print("Media:", score)
    score = calc_cosine_similarity(create_tweet_vector(tweet), election_vector)
    print("Election: ",score)

In [157]:
calc_scores(tweets[56])

Domestic:  [[ 0.2913776]]
Foreign: [[ 0.27067024]]
Media: [[ 0.34103627]]
Election:  [[ 0.40958698]]


