In [1]:
import pandas as pd
import json
import time
import matplotlib
import matplotlib.pyplot as plt
import re
import numpy as np
from nltk.tokenize import RegexpTokenizer
import nltk
# Import all the libraries required
import os
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import rgb2hex
from descartes import PolygonPatch
from shapely.geometry import Polygon, MultiPolygon
# Importing Gensim
import gensim
from gensim import corpora
import time
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string



## What topics come up from the tweets that talk about Hillary and Trump separately?

In [2]:
tweets_data = []

#with open('tweets_reduced_10000.jsons', 'r') as tweets_file:
#with open('geotagged_tweets_20160812-0912.jsons', 'r') as tweets_file:
#with open('customTweets.jsons', 'r') as tweets_file:
with open('customTweetsWithSentiment2.jsons', 'r') as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except Exception as e:
            print (e)
            continue

### Splitting tweets

In [3]:
tweets_trump = [tweet['text'] for tweet in tweets_data if tweet['talk'] == 'trump' and tweet['place'] != None and tweet['place']['country_code'] == 'US']
tweets_hillary = [tweet['text'] for tweet in tweets_data if tweet['talk'] == 'hillary' and tweet['place'] != None and tweet['place']['country_code'] == 'US']

In [4]:
print('Trump only:   ' + str(len(tweets_trump)))
print('Hillary only: ' + str(len(tweets_hillary)))

Trump only:   339787
Hillary only: 144882


### Cleaning

In [6]:
# A function that removes the hyperlinks from the tweet's content.
def remove_link(tweet):
    return re.sub(r"http\S+", "", tweet)

def remove_hashtags(tweet):
    return re.sub('#', '', tweet)

def remove_entire_hashtags(tweet):
    return re.sub(r'(?:\s|^)#[A-Za-z0-9\-\.\_]+(?:\s|$)', ' ', tweet).strip()

def remove_mentions(tweet):
    return " ".join(re.sub("(?:\@|https?\://)\S+"," ",tweet).split())


def remove_non_az_characters(tweet):
    return " ".join(re.sub('[^a-zA-Z]'," ",tweet).split())

In [66]:
remove_mentions("hello @you how is @Albert? #angry https://google.com ")

'hello how is #angry'

In [73]:
remove_non_az_characters("hello 45 @you how is @Albert? #angry https://google.com ")

'hello you how is Albert angry https google com'

In [44]:
remove_entire_hashtags('#start hello #angry you #bye')

'hello you'

In [7]:
# https://pypi.org/project/emoji/

import emoji

def extract_emojis(str):
    return ''.join(c for c in str if c in emoji.UNICODE_EMOJI)

def text_without_emojis(str):
    return ''.join(c for c in str if c not in emoji.UNICODE_EMOJI)

text_with_emoji = "Come to Jesus meeting!!!! What on earth is that supposed to be? 😔 https://t.co/a3lOpTtFig"
print(text_with_emoji)
print(extract_emojis(text_with_emoji))
print(text_without_emojis(text_with_emoji))

Come to Jesus meeting!!!! What on earth is that supposed to be? 😔 https://t.co/a3lOpTtFig
😔
Come to Jesus meeting!!!! What on earth is that supposed to be?  https://t.co/a3lOpTtFig


In [8]:
# Create a set of stopwords
stop = set(stopwords.words('english'))

# Create a set of punctuation words 
exclude = set(string.punctuation) 

# This is the function makeing the lemmatization
lemma = WordNetLemmatizer()

# In this function we perform the entire cleaning
def clean(doc):
    doc = doc.lower()
    doc = remove_link(doc)
    #print('without links:', doc)
    doc = remove_mentions(doc)
    #print('without mentions:', doc)
    doc = text_without_emojis(doc)
    #print('without emojis:', doc)
    doc = remove_entire_hashtags(doc)
    #print('without hashtags:', doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    #print('stop free:', doc)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    #print('punc_free:', doc)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    #print('normalized:', normalized)
    normalized = remove_non_az_characters(normalized)
    #print('non-az removed:', normalized)
    normalized = " ".join(word for word in normalized.split() if len(word) > 3)
    return normalized

In [82]:
clean('I like reading, so I read 45. @remy: This is @hello waaaaayyyy too much. For you!!!!!! #angry https://google.com 😔')

without links: i like reading, so i read 45. @remy: this is @hello waaaaayyyy too much. for you!!!!!! #angry  😔
without mentions: i like reading, so i read 45. this is waaaaayyyy too much. for you!!!!!! #angry 😔
without emojis: i like reading, so i read 45. this is waaaaayyyy too much. for you!!!!!! #angry 
without hashtags: i like reading, so i read 45. this is waaaaayyyy too much. for you!!!!!!
stop free: i like reading, so i read 45. this is waaaaayyyy too much. for you!!!!!!
punc_free: i like reading, so i read 45. this is waaaaayyyy too much. for you!!!!!!
normalized: like reading read 45 waaaaayyyy much you
non-az removed: like reading read waaaaayyyy much you


'like reading read waaaaayyyy much'

In [83]:
clean('@remy: This is @hello waaaaayyyy too much. For you!!!!!! #angry https://google.com 😔')

without links: @remy: this is @hello waaaaayyyy too much. for you!!!!!! #angry  😔
without mentions: this is waaaaayyyy too much. for you!!!!!! #angry 😔
without emojis: this is waaaaayyyy too much. for you!!!!!! #angry 
without hashtags: this is waaaaayyyy too much. for you!!!!!!
stop free: this is waaaaayyyy too much. for you!!!!!!
punc_free: this is waaaaayyyy too much. for you!!!!!!
normalized: waaaaayyyy much you
non-az removed: waaaaayyyy much you


'waaaaayyyy much'

### Clean words like 'trump' or 'hillary'

In [9]:
def removeKeyWords(candidate, tweets_text):
    keyWords = {
        'hillary' : ['hillary', 'clinton', 'imwithher'],
        'trump' : ['donald', 'trump']
    }
    words = keyWords[candidate]
    for idx, txt in enumerate(tweets_text):
        tweet = ''
        for word in txt.split():
            found = False
            for keyword in words:
                if keyword in word:
                    found = True
                    break
            if not found:    
                tweet += word + ' '
        tweets_text[idx] = tweet.strip()
    return tweets_text  

In [19]:
tweets_text = ['hello hillarybla Hillary clinton im']
removeKeyWords('hillary', tweets_text)

['hello Hillary im']

In [30]:
clean('hello hillarybla Hillary clinton im')

'hello hillarybla hillary clinton'

In [34]:
arr = removeKeyWords('hillary', [clean('hello hillarybla aaaaa Hillary clinton im')])


['hello aaaaa']

In [35]:
def topicModelingCleanedTweets(tweets_text, candidate):
    # This is the clean corpus.
    doc_clean = [removeKeyWords(candidate, [clean(doc)])[0].split() for doc in tweets_text] 
    # Creating the term dictionary of our courpus, where every unique term is assigned an index
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    NUM_TOPICS = 4

    start_time = time.time()


    # Creating the object for LDA model using gensim library
    Lda = gensim.models.ldamodel.LdaModel

    # Running and Trainign LDA model on the document term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=NUM_TOPICS, id2word = dictionary, passes=100)

    elapsed_time = time.time() - start_time
    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    # Print 4 topics and describe then with 4 words.
    topics = ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=4)

    i=0
    for topic in topics:
        print ("Topic",i ,"->", topic)     
        i+=1

In [36]:
topicModelingCleanedTweets(tweets_trump, 'trump')

Topic 0 -> (0, '0.035*"today" + 0.026*"pressure" + 0.026*"wind" + 0.026*"rain"')
Topic 1 -> (1, '0.020*"vote" + 0.017*"america" + 0.015*"make" + 0.015*"great"')
Topic 2 -> (2, '0.012*"like" + 0.012*"know" + 0.012*"putin" + 0.008*"tweet"')
Topic 3 -> (3, '0.018*"like" + 0.007*"know" + 0.007*"think" + 0.006*"would"')


In [37]:
topicModelingCleanedTweets(tweets_hillary, 'hillary')

Topic 0 -> (0, '0.018*"health" + 0.016*"email" + 0.016*"better" + 0.010*"question"')
Topic 1 -> (1, '0.021*"woman" + 0.009*"support" + 0.008*"racist" + 0.008*"money"')
Topic 2 -> (2, '0.015*"would" + 0.014*"time" + 0.013*"think" + 0.012*"know"')
Topic 3 -> (3, '0.015*"like" + 0.014*"pneumonia" + 0.012*"deplorable" + 0.011*"look"')
