# Topic Modelling

### 1. Strip links, hashtags & mentions from the data
### 2. Apply Topic Modelling

In [1]:
import pandas as pd
import re

# Load the tweets and keep only two columns: "Unnamed: 0.1" (presumably a
# leftover index column from an earlier export -- verify) and the raw tweet text.
df = pd.read_csv("data/tweets_by_state.csv")[["Unnamed: 0.1", "text"]]

def extract_text(text):
    """Return *text* with links, hashtags and mentions removed.

    Removed spans are:
      * URLs starting with ``http://`` / ``https://`` or ``www.``
      * hashtags (``#`` followed by non-whitespace characters)
      * mentions (``@`` followed by non-whitespace characters)

    Surrounding whitespace is left untouched, so removing a span may leave
    consecutive spaces behind.

    Non-string input (e.g. the NaN pandas uses for a missing cell, or None)
    is treated as empty text and yields ``''`` instead of making ``re.sub``
    raise a TypeError.
    """
    if not isinstance(text, str):
        return ''
    # The groups around the hashtag/mention bodies were never used, so the
    # pattern is written without them; the matched spans are identical.
    pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+|#[^\s]+|@[^\s]+'
    return re.sub(pattern, '', text)

# New 'stripped' column: every tweet's text run through extract_text.
df['stripped'] = df['text'].apply(extract_text)

In [2]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Lemmatizer object used to lemmatize individual words
lemma = WordNetLemmatizer()

# Set of punctuation characters to strip from the text
exclude = set(string.punctuation)

# Set of English stopwords to drop from the text
stop = set(stopwords.words('english'))

def clean(doc):
    """Return a cleaned, space-separated version of the string *doc*.

    Steps, in order:
      1. lower-case the text and drop every whitespace-separated token that
         is in the module-level ``stop`` set;
      2. delete every character that is in the module-level ``exclude`` set;
      3. lemmatize each remaining token with the module-level ``lemma``.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_stopwords = " ".join(kept_tokens)

    # Punctuation is removed character by character, after the stopword pass.
    without_punctuation = "".join(
        filter(lambda char: char not in exclude, without_stopwords)
    )

    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

In [3]:
# Clean every stripped tweet, then tokenize it into a list of words.
# clean() joins its tokens with single spaces, so splitting on whitespace gives
# the same tokens as split(' ') -- except for a tweet that is empty after
# cleaning: ''.split(' ') is [''] (a bogus empty-string token that would end up
# in the dictionary), whereas ''.split() is [].
df['clean_text'] = df['stripped'].apply(clean)
df['clean_text'] = df['clean_text'].apply(lambda tweet: tweet.split())

In [4]:
def remove_amp(text):
    """Return a new list with every "amp" token removed from *text*.

    *text* is expected to be a list of word tokens. "amp" tokens are
    presumably what remains of HTML-escaped ampersands ("&amp;") once the
    punctuation has been stripped -- verify against the raw data.

    The input list is left unmodified; only tokens that are exactly "amp"
    are dropped (e.g. "amplify" is kept).
    """
    # Single pass that builds a fresh list, instead of repeatedly scanning and
    # mutating the caller's list with `in` + `remove`.
    return [token for token in text if token != "amp"]

In [5]:
# Drop the "amp" tokens from every tweet's token list.
df['clean_text'] = df['clean_text'].apply(remove_amp)

In [6]:
import gensim
from gensim import corpora

# Build the term dictionary of the corpus: each unique term is assigned an integer id
dictionary = corpora.Dictionary(df['clean_text'])

# Cap the vocabulary at 5000 terms via keep_n (see https://stackoverflow.com/questions/36250297/how-to-map-the-word-in-data-frame-to-integer-id-with-python-pandas-and-gensim)
dictionary.filter_extremes(keep_n=5000, no_below=1, no_above=1)

# Convert every token list to its bag-of-words form, a list of (word_id, frequency) pairs
df["bow"] = df["clean_text"].apply(dictionary.doc2bow)

In [7]:
# Short alias for gensim's LDA model class
Lda = gensim.models.ldamodel.LdaModel

# Train an LDA model with 5 topics on the bag-of-words column, making 100 passes
# over it; `dictionary` maps the word ids back to words.
# %time is an IPython magic that reports how long the statement took to run.
%time ldamodel = Lda(df['bow'], num_topics=5, id2word = dictionary, passes=100)

Wall time: 2h 10min 12s


In [8]:
# Ask the trained model for its 5 topics, each described by 4 words
topics = ldamodel.print_topics(num_topics=5, num_words=4)

# Print one topic per line; enumerate supplies the running index instead of a
# hand-maintained counter.
for topic_number, topic in enumerate(topics):
    print("Topic", topic_number, "->", topic)

Topic 0 -> (0, '0.024*"hillary" + 0.020*"clinton" + 0.016*"tax" + 0.014*"health"')
Topic 1 -> (1, '0.067*"u" + 0.028*"2" + 0.017*"4" + 0.017*"world"')
Topic 2 -> (2, '0.033*"trump" + 0.014*"vote" + 0.013*"see" + 0.012*"im"')
Topic 3 -> (3, '0.018*"lie" + 0.018*"like" + 0.017*"say" + 0.014*"shit"')
Topic 4 -> (4, '0.030*"hillary" + 0.020*"get" + 0.019*"deplorable" + 0.018*"trump"')
