In [1]:
# Libraries
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from nltk.corpus import stopwords
import re

import pyLDAvis
import pyLDAvis.gensim

In [2]:
# reading in data from contest 853
df = pd.read_csv("853.csv")

df.head()

Unnamed: 0,caption,mean,precision,votes,not_funny,somewhat_funny,funny
0,Who knew the Swiss had a navy?,1.675661,0.018367,1816,946,513,357
1,Weapons down. It's the Swiss.,1.672884,0.020759,1394,721,408,265
2,"Wow, and that’s just the tip of the Jarlsberg!",1.643323,0.01838,1685,886,512,286
3,All we need now is to find a port,1.639823,0.012828,3837,2151,917,769
4,We must be directly over where Wisconsin used ...,1.638809,0.022989,1041,542,333,166


In [3]:
# removing columns
df_subset = df.drop(columns=['mean', 'precision', 'votes', 'not_funny', 'somewhat_funny', 'funny'], axis=1)

# subsetting first 100 rows
df_subset = df_subset.head(100)

df_subset.head()

Unnamed: 0,caption
0,Who knew the Swiss had a navy?
1,Weapons down. It's the Swiss.
2,"Wow, and that’s just the tip of the Jarlsberg!"
3,All we need now is to find a port
4,We must be directly over where Wisconsin used ...


In [4]:
# stopwords
stopwords = stopwords.words("english")

In [5]:
# Remove punctuation
df_subset['caption_processed'] = \
df_subset['caption'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
df_subset['caption_processed'] = \
df_subset['caption_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
df_subset.head()

  df_subset['caption'].map(lambda x: re.sub('[,\.!?]', '', x))


Unnamed: 0,caption,caption_processed
0,Who knew the Swiss had a navy?,who knew the swiss had a navy
1,Weapons down. It's the Swiss.,weapons down it's the swiss
2,"Wow, and that’s just the tip of the Jarlsberg!",wow and that’s just the tip of the jarlsberg
3,All we need now is to find a port,all we need now is to find a port
4,We must be directly over where Wisconsin used ...,we must be directly over where wisconsin used ...


In [6]:
# lemmatization

def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

lemmatized_texts = lemmatization(df_subset.caption_processed)
print(lemmatized_texts)



In [7]:
# removing stopwords and tokenizing
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc = True)
        final.append(new)
    return(final)
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stopwords] for doc in texts]

data_words = gen_words(lemmatized_texts)
data_words = remove_stopwords(data_words)
print(data_words)



In [8]:
# creating a dictionary

id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)
    
print(corpus[0][0:20])

word = id2word[[0][:1][0]]
print(word)

[(0, 1), (1, 1), (2, 1)]
know


In [9]:
# creating model

lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                           id2word = id2word,
                                           num_topics = 30,
                                           random_state = 100,
                                           update_every = 1,
                                           chunksize = 100,
                                           passes = 10,
                                           alpha = "auto")

In [10]:
# visualizing model

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = "mmds", R = 30)

vis

