### Latent Dirichlet Allocation (LDA)

In [17]:
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop_words.pkl')
data

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,abc,ability,abject,able,ablebodied,...,yulin,yummy,yyou,zealand,zeppelin,zero,zillion,zombie,zoning,éclair
louis,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,2,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,1,1,2,0,...,1,1,0,0,0,0,0,0,0,0
bo,0,1,1,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
bill,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,1,2,1,0
jim,0,0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,1
hasan,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
ali,0,0,0,0,0,0,0,0,2,0,...,0,0,1,0,0,0,0,1,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,10,0,0,0,0,0,0


In [18]:
from gensim import matutils, models
import scipy.sparse

In [19]:
term_doc_matrix = data.transpose()
term_doc_matrix

Unnamed: 0,louis,dave,ricky,bo,bill,jim,john,hasan,ali,anthony,mike,joe
aaaaah,0,0,0,0,1,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
zero,2,0,0,1,1,0,0,1,0,0,1,0
zillion,0,0,0,0,1,0,0,0,0,0,0,0
zombie,0,0,0,0,2,0,0,0,1,0,0,0
zoning,0,0,0,0,1,0,0,0,0,0,0,0


In [20]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(term_doc_matrix)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [21]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open('cv_stop_words.pkl', "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [22]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.007*"shit" + 0.007*"life" + 0.007*"going" + 0.006*"fucking" + 0.006*"look" + 0.005*"love" + 0.005*"theyre" + 0.005*"say" + 0.005*"shes" + 0.004*"mean"'),
 (1,
  '0.010*"voice" + 0.007*"joke" + 0.006*"little" + 0.006*"fucking" + 0.005*"love" + 0.005*"year" + 0.005*"went" + 0.004*"theyre" + 0.004*"shit" + 0.004*"going"'),
 (2,
  '0.016*"fucking" + 0.009*"shit" + 0.008*"theyre" + 0.007*"went" + 0.006*"going" + 0.004*"girl" + 0.004*"look" + 0.004*"house" + 0.004*"white" + 0.004*"life"')]

In [23]:
from nltk import word_tokenize, pos_tag

In [24]:
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [25]:
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript,full_name
louis,introfade music let roll hold light light than...,Ali Wong
dave,dave tell dirty joke living stare hard work ha...,Anthony Jeselnik
ricky,hello hello great thank calm shut fuck thank l...,Bill Burr
bo,macdonald farm farm snort macdonald farm appla...,Bo Burnham
bill,cheer applause right thank thank much thank th...,Dave Chappelle
jim,horn honk audience cheering announcer lady gen...,Hasan Minhaj
john,armed boyish charm sharp former writer john mu...,Jim Jefferies
hasan,theme music orchestral hiphop crowd roar whats...,Joe Rogan
ali,lady gentleman please welcome stage wong hello...,John Mulaney
anthony,thank thank thank francisco thank much good pe...,Louis C.K.


In [26]:
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
louis,introfade music let roll light light thank tha...
dave,tell dirty joke stare hard work signifies trai...
ricky,hello great thank calm fuck thank welcome gon ...
bo,macdonald farm farm snort macdonald farm appla...
bill,cheer applause right thank much thank thank th...
jim,horn honk audience announcer lady gentleman pl...
john,boyish charm sharp former writer john mulaney ...
hasan,theme music orchestral hiphop crowd roar whats...
ali,lady gentleman please welcome stage wong hello...
anthony,thank thank thank much good people gon tape sp...


In [27]:
# Create a new document-term matrix using only nouns and adjectives
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'ahah', 'dude']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns and adjectives
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaah,ability,abject,able,ablebodied,abortion,absolute,...,youngest,youth,youtube,yummy,yyou,zealand,zeppelin,zillion,zombie,éclair
louis,0,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
dave,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,1,1,2,0,0,0,...,1,0,1,0,0,0,0,0,0,0
bo,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,2,0
jim,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
hasan,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ali,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,1,0,0,0,1,0
anthony,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,9,0,0,0,0


In [28]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [31]:
# Let's start with n topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.004*"rape" + 0.004*"dead" + 0.004*"anthony" + 0.004*"shark" + 0.003*"young" + 0.003*"grandma" + 0.003*"tweet" + 0.003*"older" + 0.003*"nut" + 0.003*"jenner"'),
 (1,
  '0.025*"voice" + 0.010*"laughter" + 0.008*"audience" + 0.005*"robotic" + 0.005*"repeat" + 0.005*"hasan" + 0.004*"gun" + 0.004*"american" + 0.004*"applause" + 0.004*"hate"'),
 (2,
  '0.004*"jenny" + 0.004*"wife" + 0.004*"wan" + 0.004*"parent" + 0.003*"sense" + 0.003*"idea" + 0.003*"stupid" + 0.003*"marriage" + 0.003*"sound" + 0.003*"sleep"')]

In [38]:
### 0 - age
### 1 - stage
### 2 - family

In [32]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(0, 'louis'),
 (0, 'dave'),
 (0, 'ricky'),
 (1, 'bo'),
 (2, 'bill'),
 (1, 'jim'),
 (2, 'john'),
 (1, 'hasan'),
 (2, 'ali'),
 (0, 'anthony'),
 (2, 'mike'),
 (2, 'joe')]