### Latent Dirichlet Allocation (LDA)

In [3]:
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop_words.pkl')
data

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
hasan,0,0,0,0,0,0,0,0,0,0,...,2,1,0,1,0,0,0,0,0,0
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from gensim import matutils, models
import scipy.sparse

In [5]:
term_doc_matrix = data.transpose()
term_doc_matrix

Unnamed: 0,louis,dave,ricky,bo,bill,jim,john,hasan,ali,anthony,mike,joe
aaaaah,0,0,0,0,1,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
zombie,0,0,0,0,1,0,0,0,1,0,0,0
zombies,0,0,0,0,1,0,0,0,0,0,0,0
zoning,0,0,0,0,1,0,0,0,0,0,0,0
zoo,0,0,1,0,0,0,0,0,0,0,0,0


In [6]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(term_doc_matrix)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [15]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open('cv_stop_words.pkl', "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [33]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.013*"voice" + 0.007*"shit" + 0.006*"guy" + 0.005*"fucking" + 0.005*"fuck" + 0.005*"man" + 0.004*"love" + 0.004*"want" + 0.004*"good" + 0.004*"didnt"'),
 (1,
  '0.007*"fucking" + 0.007*"went" + 0.006*"going" + 0.006*"say" + 0.005*"day" + 0.004*"little" + 0.004*"theyre" + 0.004*"want" + 0.004*"dad" + 0.004*"hes"'),
 (2,
  '0.008*"fucking" + 0.007*"shit" + 0.006*"fuck" + 0.006*"theyre" + 0.006*"cause" + 0.005*"say" + 0.005*"theres" + 0.005*"thing" + 0.005*"life" + 0.005*"didnt"')]

In [18]:
from nltk import word_tokenize, pos_tag

In [19]:
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [20]:
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
louis,introfade music lets roll hold lights lights t...
dave,dave tells dirty jokes living stare hard work ...
ricky,hello hello great thank wow calm shut fuck tha...
bo,old macdonald farm farm pig snort old macdonal...
bill,cheers applause right thank thank much thank t...
jim,car horn honks audience cheering announcer lad...
john,armed boyish charm sharp wit former snl writer...
hasan,theme music orchestral hiphop crowd roars what...
ali,ladies gentlemen please welcome stage ali wong...
anthony,thank thank thank san francisco thank much goo...


In [21]:
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
louis,introfade music lights lights thank much appre...
dave,dave tells dirty jokes stare hard work signifi...
ricky,hello great thank wow calm fuck thank welcome ...
bo,old macdonald farm farm pig snort old macdonal...
bill,cheers right thank much thank thank thank what...
jim,car horn honks audience announcer ladies welco...
john,boyish charm sharp wit former snl writer john ...
hasan,theme music orchestral hiphop crowd roars davi...
ali,ladies gentlemen welcome stage ali wong hello ...
anthony,thank thank thank san francisco thank good peo...


In [42]:
# Create a new document-term matrix using only nouns and adjectives
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'ahah', 'dude']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns and adjectives
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aah,abc,ability,abject,able,ablebodied,abortion,...,yummy,yyou,zealand,zee,zen,zeppelin,zillion,zombie,zombies,éclair
louis,0,0,0,3,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
dave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,1,1,2,0,0,...,1,0,0,0,0,0,0,0,0,0
bo,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,1,1,0
jim,0,0,0,0,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,1
hasan,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
ali,0,0,0,0,1,0,0,2,0,0,...,0,1,0,0,0,0,0,1,0,0
anthony,0,0,0,0,0,0,0,0,0,2,...,0,0,9,0,0,0,0,0,0,0


In [43]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [44]:
# Let's start with n topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.005*"mom" + 0.004*"parents" + 0.004*"laughter" + 0.004*"audience" + 0.004*"gun" + 0.003*"dog" + 0.003*"wife" + 0.003*"hasan" + 0.003*"guns" + 0.003*"ass"'),
 (1,
  '0.007*"joke" + 0.004*"jokes" + 0.003*"anthony" + 0.003*"fucking" + 0.003*"dead" + 0.003*"mad" + 0.003*"bruce" + 0.003*"jenner" + 0.003*"door" + 0.002*"gay"'),
 (2,
  '0.024*"voice" + 0.007*"bro" + 0.005*"robotic" + 0.005*"repeat" + 0.005*"jenny" + 0.004*"eye" + 0.004*"andy" + 0.003*"contact" + 0.003*"wan" + 0.003*"sad"')]

In [38]:
### 0 - spouse
### 1 - husbands
### 2 - 

In [45]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(0, 'louis'),
 (1, 'dave'),
 (1, 'ricky'),
 (2, 'bo'),
 (0, 'bill'),
 (0, 'jim'),
 (0, 'john'),
 (0, 'hasan'),
 (2, 'ali'),
 (1, 'anthony'),
 (2, 'mike'),
 (1, 'joe')]