In [1]:
import string
from os.path import join 
import os
import pickle

import pandas as pd

DATA = 'transcripts'

from nltk import word_tokenize, pos_tag


def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'S' or  pos[:2] == 'A'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized, lang='rus') if is_noun(pos)] 
    return ' '.join(all_nouns)


# Clean text to keep only nouns and adjectives
data_clean = pd.read_pickle(join(DATA,"clean.pkl"))
data_nouns = pd.DataFrame(data_clean.text.apply(nouns_adj))


In [2]:
## Rebuild Document-Term Matrix
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

russian_stopwords = stopwords.words("russian")


cv = CountVectorizer(stop_words=russian_stopwords)
data_cv = cv.fit_transform(data_clean.text)

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index

from gensim import matutils, models
import scipy.sparse

# We're going to put the term-document matrix into a new gensim format, 
# from df --> sparse matrix --> gensim corpus
corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm.transpose()))

# Create the vocabulary dictionary
id2word = dict((v, k) for k, v in cv.vocabulary_.items())



# Let's try 4 topics
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=80)
print('Topics = ')
lda.print_topics()

Topics = 


[(0,
  '0.007*"становиться" + 0.007*"который" + 0.007*"человек" + 0.007*"наука" + 0.006*"общество" + 0.006*"это" + 0.006*"весь" + 0.005*"голова" + 0.004*"мир" + 0.004*"свой"'),
 (1,
  '0.000*"мера" + 0.000*"общественный" + 0.000*"группа" + 0.000*"результат" + 0.000*"большинство" + 0.000*"приговор" + 0.000*"допустимый" + 0.000*"выступать" + 0.000*"девочка" + 0.000*"заявлять"'),
 (2,
  '0.008*"год" + 0.007*"тайга" + 0.007*"это" + 0.005*"процент" + 0.005*"становиться" + 0.005*"который" + 0.005*"день" + 0.005*"удэгеец" + 0.004*"человек" + 0.004*"красный"'),
 (3,
  '0.007*"это" + 0.007*"весь" + 0.006*"юрий" + 0.005*"перемена" + 0.005*"дело" + 0.005*"реконструкция" + 0.005*"человек" + 0.004*"который" + 0.004*"свой" + 0.003*"суд"')]

In [3]:



#Let's take a look at which topics each transcript contains
corpus_transformed = lda[corpus]

top_list = list(zip([a for [(a,b)] in corpus_transformed], data_dtm.index))

print('Topics by authors = ')
top_list

Topics by authors = 


[(3, 'Bowt'),
 (3, 'Mlshtn'),
 (2, 'Kuval'),
 (0, 'Dvdv'),
 (3, 'Prav'),
 (2, 'Mrz'),
 (3, 'Znam'),
 (2, 'Mikol'),
 (0, 'Inoz'),
 (0, 'Mashk')]