In [None]:
import pandas as pd
import numpy as np

import nltk
import gensim
from gensim import corpora, models
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pickle

%env JOBLIB_TEMP_FOLDER=/tmp

In [None]:
# Load the cleaned book catalogue.
books = pd.read_csv('books_cleaned_v4.csv')

# Keep the rows up to and including index label 49999 (`.loc` slices are
# label-based and end-inclusive, i.e. 50000 rows with the default integer
# index produced by read_csv) and retain only the description text,
# renumbering the index from 0.
description = books.loc[:49999, ['description']].reset_index(drop=True)

# Training corpus: the description column as a Series of raw strings.
train = description['description']
train

### Train LDA Model

In [None]:
# Bag-of-words vectorizer; strip_accents='unicode' normalises accented
# characters before tokenisation.
vectorizer = CountVectorizer(strip_accents='unicode')

# Learn the vocabulary from the training descriptions and build the sparse
# document-term count matrix in one pass.
train_vectorized = vectorizer.fit_transform(train)
print(train_vectorized.shape)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() (which returns an ndarray,
# converted here so `feature_names` stays a plain list) and fall back to the
# legacy method only on old releases that lack the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Build and fit the LDA topic model.
num_topics = 2000

# Hyper-parameters collected in one mapping so they can be read (and tuned)
# at a glance before the estimator is constructed.
lda_params = dict(
    n_components=num_topics,   # number of topics
    max_iter=10,               # maximum number of learning iterations
    learning_method='batch',   # update using the full training set each iteration
    random_state=0,            # fixed seed
    batch_size=1024,           # docs per EM iteration; per the scikit-learn docs this
                               # is only used by learning_method='online'
    evaluate_every=-1,         # <= 0: never evaluate perplexity during fit
    n_jobs=-1,                 # use all available CPUs
)
lda_model = LatentDirichletAllocation(**lda_params)

# Fit on the document-term counts and keep the per-document topic
# distribution returned for the training set.
lda_output = lda_model.fit_transform(train_vectorized)

print(lda_model)  # Model attributes

In [None]:
# Topic-keyword matrix: one row per topic (lda_model.components_), one column
# per vocabulary term of the vectorizer.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the legacy method only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

df_topic_keywords = pd.DataFrame(lda_model.components_)
df_topic_keywords.columns = vocabulary
df_topic_keywords.to_csv('df_topic_keywords.csv')

# Top terms in each topic: for every topic row, write a "Topic <id>:" header
# followed by the 30 highest-weighted terms on a single line.
# An explicit UTF-8 encoding keeps the export from failing on platforms whose
# default locale encoding cannot represent non-ASCII tokens.
with open('topicwords.txt', 'w', encoding='utf-8') as f:
    for topic_id, weights in df_topic_keywords.iterrows():
        top_terms = weights.sort_values(ascending=False).index[:30]
        print('Topic %s:' % topic_id, file=f)
        print(" ".join(top_terms), file=f)

In [None]:
# Document-topic matrix: one row per training document, one column per topic.
#
# `lda_output` (returned by lda_model.fit_transform(train_vectorized) when the
# model was fitted) already holds the topic distribution of every training
# document, so it is reused here instead of calling
# lda_model.transform(train_vectorized) again, which would repeat the full
# inference pass and allocate a second copy of the matrix.
doc_topic_matrix = lda_output
df_document_topic = pd.DataFrame(doc_topic_matrix)
df_document_topic.to_csv('df_document_topic.csv')

In [None]:
# Export the fitted model and vectorizer for later use.
# The files are opened in context managers so each handle is flushed and
# closed as soon as its pickle has been written (the bare open(...) calls
# previously left both handles open).
# NOTE: pickle files must only be loaded from trusted sources.
with open('lda_model.pk', 'wb') as model_file:
    pickle.dump(lda_model, model_file)

with open('vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)