In [14]:
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np

In [2]:
# Seed NumPy's global random generator so runs that draw from it are repeatable.
RANDOM_SEED = 2018
np.random.seed(RANDOM_SEED)

# Fetch the WordNet corpus quietly; WordNetLemmatizer relies on it.
nltk.download('wordnet', quiet=True)

# Single shared Porter stemmer, reused by the token normalisation code below.
stemmer = PorterStemmer()

In [20]:
class LDAModel:
    """Train a gensim LDA topic model on a text column and persist the result.

    The pipeline is: read the JSON dataset, tokenise / lemmatise / stem the
    ``content`` column, build a filtered dictionary, convert the documents to
    a TF-IDF weighted bag-of-words corpus and fit ``LdaModel`` on it.

    All tunables live on the instance so they can be changed after
    construction and before calling :meth:`trainModel`.
    """

    def __init__(self):
        # Min length of document.
        # NOTE(review): this value is not applied anywhere in this class.
        self.min_length = 200
        # Num_topics in LDA
        self.num_topics = 90
        # Filter out tokens that appear in less than `no_below` documents (absolute number)
        self.no_below = 50
        # Filter out tokens that appear in more than `no_above` documents
        # (fraction of total corpus size, *not* absolute number).
        self.no_above = 0.2
        # Maximum vocabulary size kept by `Dictionary.filter_extremes`.
        # 1000 preserves the cap that was effectively in use before, when the
        # iteration count was (mistakenly) passed as `keep_n`.
        self.keep_n = 1000
        # Number of iterations in training LDA model, the less the documents
        # in total, the more the iterations for LDA model to converge
        self.num_of_iterations = 1000
        # Number of passes in the model
        self.passes = 3

    def _lemmatize_stemming(self, text):
        """Lemmatise ``text`` as a verb, then stem it with the shared Porter stemmer."""
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def _preprocess(self, text):
        """Tokenise ``text`` and return the normalised tokens.

        Tokens that are gensim stop words or have 3 or fewer characters are
        dropped; the remaining ones are lemmatised and stemmed.
        """
        stop_words = gensim.parsing.preprocessing.STOPWORDS
        return [
            self._lemmatize_stemming(token)
            for token in gensim.utils.simple_preprocess(text)
            if token not in stop_words and len(token) > 3
        ]

    def _savePickleFile(self, fileName, objectName):
        """Pickle ``objectName`` to ``./<fileName>.pickle`` in the working directory."""
        fileName = f'./{fileName}.pickle'
        # The context manager guarantees the handle is closed even if
        # pickling raises part-way through.
        with open(fileName, 'wb') as mappingFile:
            pickle.dump(objectName, mappingFile)

    def saveModel(self, lda, corpus):
        """Persist the trained model to ``./model`` and the corpus to ``./corpus.pickle``."""
        # Save model output
        lda.save('./model')
        # Save corpus
        self._savePickleFile('corpus', corpus)

    def trainModel(self, data_path='../data/movies_data.json'):
        """Fit the LDA model and return ``(lda, corpus_tfidf)``.

        Parameters
        ----------
        data_path : str, optional
            Path of the JSON file (``orient='split'``) holding a ``content``
            column. Defaults to the path previously hard-coded here.

        Returns
        -------
        tuple
            The fitted ``LdaModel`` and the TF-IDF transformed corpus it was
            trained on.
        """
        data = pd.read_json(data_path, orient='split')
        documents = data['content']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        # `keep_n` is a vocabulary-size cap; it is unrelated to the number of
        # training iterations, so it has its own setting.
        dictionary.filter_extremes(
            no_below=self.no_below,
            no_above=self.no_above,
            keep_n=self.keep_n,
        )
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        # Use the configured passes / iterations rather than literals, so the
        # values set in __init__ actually drive the training run.
        lda = gensim.models.ldamodel.LdaModel(
            corpus_tfidf,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=self.passes,
            iterations=self.num_of_iterations,
            minimum_probability=0.0,
        )

        print('Finished training LDA model...')

        return lda, corpus_tfidf
    
# Build the trainer, fit an LDA model on the assigned corpus, then save the
# model and corpus to disk for later use by the recommendation step.
lda_model = LDAModel()
lda, corpus_tfidf = lda_model.trainModel()
lda_model.saveModel(lda, corpus_tfidf)

Start training LDA model...
Finished training LDA model...


In [16]:
class Recommender():
    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.num_of_recommendation = 10