In [1]:
import time
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
nltk.download('wordnet', quiet=True)
np.random.seed(2018)
stemmer = PorterStemmer()

In [3]:
class LDAModel:
    def __init__(self):
        # Min length of document
        self.min_length = 200
        # Num_topics in LDA
        self.num_topics = 90
        # Filter out tokens that appear in less than `no_below` documents (absolute number)
        self.no_below = 50
        # Filter out tokens that appear in more than `no_above` documents (fraction of total corpus size, *not* absolute number).
        self.no_above = 0.2
        # Number of iterations in training LDA model, the less the documents in total, the more the iterations for LDA model to converge
        self.num_of_iterations = 1000

    def _lemmatize_stemming(self, text):
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    
    def _preprocess(self, text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(self._lemmatize_stemming(token))
        return result
    
    def _savePickleFile(self, fileName, data):
        fileName = f'./{fileName}.pickle'
        mappingFile = open(fileName, 'wb')
        pickle.dump(data, mappingFile)
        mappingFile.close()

    def saveModel(self, lda, corpus, docs_topics):
        # Save model output
        lda.save('./model')
        # Save corpus
        self._savePickleFile('corpus', corpus)
        self._savePickleFile('docs_topics', docs_topics)

    def trainModel(self):
        data = pd.read_json('../data/movies_data.json', orient='split')
        documents = data['content']
        ids = data['id']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        dictionary.filter_extremes(no_below = self.no_below, no_above = self.no_above, keep_n = self.num_of_iterations)
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
        
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=self.num_topics, id2word=dictionary, passes=2, minimum_probability=0.0)
        docs_topics = np.array([[tup[1] for tup in lst] for lst in lda[corpus_tfidf]])
        df_docs_topics = pd.DataFrame(docs_topics, index = ids)
        
        print('Finished training LDA model...')
        
        return lda, corpus_tfidf, df_docs_topics
    
lda_model = LDAModel();
lda, corpus_tfidf, df_docs_topics = lda_model.trainModel()  # train a LDA model using the assgined corpora
lda_model.saveModel(lda, corpus_tfidf, df_docs_topics) # save model for recommendations use

Start training LDA model...
Finished training LDA model...


In [36]:
class Recommender():
    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.corpus = self.loadPickleFile('./corpus')
        self.docs_topics = self.loadPickleFile('./docs_topics')
        self.num_of_recommendation = 10
        
    def loadPickleFile(self, fileName):
        file = open(f'{fileName}.pickle','rb')
        object_file = pickle.load(file)
        return object_file
    
    def recommend(self, movie_id):
        start = time.time()
        
        similarities=[]
        indices=[]
        model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute') 
        model_knn.fit(self.docs_topics)

        distances, indices = model_knn.kneighbors(self.docs_topics.iloc[movie_id, :].values.reshape(1, -1), n_neighbors = self.num_of_recommendation + 1)
        similarities = 1 - distances.flatten()
        similarities = similarities[1:]
        indices = indices.flatten()
        indices = indices[1:]
        
        end = time.time()
        print(f'Finished in: {end - start}')
        return [{'id': self.docs_topics.iloc[[indices[index]]].index.tolist()[0], 'similarity': line} for index, line in enumerate(similarities)]

In [37]:
recommender = Recommender()

In [38]:
recommender.recommend(480)

Finished in: 0.011993408203125


[{'id': 4600, 'similarity': 0.71441936},
 {'id': 6629, 'similarity': 0.70688367},
 {'id': 92, 'similarity': 0.6797502},
 {'id': 7308, 'similarity': 0.6659268},
 {'id': 1278, 'similarity': 0.65155226},
 {'id': 2506, 'similarity': 0.65123165},
 {'id': 56176, 'similarity': 0.641805},
 {'id': 25753, 'similarity': 0.63836807},
 {'id': 2752, 'similarity': 0.63454366},
 {'id': 98908, 'similarity': 0.6334235}]