In [1]:
import time
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pyLDAvis.gensim

In [3]:
# Fetch the WordNet data for the WordNetLemmatizer used during preprocessing
# (quiet=True keeps the download log out of the cell output).
nltk.download('wordnet', quiet=True)
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(2018)
# Module-level Porter stemmer (PorterStemmer arrives via the wildcard import from
# nltk.stem.porter); LDAModel._lemmatize_stemming reads this global name.
stemmer = PorterStemmer()

In [4]:
class PandasHelper:
    """Small pandas conveniences shared by the notebook."""

    @staticmethod
    def get_id_from_series(series):
        """Return the first index label of ``series`` converted to ``int``.

        Works for any pandas object exposing ``.index`` (a Series or a
        DataFrame slice). Raises ``IndexError`` when the index is empty.
        """
        first_label = series.index[0]
        return int(first_label)

In [17]:
class LDAModel:
    """Train a gensim LDA topic model on movie descriptions and persist the results.

    ``trainModel`` reads the movie data, preprocesses the ``content`` column,
    fits a TF-IDF weighted LDA model and returns the per-document topic
    distribution. ``saveModel`` writes the model and the pickled artifacts to
    the current working directory.
    """

    def __init__(self):
        # Number of latent topics the LDA model learns.
        self.num_topics = 30
        # Dictionary pruning: drop tokens that occur in fewer than `no_below`
        # documents ...
        self.no_below = 5
        # ... or in more than this fraction of all documents.
        self.no_above = 0.2
        # Passed to LdaModel as `iterations`.
        self.num_of_iterations = 1000
        # Passed to LdaModel as `passes`.
        self.passes = 5

    def _lemmatize_stemming(self, text):
        """Lemmatize a single token with WordNet, then stem it with the shared Porter stemmer."""
        return stemmer.stem(WordNetLemmatizer().lemmatize(text))

    def _preprocess(self, text):
        """Turn raw text into a list of normalised unigrams.

        Tokens come from ``gensim.utils.simple_preprocess``; gensim stopwords
        and tokens of three characters or fewer are dropped, and the rest are
        lemmatized and stemmed.
        """
        stopwords = gensim.parsing.preprocessing.STOPWORDS
        return [
            self._lemmatize_stemming(token)
            for token in gensim.utils.simple_preprocess(text)
            if token not in stopwords and len(token) > 3
        ]

    def _savePickleFile(self, fileName, data):
        """Pickle ``data`` to ``./<fileName>.pickle``.

        The file is opened in a ``with`` block so the handle is closed even if
        pickling raises.
        """
        fileName = f'./{fileName}.pickle'
        with open(fileName, 'wb') as mappingFile:
            pickle.dump(data, mappingFile)

    def _dense_topic_matrix(self, docs_topic_pairs):
        """Build a dense ``(n_docs, num_topics)`` float32 matrix of topic weights.

        ``docs_topic_pairs`` is an iterable with one list of
        ``(topic_id, probability)`` pairs per document. Each probability is
        written to the column given by its topic id; topics missing from a
        document's list stay at 0. gensim never emits probabilities below 1e-8
        in this sparse form, so relying on list position instead could yield
        ragged or misaligned rows.
        """
        rows = []
        for pairs in docs_topic_pairs:
            # float32 matches the default dtype gensim's LdaModel computes in.
            row = np.zeros(self.num_topics, dtype=np.float32)
            for topic_id, probability in pairs:
                row[topic_id] = probability
            rows.append(row)

        if not rows:
            return np.zeros((0, self.num_topics), dtype=np.float32)
        return np.vstack(rows)

    def saveModel(self, lda, corpus, docs_topics):
        """Persist the trained model (``./model``) plus the pickled corpus and doc-topic table."""
        # Save model output
        lda.save('./model')
        # Save corpus
        self._savePickleFile('corpus', corpus)
        self._savePickleFile('docs_topics', docs_topics)

    def trainModel(self, data_path='../data/movies_data.json'):
        """Fit the LDA model on the movie data and compute per-document topics.

        Parameters
        ----------
        data_path : str, optional
            JSON file (``orient='split'``) that provides at least the ``id``
            and ``content`` columns. Defaults to the path the notebook used
            originally.

        Returns
        -------
        tuple
            ``(lda, corpus, df_docs_topics)``: the trained ``LdaModel``, the
            bag-of-words corpus (before TF-IDF weighting), and a DataFrame of
            topic weights with one row per movie, indexed by movie id.
        """
        data = pd.read_json(data_path, orient='split')
        documents = data['content']
        ids = data['id']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above)
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

        # The model is trained on TF-IDF weights rather than raw counts.
        tf_idf = models.TfidfModel(corpus)
        corpus_tf_idf = tf_idf[corpus]

        lda = LdaModel(
            corpus_tf_idf,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=self.passes,
            iterations=self.num_of_iterations,
            minimum_probability=0.0)

        # lda_display = pyLDAvis.gensim.prepare(lda, corpus_tf_idf, dictionary, sort_topics=False)
        # pyLDAvis.show(lda_display)

        # Place each weight in its topic's column instead of trusting list order/length.
        docs_topics = self._dense_topic_matrix(lda[corpus_tf_idf])
        df_docs_topics = pd.DataFrame(docs_topics, index=ids)

        print('Finished training LDA model...')

        return lda, corpus, df_docs_topics
    
lda_model = LDAModel()
lda, corpus, df_docs_topics = lda_model.trainModel()  # train an LDA model on the movie corpus
lda_model.saveModel(lda, corpus, df_docs_topics) # save the model, corpus and doc-topic table for the Recommender to load

Start training LDA model...
Finished training LDA model...


In [18]:
class Recommender:
    """Recommend movies whose LDA topic distribution is closest to a given movie.

    On construction the trained model (``./model``) and the pickled corpus and
    doc-topic table (``./corpus.pickle``, ``./docs_topics.pickle``) are loaded
    from the current working directory.
    """

    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.corpus = self.loadPickleFile('./corpus')
        # DataFrame of topic weights, one row per movie, indexed by movie id.
        self.docs_topics = self.loadPickleFile('./docs_topics')
        # Maximum number of recommendations returned by `recommend`.
        self.num_of_recommendation = 10

    def loadPickleFile(self, fileName):
        """Unpickle and return the object stored at ``<fileName>.pickle``.

        The file is opened in a ``with`` block so the handle is always closed.
        """
        # NOTE: pickle.load can execute arbitrary code; only load files this
        # notebook produced itself, never untrusted ones.
        with open(f'{fileName}.pickle', 'rb') as file:
            return pickle.load(file)

    def recommend(self, movie_id, data_path='../data/movies_data.json'):
        """Return the movies most similar to ``movie_id``.

        Parameters
        ----------
        movie_id
            Index label of the query movie in ``self.docs_topics``.
        data_path : str, optional
            JSON file (``orient='split'``) with ``id`` and ``title`` columns,
            used to look up the names of the recommended movies.

        Returns
        -------
        list of dict or None
            Up to ``self.num_of_recommendation`` dicts with keys ``'id'``,
            ``'name'`` and ``'similarity'`` (1 - cosine distance), nearest
            first. ``None`` when ``movie_id`` has no topic row.
        """
        # Bail out before any file I/O or model fitting when the id is unknown.
        movie_topics = self.get_movie_topics(movie_id)
        if movie_topics is None:
            return None

        data = pd.read_json(data_path, orient='split')
        start = time.time()

        model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        model_knn.fit(self.docs_topics)

        # One extra neighbour because the query movie normally matches itself;
        # clamp so small datasets do not ask for more rows than were fitted.
        n_neighbors = min(self.num_of_recommendation + 1, len(self.docs_topics))
        distances, indices = model_knn.kneighbors(movie_topics, n_neighbors=n_neighbors)
        similarities = 1 - distances.flatten()
        indices = indices.flatten()

        # Drop the query movie by label instead of assuming it is the first
        # hit: with tied distances it can appear at any position.
        labels = self.docs_topics.index
        kept = [pos for pos, row in enumerate(indices) if labels[row] != movie_id]
        kept = kept[:self.num_of_recommendation]

        end = time.time()
        print(f'Recommended in: {end - start} s')

        recommendations = []
        for pos in kept:
            recommended_id = int(labels[indices[pos]])
            recommendations.append({
                'id': recommended_id,
                'name': data[data['id'] == recommended_id].title.iloc[0],
                'similarity': float(similarities[pos]),
            })
        return recommendations

    def get_movie_topics(self, movie_id):
        """Return the topic vector of ``movie_id`` as a ``(1, n_topics)`` array.

        Returns ``None`` when the id is not in ``self.docs_topics``. If the id
        occurs more than once, the first matching row is used so the result
        always has exactly one row.
        """
        movie_row = self.docs_topics[self.docs_topics.index == movie_id]

        if movie_row.empty:
            return None

        return movie_row.values[:1].reshape(1, -1)

In [19]:
# Constructing Recommender loads './model', './corpus.pickle' and
# './docs_topics.pickle' from the current working directory.
recommender = Recommender()

In [20]:
# Ask for the movies most similar to movie id 480; recommend() returns None
# when that id has no row in the doc-topic table.
recommender.recommend(480)

Recommended in: 0.004989147186279297 s


[{'id': 7980, 'name': 'A Bridge Too Far', 'similarity': 0.9765293598175049},
 {'id': 2, 'name': 'Jumanji', 'similarity': 0.9691300988197327},
 {'id': 5099, 'name': 'Heidi', 'similarity': 0.9654021859169006},
 {'id': 5880, 'name': 'Extreme Ops', 'similarity': 0.9634126424789429},
 {'id': 325, 'name': 'Senior Trip', 'similarity': 0.9623314142227173},
 {'id': 8665,
  'name': 'The Bourne Supremacy',
  'similarity': 0.9617115259170532},
 {'id': 112852,
  'name': 'Guardians of the Galaxy',
  'similarity': 0.9607579708099365},
 {'id': 5932, 'name': 'Burden of Dreams', 'similarity': 0.9605057239532471},
 {'id': 1050, 'name': 'Looking for Richard', 'similarity': 0.9604430794715881},
 {'id': 2827,
  'name': "The Astronaut's Wife",
  'similarity': 0.9597653150558472}]