In [24]:
import time
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
from sklearn.neighbors import NearestNeighbors
import spacy
from spacy.lang.en import English
import pyLDAvis.gensim
import itertools

In [12]:
# spaCy English language object. No other cell shown in this notebook reads `parser`.
parser = English()
# Download the NLTK 'wordnet' resource quietly; presumably this is the data
# WordNetLemmatizer (used in LDAModel._lemmatize_stemming) needs -- TODO confirm.
nltk.download('wordnet', quiet=True)
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(2018)
# Shared stemmer instance used by LDAModel._lemmatize_stemming.
# PorterStemmer comes from the wildcard import of nltk.stem.porter.
stemmer = PorterStemmer()

In [3]:
class PandasHelper:
    """Utility helpers for extracting values from pandas objects."""

    @staticmethod
    def get_id_from_series(series):
        """Return the first index label of ``series`` converted to ``int``.

        Works for any pandas object exposing ``.index`` (a Series or a
        DataFrame row slice). Raises ``IndexError`` when the index is empty.
        """
        first_label = series.index[0]
        return int(first_label)

In [32]:
class LDAModel:
    """Train a gensim LDA topic model on the movie corpus and persist the results.

    Typical use::

        lda_model = LDAModel()
        lda, corpus, df_docs_topics = lda_model.trainModel()
        lda_model.saveModel(lda, corpus, df_docs_topics)
    """

    def __init__(self):
        # Num_topics in LDA
        self.num_topics = 20
        # Filter out tokens that appear in less than `no_below` documents
        self.no_below = 5
        # Number of iterations in training LDA model, the less the documents in total,
        # the more the iterations for LDA model to converge
        self.num_of_iterations = 1000
        # Number of passes handed to LdaModel during training
        self.passes = 5

    def _lemmatize_stemming(self, text):
        """Lemmatize a single token with WordNet, then stem it with the shared Porter stemmer."""
        return stemmer.stem(WordNetLemmatizer().lemmatize(text))

    def _preprocess(self, text):
        """Turn raw text into a list of unigram, bigram and trigram tokens.

        Tokens come from ``gensim.utils.simple_preprocess``; gensim stopwords and
        tokens of length <= 2 are dropped before lemmatizing/stemming. Bigrams and
        trigrams are built from the surviving unigrams and joined with ``'_'``.
        """
        unigrams = [
            self._lemmatize_stemming(token)
            for token in gensim.utils.simple_preprocess(text)
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2
        ]

        bi_grams = ['_'.join(b) for b in nltk.bigrams(unigrams)]
        tri_grams = ['_'.join(t) for t in nltk.trigrams(unigrams)]

        return list(itertools.chain(unigrams, bi_grams, tri_grams))

    def _savePickleFile(self, fileName, data):
        """Pickle ``data`` to ``./<fileName>.pickle`` in the current working directory."""
        # The context manager closes the handle even when pickle.dump raises.
        with open(f'./{fileName}.pickle', 'wb') as mappingFile:
            pickle.dump(data, mappingFile)

    def saveModel(self, lda, corpus, docs_topics):
        """Persist the trained model (``./model``), the corpus and the document/topic table."""
        # Save model output
        lda.save('./model')
        # Save corpus
        self._savePickleFile('corpus', corpus)
        self._savePickleFile('docs_topics', docs_topics)

    def trainModel(self, data_path='../data/movies_data.json', visualize=True):
        """Train the LDA model and return everything needed for recommendations.

        Args:
            data_path: JSON file (``orient='split'``) providing at least the
                ``content`` and ``id`` columns.
            visualize: when True (the default, matching the previous behaviour)
                the topics are served through ``pyLDAvis.show`` after training.
                That call blocks until the kernel is interrupted; pass False for
                unattended runs such as "Restart & Run All".

        Returns:
            ``(lda, corpus, df_docs_topics)`` where ``df_docs_topics`` is a
            DataFrame of per-document topic probabilities indexed by movie id.
        """
        data = pd.read_json(data_path, orient='split')
        documents = data['content']
        ids = data['id']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        dictionary.filter_extremes(no_below=self.no_below)
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

        lda = LdaModel(
            corpus,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=self.passes,
            alpha=[0.01] * self.num_topics,
            iterations=self.num_of_iterations,
            minimum_probability=0.0)

        # One row per document, one column per topic. This relies on
        # minimum_probability=0.0 so each document lists every topic; otherwise
        # the rows would be ragged -- TODO confirm for very long documents.
        docs_topics = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])
        df_docs_topics = pd.DataFrame(docs_topics, index=ids)

        print('Finished training LDA model...')

        if visualize:
            lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
            # Starts a local web server and blocks until the kernel is interrupted.
            # NOTE(review): the recorded output ("stopping Server...") suggests the
            # call returns normally after the interrupt -- confirm.
            pyLDAvis.show(lda_display)

        return lda, corpus, df_docs_topics
    
lda_model = LDAModel()
# Train an LDA model using the assigned corpora.
training_result = lda_model.trainModel()
# Persist the artefacts that the Recommender cell loads ('./model', corpus and
# docs_topics pickles) whenever trainModel() hands back
# (lda, corpus, df_docs_topics); a None result means there is nothing to save.
if training_result is not None:
    lda, corpus, df_docs_topics = training_result
    lda_model.saveModel(lda, corpus, df_docs_topics)  # save model for recommendations use

Start training LDA model...

Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [17/Dec/2018 19:04:40] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2018 19:04:40] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2018 19:04:40] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2018 19:04:40] code 404, message Not Found
127.0.0.1 - - [17/Dec/2018 19:04:40] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [17/Dec/2018 19:04:41] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


In [29]:
class Recommender():
    """Recommend movies whose LDA topic distribution is closest to a given movie.

    Loads the artefacts written by ``LDAModel.saveModel``: ``./model``,
    ``./corpus.pickle`` and ``./docs_topics.pickle``.
    """

    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.corpus = self.loadPickleFile('./corpus')
        self.docs_topics = self.loadPickleFile('./docs_topics')
        self.num_of_recommendation = 10
        # Movie metadata (id -> title) is read on the first recommend() call and reused.
        self.movies_path = '../data/movies_data.json'
        self._movies = None

    def loadPickleFile(self, fileName):
        """Unpickle and return the object stored in ``<fileName>.pickle``."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files produced by this project.
        with open(f'{fileName}.pickle', 'rb') as file:
            return pickle.load(file)

    def _get_movies(self):
        """Return the movie metadata frame, reading it from disk only once."""
        if self._movies is None:
            self._movies = pd.read_json(self.movies_path, orient='split')
        return self._movies

    def recommend(self, movie_id):
        """Return up to ``num_of_recommendation`` movies most similar to ``movie_id``.

        Similarity is ``1 - cosine distance`` between rows of ``docs_topics``.

        Returns:
            A list of ``{'id', 'name', 'similarity'}`` dicts ordered from most to
            least similar, or ``None`` when ``movie_id`` is not in ``docs_topics``.
        """
        # Bail out before doing any work when the movie is unknown.
        movie_topics = self.get_movie_topics(movie_id)
        if movie_topics is None:
            return None

        data = self._get_movies()
        start = time.time()

        model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        model_knn.fit(self.docs_topics)

        # Ask for one extra neighbour because the query movie is part of the
        # fitted data, but never for more neighbours than there are rows.
        n_neighbors = min(self.num_of_recommendation + 1, len(self.docs_topics))
        distances, indices = model_knn.kneighbors(movie_topics, n_neighbors=n_neighbors)
        similarities = 1 - distances.flatten()
        positions = indices.flatten()

        end = time.time()
        print(f'Recommended in: {end - start} s')

        recommendations = []
        for position, similarity in zip(positions, similarities):
            neighbour_id = PandasHelper.get_id_from_series(self.docs_topics.iloc[[position]])
            # Drop the query movie by id rather than assuming it is the first
            # neighbour: movies with identical topic vectors tie at distance 0.
            if neighbour_id == movie_id:
                continue
            recommendations.append({
                'id': neighbour_id,
                'name': data[data['id'] == neighbour_id].title.iloc[0],
                'similarity': float(similarity)
            })

        return recommendations[:self.num_of_recommendation]

    def get_movie_topics(self, movie_id):
        """Return the topic row for ``movie_id`` as a ``(1, n_columns)`` array, or ``None``.

        When several rows share the same index label only the first is used, so
        the result always has exactly one row.
        """
        movie_row = self.docs_topics[self.docs_topics.index == movie_id]

        if movie_row.empty:
            return None

        return movie_row.iloc[[0]].values.reshape(1, -1)

In [30]:
# Build the recommender; its constructor loads './model', './corpus.pickle'
# and './docs_topics.pickle', so those files must already exist.
recommender = Recommender()

In [31]:
# Recommendations for movie id 480; as the cell's last expression, the
# returned list is what the notebook displays below.
recommender.recommend(480)

Recommended in: 0.004979610443115234 s


[{'id': 101106, 'name': 'Sound City', 'similarity': 0.9524266719818115},
 {'id': 2237, 'name': 'Without Limits', 'similarity': 0.9186468720436096},
 {'id': 720,
  'name': 'Wallace & Gromit: The Best of Aardman Animation',
  'similarity': 0.9042388796806335},
 {'id': 98122,
  'name': 'Indie Game: The Movie',
  'similarity': 0.8975331783294678},
 {'id': 5864, 'name': 'Tarzan the Ape Man', 'similarity': 0.8922789096832275},
 {'id': 2363, 'name': 'Godzilla', 'similarity': 0.8891057968139648},
 {'id': 1542, 'name': 'Brassed Off', 'similarity': 0.8884487152099609},
 {'id': 2045, 'name': 'A Far Off Place', 'similarity': 0.88751620054245},
 {'id': 39779,
  'name': 'Tarzan and His Mate',
  'similarity': 0.8810970783233643},
 {'id': 2295, 'name': 'The Impostors', 'similarity': 0.8727961182594299}]