In [10]:
import time
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pyLDAvis.gensim
import itertools
from scipy.sparse import coo_matrix

In [2]:
# Download the NLTK 'wordnet' resource; quiet=True suppresses the download log.
# (Presumably required by WordNetLemmatizer, which LDAModel uses below.)
nltk.download('wordnet', quiet=True)
# Seed NumPy's global random generator with a fixed value.
np.random.seed(2018)
# Module-level Porter stemmer; LDAModel._lemmatize_stemming reads this global.
stemmer = PorterStemmer()

In [3]:
class PandasHelper:
    """Small convenience helpers for pandas objects."""

    @staticmethod
    def get_id_from_series(series):
        """Return the first index label of ``series`` converted to ``int``.

        Works for any object exposing a pandas ``.index`` (Series or
        DataFrame). Raises ``IndexError`` when the index is empty.
        """
        first_label = series.index[0]
        return int(first_label)

In [26]:
class LDAModel:
    """Train and persist a gensim LDA model plus a TF-IDF similarity matrix.

    The hyper-parameters are plain instance attributes set in ``__init__`` so
    they can be adjusted on the instance before ``trainModel`` is called.
    """

    def __init__(self):
        # Dictionary pruning thresholds forwarded to Dictionary.filter_extremes.
        self.no_below = 5
        self.no_above = 0.2
        # Settings forwarded to gensim's LdaModel.
        self.num_topics = 10
        self.num_of_iterations = 100
        self.passes = 2
        self.minimum_probability = 0.01
        # Created on first use so one lemmatizer is shared by every token
        # instead of constructing a new WordNetLemmatizer per token.
        self._lemmatizer = None

    def _lemmatize_stemming(self, text):
        """Lemmatize ``text`` with WordNet, then stem it with the module-level ``stemmer``."""
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return stemmer.stem(self._lemmatizer.lemmatize(text))

    def _preprocess(self, text):
        """Tokenize ``text`` and return the list of normalized unigrams.

        Tokens that are gensim stop words or have 3 characters or fewer are
        dropped; the remaining tokens are lemmatized and stemmed. Only
        unigrams are produced.
        """
        stopwords = parsing.preprocessing.STOPWORDS
        return [
            self._lemmatize_stemming(token)
            for token in gensim.utils.simple_preprocess(text)
            if token not in stopwords and len(token) > 3
        ]

    def _savePickleFile(self, fileName, data):
        """Pickle ``data`` to ``./<fileName>.pickle`` in the working directory."""
        # The context manager closes the handle even if pickle.dump raises.
        with open(f'./{fileName}.pickle', 'wb') as pickle_file:
            pickle.dump(data, pickle_file)

    def saveModel(self, lda, corpus, df):
        """Persist the training artifacts to the working directory.

        Writes the LDA model to ``./model`` (via ``lda.save``), ``corpus`` to
        ``./corpus.pickle`` and ``df`` to ``./df.pickle``.
        """
        # Save model output
        lda.save('./model')
        # Save corpus and the per-movie DataFrame
        self._savePickleFile('corpus', corpus)
        self._savePickleFile('df', df)

    def trainModel(self, data_path='../data/movies_data.json'):
        """Train the LDA model and build the document similarity matrix.

        Args:
            data_path: JSON file (pandas ``orient='split'``) that provides the
                ``content`` and ``id`` columns. Defaults to the path that was
                previously hard-coded.

        Returns:
            A tuple ``(lda, corpus, df_similarity_matrix)`` where ``lda`` is
            the trained ``LdaModel``, ``corpus`` is the bag-of-words corpus
            (not the TF-IDF weighted one the model was trained on) and
            ``df_similarity_matrix`` is a DataFrame of TF-IDF similarities
            whose rows and columns are both labelled with the ``id`` column.
        """
        data = pd.read_json(data_path, orient='split')
        documents = data['content']
        ids = data['id']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above)
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

        tf_idf = models.TfidfModel(corpus)
        corpus_tf_idf = tf_idf[corpus]

        lda = LdaModel(
            corpus_tf_idf,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=self.passes,
            iterations=self.num_of_iterations,
            minimum_probability=self.minimum_probability)

        index = gensim.similarities.MatrixSimilarity(corpus_tf_idf)
        coo = coo_matrix(index)
        similarity_matrix = np.zeros((len(ids), len(ids)))
        # Vectorised form of the per-entry loop: copy every stored entry and
        # cap values above 1 at exactly 1.
        similarity_matrix[coo.row, coo.col] = np.minimum(coo.data, 1)

        df_similarity_matrix = pd.DataFrame(similarity_matrix, index=ids, columns=ids)

        # Optional visualisation:
        # lda_display = pyLDAvis.gensim.prepare(lda, corpus_tf_idf, dictionary, sort_topics=False)
        # pyLDAvis.show(lda_display)

        print('Finished training LDA model...')

        return lda, corpus, df_similarity_matrix
    
# Train with the default hyper-parameters, then persist the artifacts.
lda_model = LDAModel()
lda, corpus, df = lda_model.trainModel()  # train an LDA model on the movies dataset; df is the similarity DataFrame
lda_model.saveModel(lda, corpus, df) # save the model, corpus and df for later use in recommendations

Start training LDA model...




            0         1         2         3         4         5         6     \
id                                                                             
6       1.000000  0.000000  0.002988  0.004544  0.000000  0.000000  0.008515   
4       0.000000  1.000000  0.003184  0.065896  0.000000  0.000000  0.085236   
1       0.002988  0.003184  1.000000  0.000000  0.000000  0.006623  0.005563   
10      0.004544  0.065896  0.000000  1.000000  0.000000  0.004847  0.010860   
9       0.000000  0.000000  0.000000  0.000000  1.000000  0.106605  0.000000   
2       0.000000  0.000000  0.006623  0.004847  0.106605  1.000000  0.010237   
7       0.008515  0.085236  0.005563  0.010860  0.000000  0.010237  1.000000   
5       0.028263  0.007246  0.005911  0.000000  0.032243  0.000000  0.023146   
3       0.040046  0.023752  0.000000  0.003382  0.000000  0.005555  0.008085   
8       0.000000  0.033567  0.005793  0.022331  0.000000  0.000000  0.009994   
11      0.000000  0.000000  0.000000  0.

In [None]:
class Recommender():
    """Recommend movies by nearest-neighbour search over a per-movie matrix.

    The artifacts are read from the working directory: the LDA model from
    ``./model`` and the pickled corpus and per-movie DataFrame.
    """

    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.corpus = self.loadPickleFile('./corpus')
        try:
            self.docs_topics = self.loadPickleFile('./docs_topics')
        except FileNotFoundError:
            # LDAModel.saveModel in this notebook stores the id-indexed
            # DataFrame as ./df.pickle, so fall back to that file when no
            # docs_topics.pickle exists.
            self.docs_topics = self.loadPickleFile('./df')
        self.num_of_recommendation = 10

    def loadPickleFile(self, fileName):
        """Load and return the object pickled at ``<fileName>.pickle``.

        NOTE: unpickling can execute arbitrary code; only load files that
        were produced by a trusted run of the training step.
        """
        # The context manager guarantees the handle is closed after loading.
        with open(f'{fileName}.pickle', 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def recommend(self, movie_id):
        """Return up to ``num_of_recommendation`` movies similar to ``movie_id``.

        Returns:
            ``None`` when ``movie_id`` is not in ``docs_topics``; otherwise a
            list of dicts with keys ``'id'``, ``'name'`` and ``'similarity'``
            (1 minus the cosine distance), nearest first.
        """
        # Check the id first so an unknown movie costs no file read or fit.
        movie_topics = self.get_movie_topics(movie_id)
        if movie_topics is None:
            return None

        data = pd.read_json('../data/movies_data.json', orient='split')
        start = time.time()

        model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        model_knn.fit(self.docs_topics)

        # One extra neighbour leaves room to discard the query movie itself;
        # never ask for more neighbours than there are fitted rows.
        n_neighbors = min(self.num_of_recommendation + 1, len(self.docs_topics))
        distances, indices = model_knn.kneighbors(movie_topics, n_neighbors=n_neighbors)

        end = time.time()
        print(f'Recommended in: {end - start} s')

        recommendations = []
        for position, distance in zip(indices.flatten(), distances.flatten()):
            neighbor_id = self.docs_topics.index[position]
            # Drop the query movie by id rather than by position: when another
            # movie ties at distance 0 the query is not guaranteed to be first.
            if neighbor_id == movie_id:
                continue
            neighbor_id = int(neighbor_id)
            recommendations.append({
                'id': neighbor_id,
                'name': data[data['id'] == neighbor_id].title.iloc[0],
                'similarity': float(1 - distance)
            })

        return recommendations[:self.num_of_recommendation]

    def get_movie_topics(self, movie_id):
        """Return the row for ``movie_id`` as a ``(1, n)`` array, or ``None`` if absent."""
        movie_row = self.docs_topics[self.docs_topics.index == movie_id]

        if movie_row.empty:
            return None

        return movie_row.values.reshape(1, -1)

In [None]:
# Load the saved LDA model and pickled artifacts from the working directory.
recommender = Recommender()

In [None]:
# Example query: recommendations for movie id 480 (None if the id is not in the loaded matrix).
recommender.recommend(480)