In [1]:
import time
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, parsing
from gensim.models import LdaModel
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pyLDAvis.gensim
import itertools
from scipy.sparse import coo_matrix

In [2]:
# Fetch the WordNet corpus used by WordNetLemmatizer (silently; no-op if already present).
nltk.download('wordnet', quiet=True)
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(2018)
# Shared Porter stemmer instance; read as a module-level global by
# LDAModel._lemmatize_stemming.
stemmer = PorterStemmer()

In [3]:
class PandasHelper:
    """Small pandas convenience helpers used by the notebook."""

    @staticmethod
    def get_id_from_series(series):
        """Return the first index label of ``series`` converted to ``int``.

        Works for any pandas object exposing ``.index`` (a Series or a
        DataFrame slice).  Raises ``IndexError`` when the index is empty.
        """
        index_labels = series.index.tolist()
        first_label = index_labels[0]
        return int(first_label)

In [33]:
class LDAModel:
    """Train an LDA topic model on movie descriptions and derive similarities.

    Typical flow: ``train_model`` -> ``get_topics`` / ``get_similarities`` ->
    ``save_model`` / ``save_similarities``.  Token normalisation relies on the
    module-level ``stemmer`` created earlier in the notebook.
    """

    def __init__(self):
        # Dictionary pruning thresholds passed to Dictionary.filter_extremes.
        self.no_below = 5
        self.no_above = 0.2
        # Hyper-parameters forwarded to gensim's LdaModel.
        self.num_topics = 10
        self.num_of_iterations = 100
        self.passes = 3
        self.minimum_probability = 0.01
        # Number of nearest neighbours kept per document by get_similarities.
        self.num_similarities = 30

    def _lemmatize_stemming(self, text):
        """Lemmatize a single token with WordNet, then Porter-stem the result."""
        return stemmer.stem(WordNetLemmatizer().lemmatize(text))

    def _preprocess(self, text):
        """Tokenize ``text`` and return the normalised unigrams.

        Tokens that are gensim stop words or have three characters or fewer
        are dropped; the rest are lemmatized and stemmed.
        """
        stopwords = parsing.preprocessing.STOPWORDS
        return [
            self._lemmatize_stemming(token)
            for token in gensim.utils.simple_preprocess(text)
            if token not in stopwords and len(token) > 3
        ]

    def _savePickleFile(self, fileName, data):
        """Pickle ``data`` to ``./<fileName>.pickle`` in the working directory."""
        fileName = f'./{fileName}.pickle'
        # Context manager guarantees the handle is closed even if dump() raises.
        with open(fileName, 'wb') as mappingFile:
            pickle.dump(data, mappingFile)

    def save_model(self, lda, topics):
        """Persist the trained model to ``./model`` and ``topics`` to ``./topics.pickle``."""
        lda.save('./model')
        self._savePickleFile('topics', topics)

    def save_similarities(self, similarities):
        """Persist ``similarities`` (as returned by ``get_similarities``) to
        ``./similarities.pickle``."""
        self._savePickleFile('similarities', similarities)

    def get_similarities(self, index, ids):
        """Compute the nearest neighbours of every document.

        Parameters
        ----------
        index : object convertible by ``scipy.sparse.coo_matrix``
            Pairwise document similarities (``train_model`` returns a gensim
            ``MatrixSimilarity`` for this).
        ids : sequence or pandas Series
            Document ids, positionally aligned with the rows of ``index``.

        Returns
        -------
        list of dict
            One ``{'id': ..., 'similarities': [{'id': int, 'similarity': float}, ...]}``
            entry per document, with at most ``num_similarities`` neighbours.
            Returns ``[]`` when ``ids`` is empty.
        """
        # Positional id lookup table; replaces a dense DataFrame copy and a
        # per-neighbour ``iloc`` frame lookup.
        labels = pd.Index(ids).tolist()
        n_docs = len(labels)
        if n_docs == 0:
            return []

        coo = coo_matrix(index)
        similarity_matrix = np.zeros((n_docs, n_docs))

        print('Started getting LDA similarities')
        # Cap entries at 1 (values above 1 are presumably float rounding noise).
        similarity_matrix[coo.row, coo.col] = np.minimum(coo.data, 1)
        print('Created similarity matrix')

        model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        model_knn.fit(similarity_matrix)
        print('Created KNN model')

        # One extra neighbour is requested because the first hit is dropped
        # below (it is expected to be the query row itself).  Clamp so that
        # corpora smaller than num_similarities + 1 do not make sklearn raise.
        n_neighbors = min(self.num_similarities + 1, n_docs)

        sims = []
        for position, label in enumerate(labels):
            query_row = similarity_matrix[position].reshape(1, -1)
            distances, neighbours = model_knn.kneighbors(query_row, n_neighbors=n_neighbors)
            # Cosine distance -> similarity, skipping the first (self) match.
            scores = 1 - distances.flatten()[1:]
            neighbour_positions = neighbours.flatten()[1:]

            sims.append({
                'id': label,
                'similarities': [{
                    'id': int(labels[neighbour]),
                    'similarity': float(score)
                } for neighbour, score in zip(neighbour_positions, scores)]
            })
        print('Finished getting LDA similarities')

        return sims

    def get_topics(self, lda, corpus, ids):
        """Return a DataFrame of per-document topic probabilities.

        Rows are indexed by ``ids``; column ``k`` holds the probability of
        topic ``k``.  ``get_document_topics`` omits topics whose probability
        falls below the model's minimum, so each row is filled by topic id and
        omitted topics are recorded as ``0.0`` -- this keeps columns aligned
        with topic ids across documents.
        """
        num_topics = lda.num_topics
        rows = []
        for bow in corpus:
            row = [0.0] * num_topics
            for topic_id, probability in lda.get_document_topics(bow):
                row[topic_id] = float(probability)
            rows.append(row)

        return pd.DataFrame(rows, index=ids)

    def train_model(self, data_path='../data/movies_data.json'):
        """Train the LDA model on the documents stored at ``data_path``.

        The file is read with ``pd.read_json(..., orient='split')`` and must
        provide ``content`` (text) and ``id`` columns.

        Returns
        -------
        tuple
            ``(lda, corpus_tf_idf, index, ids)``: the fitted ``LdaModel``, the
            TF-IDF weighted corpus, a ``MatrixSimilarity`` index over that
            corpus, and the ``id`` column.
        """
        data = pd.read_json(data_path, orient='split')
        documents = data['content']
        ids = data['id']
        processed_docs = documents.map(self._preprocess)

        print('Start training LDA model...')
        dictionary = gensim.corpora.Dictionary(processed_docs)
        dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above)
        corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

        tf_idf = models.TfidfModel(corpus)
        corpus_tf_idf = tf_idf[corpus]

        lda = LdaModel(
            corpus_tf_idf,
            num_topics=self.num_topics,
            id2word=dictionary,
            passes=self.passes,
            iterations=self.num_of_iterations,
            minimum_probability=self.minimum_probability)

        index = gensim.similarities.MatrixSimilarity(corpus_tf_idf)
        print('Finished training LDA model...')

        return lda, corpus_tf_idf, index, ids

In [34]:
# Instantiate the LDA helper with the default hyper-parameters set in LDAModel.__init__.
lda_model = LDAModel()

In [35]:
# Train on the movie dataset. Returns the fitted LDA model, the TF-IDF corpus,
# a MatrixSimilarity index over that corpus, and the movie ids.
lda, corpus, index, ids = lda_model.train_model()

Start training LDA model...
Finished training LDA model...


In [36]:
# Per-document topic weights as a DataFrame indexed by movie id.
topics = lda_model.get_topics(lda, corpus, ids)
# Similarity computation is disabled here. get_similarities allocates a dense
# len(ids) x len(ids) matrix and queries the KNN model once per movie
# (presumably skipped because of that cost -- confirm before re-enabling).
# similarities = lda_model.get_similarities(index, ids)

In [37]:
# Persist the trained model to ./model and the topic table to ./topics.pickle.
lda_model.save_model(lda, topics)

In [None]:
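# NOTE: Recommender.__init__ loads ./similarities.pickle; within this notebook
# that file is only written by the (currently disabled) call below.
# lda_model.save_similarities(similarities)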

In [38]:
class Recommender():
    """Serve precomputed LDA similarity recommendations from saved artefacts.

    On construction it loads the LDA model from ``./model`` and the pickled
    ``./similarities.pickle`` and ``./topics.pickle`` files from the working
    directory.
    """

    def __init__(self):
        self.lda = LdaModel.load('./model')
        self.similarities = self.loadPickleFile('./similarities')
        self.topics = self.loadPickleFile('./topics')
        # Maximum number of entries returned by recommend().
        self.num_of_recommendation = 10

    def loadPickleFile(self, fileName):
        """Unpickle and return the object stored at ``<fileName>.pickle``.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        deserialising; only use this on files produced by this notebook or
        another trusted source.
        """
        # The context manager closes the handle; the original left it open.
        with open(f'{fileName}.pickle', 'rb') as file:
            return pickle.load(file)

    def recommend(self, movie_id):
        """Return the top recommendations for ``movie_id``.

        Returns the first ``num_of_recommendation`` entries of the stored
        similarity list for the movie (each a dict with ``'id'`` and
        ``'similarity'``), or ``None`` when the movie has no stored entry.
        The lookup time is printed.
        """
        start = time.time()

        # Stop at the first matching entry instead of filtering the whole list.
        match = next(
            (entry for entry in self.similarities if entry['id'] == movie_id),
            None,
        )

        end = time.time()
        print(f'Recommended in: {end - start} s')

        if match is None:
            return None

        return match['similarities'][:self.num_of_recommendation]

In [39]:
# Load the saved model plus the pickled similarities and topics from the working directory.
recommender = Recommender()

In [40]:
# Look up the stored neighbours for movie id 480 (at most num_of_recommendation
# entries; None if the id has no stored entry).
recommender.recommend(480)

Recommended in: 0.0029921531677246094 s


[{'id': 1544, 'similarity': 0.6224342338464117},
 {'id': 117529, 'similarity': 0.590238744432192},
 {'id': 136016, 'similarity': 0.5057719744673915},
 {'id': 4519, 'similarity': 0.5010557089695745},
 {'id': 69278, 'similarity': 0.49888860087947273},
 {'id': 1017, 'similarity': 0.47036480381665435},
 {'id': 4445, 'similarity': 0.4501327921976248},
 {'id': 3400, 'similarity': 0.4393099851682969},
 {'id': 69644, 'similarity': 0.4387942109108951},
 {'id': 5504, 'similarity': 0.40948724119046376}]