In [1]:
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import pickle
import gensim
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
import nltk
from sklearn.neighbors import NearestNeighbors

In [2]:
# Notebook-wide settings consumed by the cells further down.
id = 1             # positional row of the item to query (note: shadows the builtin `id`)
n = 10             # number of neighbours requested from `recommend`
metric = 'cosine'  # distance metric handed to the nearest-neighbour search

In [3]:
# Seed NumPy's global RNG (presumably so the stochastic model fit below is repeatable).
np.random.seed(2018)

# Fetch the WordNet corpus quietly; the lemmatizer used in `lemmatize_stemming` relies on it.
nltk.download('wordnet', quiet=True)

# Single Porter stemmer instance shared by every call to `lemmatize_stemming`.
stemmer = PorterStemmer()

In [4]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the result.

    Uses the module-level ``stemmer`` instance for the stemming step.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

In [5]:
def preprocess(text):
    """Turn raw ``text`` into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have three characters or fewer are discarded, and the
    remaining ones are passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if tok not in stopwords and len(tok) > 3
    ]

In [6]:
# Read the dataset (JSON written in pandas' 'split' orientation) and pull out
# the text column and the id column as separate Series.
data = pd.read_json('../data/movies_data.json', orient='split')
documents, ids = data['content'], data['id']

In [7]:
# Run `preprocess` on every document, giving one list of normalised tokens per row.
processed_docs = documents.map(preprocess)

In [8]:
# Build the token <-> id mapping from the preprocessed documents.
dictionary = corpora.Dictionary(processed_docs)
# Prune the vocabulary in place using the document-frequency limits below.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Bag-of-words representation of each document.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [9]:
# Fit TF-IDF weights on the bag-of-words corpus ...
tfidf = gensim.models.TfidfModel(corpus)
# ... and wrap the corpus so documents are re-weighted when iterated.
corpus_tfidf = tfidf[corpus]

In [10]:
# Fit a 30-topic LDA model on the TF-IDF weighted corpus.
lda = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=30, id2word=dictionary, passes=2, minimum_probability=0.0)

# Build the dense (documents x topics) matrix by *topic id* rather than by
# position in each document's (topic_id, probability) list. gensim may leave
# out topics whose probability falls below its internal floor even when
# minimum_probability=0.0, which would make a positional extraction ragged or
# misaligned. corpus2dense fills missing topics with 0 and returns a
# (topics x documents) array, hence the transpose.
docs_topics = gensim.matutils.corpus2dense(
    lda[corpus_tfidf],
    num_terms=lda.num_topics,
    num_docs=len(corpus),
).T

# One row per document, sharing the index of the source frame.
df_docs_topics = pd.DataFrame(docs_topics, index=data.index)

In [11]:
def recommend(item_id, matrix, metric = metric, k = 10):
    """Find the rows of ``matrix`` that are closest to the row at ``item_id``.

    Parameters
    ----------
    item_id : int
        Positional row number in ``matrix`` (it is looked up with ``.iloc``),
        not a value taken from an id column.
    matrix : pandas.DataFrame
        One row per item, one column per feature.
    metric : str, optional
        Distance metric passed to ``NearestNeighbors``. The default is the
        value the module-level ``metric`` had when this function was defined.
    k : int, optional
        Number of neighbours wanted in addition to the query row itself.

    Returns
    -------
    similarities : numpy.ndarray
        1-D array holding ``1 - distance`` for each neighbour, nearest first.
        This is a true similarity only for metrics such as cosine where the
        distance is defined as one minus the similarity.
    indices : numpy.ndarray
        2-D array with a single row containing the positional indices of the
        neighbours, in the same order as ``similarities``.

    Notes
    -----
    The query row is part of the fitted data, so it is usually returned as the
    first neighbour; callers are expected to drop it. At most ``len(matrix)``
    neighbours are returned, so fewer than ``k + 1`` entries come back when
    the matrix is small.
    """
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(matrix)

    # Request one extra neighbour to account for the query row itself, but
    # never more than there are rows (NearestNeighbors would raise otherwise).
    n_neighbors = min(k + 1, len(matrix))
    query = matrix.iloc[item_id, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

    similarities = 1 - distances.flatten()
    return similarities, indices

In [34]:
# Query the topic matrix for the neighbours of the configured item.
similarities, indices = recommend(
    matrix=df_docs_topics,
    item_id=id,
    k=n,
)

In [35]:
# Discard the first entry of each result (the nearest hit, normally the query
# item itself). This rebinds the same names, so re-running the cell without
# re-running the `recommend` call first drops one more neighbour each time.
indices = indices.flatten()[1:]
similarities = similarities[1:]

In [37]:
# Pair each neighbour's positional index with its similarity score for display.
[{neighbour: score} for neighbour, score in zip(indices, similarities)]

[{3147: 0.9981442},
 {6688: 0.9957245},
 {3742: 0.99559164},
 {1529: 0.9954368},
 {7914: 0.99541885},
 {4522: 0.99512655},
 {6420: 0.995115},
 {5855: 0.9949963},
 {4309: 0.9949244},
 {1131: 0.9942949}]