In [2]:
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
import nltk
from sklearn.neighbors import NearestNeighbors

In [13]:
# Notebook-wide parameters read by the cells below.
metric = 'cosine'  # distance metric; also bound as the default `metric` argument of recommend()
n = 10  # number of recommendations requested (passed to recommend() as k)
id = 1  # query item, used by recommend() as a positional row (.iloc) index; NOTE: shadows the built-in id()

In [4]:
# Download the WordNet corpus without progress output; presumably required by
# WordNetLemmatizer in lemmatize_stemming() — confirm.
nltk.download('wordnet', quiet=True)
# Seed NumPy's global RNG. NOTE(review): the LDA cell passes no random_state, so
# it presumably depends on this seed for repeatable topics — confirm.
np.random.seed(2018)
# Single stemmer instance shared by every lemmatize_stemming() call.
stemmer = PorterStemmer()

In [5]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the lemma.

    The lemma comes from a fresh ``WordNetLemmatizer`` (``pos='v'``); the
    stemming step uses the notebook-level ``stemmer`` instance.

    Returns whatever ``stemmer.stem`` returns for the lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

In [6]:
def preprocess(text):
    """Turn a raw document into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stopwords or have three characters or fewer are dropped, and
    each remaining token is passed through ``lemmatize_stemming``.

    Returns a list of processed tokens in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if word not in stopwords and len(word) > 3
    ]

In [7]:
# Load the dataset from a path relative to the notebook, parsing the JSON with
# pandas' 'split' orientation.
data = pd.read_json('../data/movies_data.json', orient='split')
documents = data['content']  # text column that is fed to preprocess()
ids = data['id']  # identifier column used to report the final recommendations

In [8]:
# Apply preprocess() to every document; each entry becomes a list of tokens.
processed_docs = documents.map(preprocess)

In [9]:
# Build the token <-> integer-id vocabulary from the tokenised documents.
dictionary = corpora.Dictionary(processed_docs)
# Prune the vocabulary in place using the document-frequency bounds below and
# cap its size at keep_n tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Convert each tokenised document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [10]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Index the model with the same corpus to obtain its TF-IDF-weighted form,
# which is what the LDA cell consumes.
corpus_tfidf = tfidf[corpus]

In [11]:
# Train a 30-topic LDA model on the TF-IDF corpus.
# NOTE(review): no random_state is passed, so repeatability presumably depends
# on the np.random.seed(...) call made earlier in the notebook — confirm.
lda = models.LdaModel(
    corpus_tfidf,
    num_topics=30,
    id2word=dictionary,
    passes=2,
    minimum_probability=0.0,
)
# For every document keep only the weight from each (topic_id, weight) pair,
# producing one row of topic weights per document.
docs_topics = np.array(
    [[weight for _, weight in doc_topics] for doc_topics in lda[corpus_tfidf]]
)
# Re-use the index of `data` so rows line up with the source documents.
df_docs_topics = pd.DataFrame(docs_topics, index=data.index)

In [12]:
def recommend(item_id, matrix, metric = metric, k = 10):
    """Find the rows of ``matrix`` nearest to the row at position ``item_id``.

    Parameters
    ----------
    item_id : int
        Positional (``.iloc``) index of the query row in ``matrix``. This is a
        row position, not a label or a value of an id column.
    matrix : pandas.DataFrame
        One row per item, one column per feature.
    metric : str
        Distance metric passed to ``NearestNeighbors``. The default is the
        value the notebook-level ``metric`` had when this cell was executed.
    k : int
        Number of neighbours wanted in addition to the query row itself.

    Returns
    -------
    similarities : numpy.ndarray
        1-D array holding ``1 - distance`` for each returned neighbour.
    indices : numpy.ndarray
        2-D array with a single row of positional indices into ``matrix``,
        ordered like ``similarities``. The query row is normally among them,
        so callers are expected to filter it out.
    """
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(matrix)

    # One extra neighbour is requested because the query row is part of the
    # fitted data. Clamp to the number of rows so that a small matrix returns
    # fewer neighbours rather than making kneighbors() raise.
    n_neighbors = min(k + 1, len(matrix))
    query = matrix.iloc[item_id, :].values.reshape(1, -1)

    distances, indices = model_knn.kneighbors(query, n_neighbors = n_neighbors)
    similarities = 1 - distances.flatten()

    return similarities, indices

In [14]:
# Query the topic matrix for the n nearest neighbours of row `id`;
# recommend() asks for one extra neighbour to allow for the query row itself.
similarities, indices = recommend(item_id=id, matrix=df_docs_topics, k=n)

In [16]:
# Drop the query row from its own neighbour list and keep at most n results.
# The explicit [:n] matters when the query row is absent from the list (e.g.
# ties between identical rows): nothing is removed in that case, and without
# the cap n + 1 recommendations would be returned.
neighbor_positions = indices.flatten()
recommended = neighbor_positions[neighbor_positions != id][:n]

In [17]:
# `recommended` holds row positions (the neighbours were found on a matrix that
# is addressed with .iloc), so select positionally; a label-based ids[...]
# lookup is only equivalent when the index happens to be 0..N-1.
ids.iloc[recommended].values

array([ 4115, 43396,  4996,  1369, 97593,  8009, 44301,  6906,  5875,
        1683], dtype=int64)