In [25]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [26]:
# Read the four CSV tables from the current working directory.
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

# One row per (movie, rating): join the ratings onto the movie table by
# movieId, renumber the rows, then drop every row that has a missing value.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

# Surprise reads the columns positionally as (user, item, rating); the movie
# title is used as the item identifier.
dataset = movies_with_ratings[['userId', 'title', 'rating']].rename(
    columns={'userId': 'uid', 'title': 'iid'}
)

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

# Hold out 15% of the ratings; the split itself is seeded.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [27]:
def change_string(s):
    """Turn a pipe-separated genre string into space-separated tokens.

    Spaces and hyphens inside each genre are removed so that every genre
    becomes a single token (e.g. ``'Sci-Fi|Film-Noir'`` -> ``'SciFi FilmNoir'``).
    """
    tokens = []
    for genre in s.split('|'):
        tokens.append(genre.replace(' ', '').replace('-', ''))
    return ' '.join(tokens)

In [28]:
# One normalised genre "document" per row of `movies`, in row order.
movie_genres = list(map(change_string, movies.genres.values))
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [14]:
# Lookup table: movie title -> raw pipe-separated genre string.
# Built straight from the two columns rather than a row-by-row iterrows()
# loop wrapped in the deprecated tqdm_notebook helper. When a title occurs
# more than once the last occurrence wins, as it did with the loop.
title_genres = dict(zip(movies['title'], movies['genres']))

In [29]:
# Token counts over the normalised genre strings (one row per movie).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

# Learn the IDF weights from the counts, then re-weight the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [61]:
def recommend_for_user(user_id):
    """Print up to ten unseen movies for ``user_id``, highest estimate first.

    The last title listed for the user in ``movies_with_ratings`` is used as
    a seed: its genres are vectorised with ``count_vect`` and
    ``tfidf_transformer``, and the rows returned by ``neigh.kneighbors`` are
    looked up in ``movies`` to get candidate titles. Candidates the user has
    already rated are skipped; the rest are scored with
    ``algo.predict(uid=..., iid=...).est``.

    Reads the notebook globals ``movies_with_ratings``, ``title_genres``,
    ``count_vect``, ``tfidf_transformer``, ``neigh``, ``movies`` and ``algo``.

    NOTE(review): ``neigh`` is not defined in the cells visible here; it is
    presumably a nearest-neighbours model fitted on ``X_train_tfidf`` --
    confirm it exists before this function is called.

    Raises:
        IndexError: if the user has no rows in ``movies_with_ratings``.
    """
    rated_by_user = movies_with_ratings.userId == user_id
    user_movies = movies_with_ratings[rated_by_user].title.unique()

    # Seed: the last title in the frame's row order for this user.
    seed_genres = change_string(title_genres[user_movies[-1]])
    seed_tfidf = tfidf_transformer.transform(count_vect.transform([seed_genres]))

    # kneighbors returns (distances, row indices); only the indices of the
    # single query row are needed.
    _, neighbour_rows = neigh.kneighbors(seed_tfidf, return_distance=True)
    candidate_titles = movies.iloc[neighbour_rows[0]].title.values

    titles = [title for title in candidate_titles if title not in user_movies]
    scores = [algo.predict(uid=user_id, iid=title).est for title in titles]

    # argsort is ascending: keep the last ten positions and walk them backwards.
    for pos in np.argsort(scores)[-10:][::-1]:
        print(titles[pos], scores[pos])

In [70]:
def recommend_for_user2(user_id):
    """Print the ten items the LightFM model in ``algo`` ranks highest for a user.

    Every item column of ``movielens['train']`` is scored in a single
    ``algo.predict`` call, and the labels of the ten highest-scoring items
    are printed, best first.

    Reads the notebook globals ``movielens`` and ``algo``.

    NOTE(review): the ranking covers the whole LightFM catalogue. Titles from
    ``movies.csv`` are not consulted because nothing in this notebook maps
    them onto the LightFM item indices -- confirm whether a content-based
    pre-filter was intended here.

    Args:
        user_id: Row index of the user in ``movielens['train']``. A float
            such as ``2.0`` is accepted and converted with ``int()``.

    Raises:
        ValueError: if the user index falls outside the training matrix.
    """
    n_users, n_items = movielens['train'].shape

    user_index = int(user_id)
    if not 0 <= user_index < n_users:
        raise ValueError(
            "user_id %r is outside the range [0, %d)" % (user_id, n_users)
        )

    # Score the full catalogue exactly once; the result does not depend on
    # any per-movie loop variable.
    scores = algo.predict(user_index, np.arange(n_items))

    # Negating the scores makes the ascending argsort yield best-first order.
    ranking = np.argsort(-scores)
    top_items = movielens['item_labels'][ranking]

    for label in top_items[:10]:
        print("        %s" % label)

In [64]:
# SVD matrix factorisation: 20 latent factors, 20 training epochs.
# random_state fixes the random initialisation of the factors so that a
# re-run gives the same recommendations (the train/test split above is
# seeded with the same value).
algo = SVD(n_factors=20, n_epochs=20, random_state=42)
algo.fit(trainset)
recommend_for_user(2.0)

Enron: The Smartest Guys in the Room (2005) 3.7804474659150693
Searching for Sugar Man (2012) 3.751202220628989
Tomorrow (2015) 3.6603845140170717
My Kid Could Paint That (2007) 3.627018629135345
Haunted World of Edward D. Wood Jr., The (1996) 3.6059823813510965
Samsara (2011) 3.6007414480354623
Nobody Speak: Hulk Hogan, Gawker and Trials of a Free Press (2017) 3.5637914349157866
6 Days to Air: The Making of South Park (2011) 3.5544160787235293
Waiting for 'Superman' (2010) 3.5492816055445666
Get Me Roger Stone (2017) 3.545838242408712


In [33]:
# SVD++ with the library's default hyper-parameters.
# random_state fixes the random initialisation so that a re-run gives the
# same recommendations (the train/test split above is seeded with the same
# value).
algo = SVDpp(random_state=42)
algo.fit(trainset)
recommend_for_user(2.0)

Searching for Sugar Man (2012) 3.7153681198391437
Tomorrow (2015) 3.698318645237553
My Kid Could Paint That (2007) 3.682948512919022
Samsara (2011) 3.680480879382157
Enron: The Smartest Guys in the Room (2005) 3.6784279766574635
Nobody Speak: Hulk Hogan, Gawker and Trials of a Free Press (2017) 3.6532387553660244
Haunted World of Edward D. Wood Jr., The (1996) 3.594513337651411
Get Me Roger Stone (2017) 3.5891634798491885
Waiting for 'Superman' (2010) 3.5807115060073267
Comandante (2003) 3.5701625936160957


In [72]:
from lightfm.datasets import fetch_movielens

# LightFM's bundled MovieLens data: interaction matrices plus item labels.
movielens = fetch_movielens()
train = movielens['train']
test = movielens['test']

# WARP-loss model trained for 10 epochs. random_state seeds the model so
# that re-running the cell gives the same ranking.
algo = LightFM(learning_rate=0.05, loss='warp', random_state=42)
algo.fit_partial(train, epochs=10)
recommend_for_user2(2.0)

        Air Force One (1997)
        L.A. Confidential (1997)
        English Patient, The (1996)
        Contact (1997)
        Devil's Advocate, The (1997)
        Titanic (1997)
        G.I. Jane (1997)
        Scream (1996)
        Conspiracy Theory (1997)
        Cop Land (1997)
