In [20]:
import os
import time

# data science imports
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
from fuzzywuzzy import fuzz

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline



In [3]:
# path config
data_path = os.path.join(os.environ['DATA_PATH'], 'MovieLens')
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

KeyError: 'DATA_PATH'

## 1. Load Data

## 3. Train KNN model for item-based collaborative filtering
 - Reshaping the Data
 - Fitting the Model

In [126]:

data = {'movieId': [307, 481, 1091, 1257, 307, 1092, 2222, 1092, 307], 'userId': [1, 1, 1, 1, 2, 2, 2, 3, 3], 'rating':[3.5, 1.5, 4.5, 1.5, 4, 5, 3, 1, 1]}  
df_ratings_drop_users = pd.DataFrame(data)
data = {'movieId': [307, 481, 1091, 1257, 307, 1092, 2222,1092, 307], 'title': ['downtown', 'hydepark', 'great hills', 'sunset valley', 'downtown', 'westlake', 'wampus', 'westlake', 'downtown']}
df_movies = pd.DataFrame(data)




# data = {'movieId': [111, 222, 333, 444, 111, 222, 333, 444], 'userId': [1, 1, 1, 1, 2, 2, 2, 2], 'rating':[1.1, 1.1, 1.1, 5.1, 1.1, 5.1, 1.1, 1.1]}  
# df_ratings_drop_users = pd.DataFrame(data)
# data = {'movieId': [111, 222, 333, 444, 111, 222, 333, 444], 'title': ['downtown', 'hydepark', 'great hills', 'sunset valley', 'downtown', 'hydepark', 'great hills', 'sunset valley']}
# df_movies = pd.DataFrame(data)




In [127]:
# pivot and create movie-user matrix
movie_user_mat = df_ratings_drop_users.pivot(index='movieId', columns='userId', values='rating').fillna(0)
# create mapper from movie title to index
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
}
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [128]:
%env JOBLIB_TEMP_FOLDER=/tmp
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

env: JOBLIB_TEMP_FOLDER=/tmp


NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1)

## 4. Use this trained model to make movie recommendations to myself
And we're finally ready to make some recommendations!

In [129]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. If no match found, return None
    
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie
    
    verbose: bool, print log if True

    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]



def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie


    Parameters
    ----------
    model_knn: sklearn model, knn model

    data: movie-user matrix

    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You said yes to:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    # inference
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [130]:
my_favorite = 'downtown'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=2)

You said yes to: downtown
Found: ['downtown']

Recommendations for downtown:
1: great hills, with distance of 0.0
2: downtown, with distance of 0.0
