In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
# Suppress every warning for the rest of the session; placed after the imports
# so it applies to all the cells below.
filterwarnings('ignore')

In [2]:
# Ratings are stored tab-separated. The column names are supplied explicitly,
# so pandas treats the first line of the file as data rather than a header.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'u.data',
    sep='\t',
    names=column_names,
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [3]:
# Lookup table with one row per movie: its item_id and its title.
movie_titles = pd.read_csv(filepath_or_buffer='Movie_Id_Titles')
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach the movie title to every rating.
# Only the original rating columns are taken from `df` before merging, so the
# cell can be re-run safely: without the selection a second run would merge the
# titles again and produce `title_x` / `title_y` columns.
# The join is an inner join (the pandas default, spelled out here), so ratings
# whose item_id has no entry in `movie_titles` are dropped.
df = pd.merge(df[column_names], movie_titles, on='item_id', how='inner')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [5]:
# Column dtypes and non-null counts of the merged ratings/titles frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100003 entries, 0 to 100002
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    100003 non-null  int64 
 1   item_id    100003 non-null  int64 
 2   rating     100003 non-null  int64 
 3   timestamp  100003 non-null  int64 
 4   title      100003 non-null  object
dtypes: int64(4), object(1)
memory usage: 4.6+ MB


In [6]:
# Column dtypes and non-null counts of the title lookup table.
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  1682 non-null   int64 
 1   title    1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [7]:
# Movie-by-user rating matrix: one row per item_id, one column per user_id.
# (movie, user) pairs without a rating are filled with 0.
moviemat = (
    df.pivot_table(values='rating', index='item_id', columns='user_id')
      .fillna(0)
)
moviemat.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Map each movie title to the position of its row in `moviemat`.
# NOTE(review): if two rows share the same title the later row overwrites the
# earlier one, leaving the earlier row without a key - verify titles are unique.
movie_to_idx = dict(
    zip(
        movie_titles.set_index('item_id').loc[moviemat.index, 'title'],
        range(len(moviemat.index)),
    )
)

In [9]:
from scipy.sparse import csr_matrix

In [10]:
# Convert the dense movie-by-user matrix into a SciPy CSR sparse matrix.
movie_user_mat_sparse = csr_matrix(moviemat.to_numpy())

In [11]:
from sklearn.neighbors import NearestNeighbors

In [12]:
# Brute-force nearest-neighbour search using cosine distance between movie rows.
model_knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

# Kept as the last expression so the fitted estimator is displayed by the cell.
model_knn.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [13]:
#utils import
from fuzzywuzzy import fuzz

In [14]:
def fuzzy_matching(mapper, fav_movie, verbose= True, threshold= 60):
    """
    return the index of the title closest to fav_movie via fuzzy ratio.
    if no match is found, return None

    parameters
    ----------
    mapper: dict, map movie title name to index of movie in data

    fav_movie: str, name of user input movie

    verbose: bool, print the list of candidate titles if True

    threshold: int, a title is kept as a candidate only when its fuzzy
        ratio is strictly greater than this value (default 60, the
        cut-off that used to be hard-coded)

    Return
    ------
    index of the closest match, or None when no title scores above
    the threshold (a message is printed in that case)
    """
    # title-case the query once instead of once per candidate title
    query = fav_movie.title()

    # collect (title, index, ratio) for every title scoring above the threshold
    match_tuple = []
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.title(), query)
        if ratio > threshold:
            match_tuple.append((title, idx, ratio))

    if not match_tuple:
        print('oops! no match is found')
        return None

    # best ratio first; an ascending stable sort followed by a reversal keeps
    # the same order among equal ratios as before (later mapper entries first)
    match_tuple.sort(key=lambda match: match[2])
    match_tuple.reverse()

    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))

    # index of the best-scoring title, consumed by make_recommendation
    return match_tuple[0][1]

In [15]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    print the movies closest to fav_movie according to a neighbours model

    parameters
    ----------
    model_knn: neighbours model exposing fit() and kneighbors()

    data: movie-by-user matrix; row i belongs to the movie whose index
        in mapper is i. Must support data[i] and data.shape

    mapper: dict, map movie title name to index of movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, number of movies to recommend

    Return
    ------
    None. The recommendations are printed, ordered from the farthest to
    the nearest of the selected neighbours, so the last line printed is
    the closest match. Nothing is recommended when fav_movie cannot be
    matched to any title in mapper.
    """
    #fit (done on every call, so an unfitted model may be passed in)
    model_knn.fit(data)

    #get input movie index
    print('You have inputted: ', fav_movie.title())
    idx = fuzzy_matching(mapper, fav_movie, verbose= True)
    if idx is None:
        # fuzzy_matching already reported that nothing matched; indexing
        # data with None would not select a movie row
        return

    #inference
    print('Recommendation system starting to make inference')
    print('............\n')
    # one extra neighbour is requested because the query movie is normally its
    # own nearest neighbour; never ask for more rows than data contains
    n_neighbors = min(n_recommendations + 1, data.shape[0])
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors= n_neighbors)

    # pair every neighbour row with its distance, nearest first.
    # ravel() (rather than squeeze()) still yields a list when only a single
    # neighbour comes back
    neighbours = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key= lambda pair: pair[1])

    # drop the query movie by its row index instead of assuming it is always
    # the first hit, keep the closest n_recommendations, then flip the list so
    # that it is printed from farthest to nearest
    raw_recommends = [
        (row, dist) for row, dist in neighbours if row != idx
    ][:n_recommendations][::-1]

    #get reverse mapper (row index -> title)
    reverse_mapper = {v: k for k, v in mapper.items()}

    #print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (row, dist) in enumerate(raw_recommends):
        # a row has no entry when mapper holds no title pointing at it; show a
        # placeholder instead of raising KeyError half-way through the listing
        title = reverse_mapper.get(row, '<unknown title, row {}>'.format(row))
        print('{0}: {1}, with distance of {2}'.format(i+1, title, dist))

In [16]:
# Try the recommender on a first title.
my_favorite = 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have inputted:  Toy Story
Found possible matches in our database: ['Toy Story (1995)']

Recommendation system starting to make inference
............

Recommendations for Toy Story:
1: Raiders of the Lost Ark (1981), with distance of 0.3776175050042344
2: Jerry Maguire (1996), with distance of 0.37592529851886347
3: Fargo (1996), with distance of 0.36939923923861095
4: Star Trek: First Contact (1996), with distance of 0.3632726854037742
5: Willy Wonka and the Chocolate Factory (1971), with distance of 0.3618423665130154
6: Mission: Impossible (1996), with distance of 0.3586782396588516
7: Rock, The (1996), with distance of 0.33544521129158
8: Independence Day (ID4) (1996), with distance of 0.31021439592414524
9: Return of the Jedi (1983), with distance of 0.30007502870792213
10: Star Wars (1977), with distance of 0.26622322826178724


In [17]:
# Lower-case input: the matcher title-cases both sides before comparing.
my_favorite = 'batman'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have inputted:  Batman
Found possible matches in our database: ['Batman (1989)']

Recommendation system starting to make inference
............

Recommendations for batman:
1: Empire Strikes Back, The (1980), with distance of 0.3561644212720909
2: Indiana Jones and the Last Crusade (1989), with distance of 0.3509174945485136
3: Batman Forever (1995), with distance of 0.3436320608215734
4: True Lies (1994), with distance of 0.33560720799637955
5: Speed (1994), with distance of 0.33463389167836677
6: Die Hard 2 (1990), with distance of 0.33024592599604097
7: Top Gun (1986), with distance of 0.31983252759281455
8: Jurassic Park (1993), with distance of 0.3126321455063157
9: Die Hard: With a Vengeance (1995), with distance of 0.3053461989564902
10: Batman Returns (1992), with distance of 0.290836250228915


In [18]:
# Same call for another title.
my_favorite = 'jumanji'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have inputted:  Jumanji
Found possible matches in our database: ['Jumanji (1995)']

Recommendation system starting to make inference
............

Recommendations for jumanji:
1: Aladdin (1992), with distance of 0.4994138656238738
2: Batman (1989), with distance of 0.4994100292932976
3: Lion King, The (1994), with distance of 0.49823003087314843
4: Mask, The (1994), with distance of 0.49559264815040494
5: Home Alone (1990), with distance of 0.49551499728939685
6: True Lies (1994), with distance of 0.49259887265568536
7: Star Trek: Generations (1994), with distance of 0.48911316581222863
8: Ghost (1990), with distance of 0.4762300114990823
9: Mrs. Doubtfire (1993), with distance of 0.47079215684117226
10: Jurassic Park (1993), with distance of 0.4397459641755139


In [19]:
# Same call for a two-word title.
my_favorite = 'jurassic park'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have inputted:  Jurassic Park
Found possible matches in our database: ['Jurassic Park (1993)']

Recommendation system starting to make inference
............

Recommendations for jurassic park:
1: Terminator, The (1984), with distance of 0.32038398275847
2: Batman (1989), with distance of 0.3126321455063157
3: Star Trek: The Wrath of Khan (1982), with distance of 0.3102836128029677
4: Back to the Future (1985), with distance of 0.30683436310521583
5: True Lies (1994), with distance of 0.29768431561030584
6: Indiana Jones and the Last Crusade (1989), with distance of 0.2953794747465094
7: Empire Strikes Back, The (1980), with distance of 0.2868706467755233
8: Raiders of the Lost Ark (1981), with distance of 0.28428620453787246
9: Speed (1994), with distance of 0.2786054553498638
10: Top Gun (1986), with distance of 0.2658989960853455


In [20]:
# Same call for one more title.
my_favorite = 'copycat'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have inputted:  Copycat
Found possible matches in our database: ['Copycat (1995)']

Recommendation system starting to make inference
............

Recommendations for copycat:
1: Interview with the Vampire (1994), with distance of 0.5753478211743479
2: Alien 3 (1992), with distance of 0.574702304687835
3: Shining, The (1980), with distance of 0.5697081672629869
4: Candyman (1992), with distance of 0.5646512022790354
5: Silence of the Lambs, The (1991), with distance of 0.5590039950005308
6: Natural Born Killers (1994), with distance of 0.5547581473636551
7: Jaws (1975), with distance of 0.5492197850743941
8: Outbreak (1995), with distance of 0.5276014779035434
9: Nightmare on Elm Street, A (1984), with distance of 0.5272752577404516
10: Cape Fear (1991), with distance of 0.45221294707617576
