# Collaborative Filtering Recommendation System

## Task 1: Import Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

## Task 2: Import the Dataset

In [2]:
import os
os.listdir('.')

['lost+found',
 'app.py',
 '.ipynb_checkpoints',
 'Movie_data.csv',
 'Movie_Id_Titles.csv',
 'CollaborativeFiltering.ipynb']

In [3]:
column_names = ['User_ID', 'User_Names', 'Movie_ID', 'Rating', 'Timestamp']
movie_data = pd.read_csv('Movie_data.csv', names=column_names)

titles_data = pd.read_csv('Movie_Id_Titles.csv')
titles_data.rename(columns = {'item_id' : 'Movie_ID', 'title' : 'Movie_Title'}, inplace=True)

movie_df = pd.merge(movie_data, titles_data, on='Movie_ID')

## Task 3: Explore the Dataset

In [4]:
titles_data[0:5]

Unnamed: 0,Movie_ID,Movie_Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
movie_df.describe()

Unnamed: 0,User_ID,Movie_ID,Rating,Timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [6]:
n_users =movie_df.User_ID.unique().shape[0]
n_movies =movie_df.Movie_ID.unique().shape[0]

In [7]:
n_users, n_movies

(944, 1682)

In [8]:
movie_df[0:5]

Unnamed: 0,User_ID,User_Names,Movie_ID,Rating,Timestamp,Movie_Title
0,0,Shawn Wilson,50,5,881250949,Star Wars (1977)
1,22,Robert Poulin,50,5,878887765,Star Wars (1977)
2,244,Laura Krulik,50,5,880604379,Star Wars (1977)
3,298,Loren Aucoin,50,5,884125578,Star Wars (1977)
4,115,Dominick Jenkins,50,5,881172049,Star Wars (1977)


## Task 4: Create an Interaction Matrix

In [9]:
ratings = np.zeros((n_users, n_movies))

for i in range(len(movie_df)):
    ratings[movie_df['User_ID'][i], movie_df['Movie_ID'][i]-1] = movie_df['Rating'][i]
#     print(movie_df['User_ID'][i], movie_df['Movie_ID'][i], )


In [10]:
print(ratings.shape)
print(ratings)


(944, 1682)
[[0. 0. 0. ... 0. 0. 0.]
 [5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


## Task 5: Explore the Interaction Matrix

In [11]:
sparsity = len(np.where(ratings == 0)[0])*1.0/len(np.where(ratings > 0)[0])
# print(len(np.where(ratings == 0)), len(np.where(ratings >0)))
sparsity

14.877603671889844

In [12]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print(sparsity)

6.298179628771237


## Task 6 : Create a Similarity Matrix

In [13]:
rating_cosine_similarity = cosine_similarity(ratings)
rating_cosine_similarity

array([[1.        , 0.11988816, 0.11554032, ..., 0.        , 0.18180857,
        0.11890394],
       [0.11988816, 1.        , 0.16693098, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.11554032, 0.16693098, 1.        , ..., 0.16148478, 0.17226781,
        0.10579788],
       ...,
       [0.        , 0.14861694, 0.16148478, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.18180857, 0.17950788, 0.17226781, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.11890394, 0.39817474, 0.10579788, ..., 0.09511958, 0.18246466,
        1.        ]])

## Task 7: Provide Recommendations

In [20]:
def recommend(ratings, rating_cosine_similarity, user_id, k=10, top_n=10):
    
    similar_users = rating_cosine_similarity[user_id]
    
    sorted_users = np.argsort(-1*similar_users)
    
    top_k_users = sorted_users[0:k] ### ignoring 1 because that would be same user
#     print(top_k_users, similar_users.argpartition(-k)[-k:]) ## argpartition does the same thing as decided
    ratings_top_k = ratings[top_k_users]
    avg_ratings_top_k = ratings_top_k.mean(axis=0)
    
    sorted_movies = np.argsort(-1*avg_ratings_top_k).tolist()
    
    seen_movies = np.where(ratings[user_id] > 0)[0].tolist()
#     print(seen_movies)
    
    rec_movies = [x+1 for x in sorted_movies if x not in seen_movies]
#     print(rec_movies[0:top_n])
    
#     rec_movies_a=rec_movies.index.to_frame().reset_index(drop=True)
#     rec_movies_a.rename(columns={rec_movies_a.columns[0]: 'Movie_ID'}, inplace=True)

    ## also create a dataframe from rec_movies
    
    rec_movies_df = pd.DataFrame(rec_movies)
    rec_movies_df = rec_movies_df.head(top_n)
    
#     rec_movies_df=rec_movies_df.index.to_frame().reset_index(drop=True)
    rec_movies_df.rename(columns={rec_movies_df.columns[0]: 'Movie_ID'}, inplace=True)

    return rec_movies[0:top_n], rec_movies_df


In [21]:
recommend(ratings, rating_cosine_similarity, 12)

([181, 496, 210, 423, 173, 385, 568, 79, 566, 22],
    Movie_ID
 0       181
 1       496
 2       210
 3       423
 4       173
 5       385
 6       568
 7        79
 8       566
 9        22)

## Task 8: View the Provided Recommendations 

## Task 9: Create Wrapper Function

In [22]:
def movie_recommender_run(user_Name):
    #Get ID from Name
    user_ID=movie_df.loc[movie_df['User_Names'] == user_Name].User_ID.values[0]
    #Call the function
#     temp=movie_recommender(ratings, rating_cosine_similarity, user_ID)
    _, rec_movies_df = recommend(ratings, rating_cosine_similarity, user_ID)
    # Join with the movie_title_df to get the movie titles
    top_k_rec=rec_movies_df.merge(titles_data, how='inner')
    return top_k_rec

In [23]:
movie_recommender_run('Shawn Wilson')

Unnamed: 0,Movie_ID,Movie_Title
0,181,Return of the Jedi (1983)
1,258,Contact (1997)
2,234,Jaws (1975)
3,174,Raiders of the Lost Ark (1981)
4,692,"American President, The (1995)"
5,210,Indiana Jones and the Last Crusade (1989)
6,144,Die Hard (1988)
7,98,"Silence of the Lambs, The (1991)"
8,114,Wallace & Gromit: The Best of Aardman Animatio...
9,100,Fargo (1996)
