# A recommendation system is a platform that suggests content to its users based on their preferences. It takes information about the user as its input and is an application of machine learning algorithms.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf 
import seaborn as sns


# read the dataset

In [2]:
# Load the movie catalogue. Only the first three CSV columns are read; they
# are named movieId / Title / Genre and movieId becomes the index.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' -- confirm which pandas version this notebook targets.
movie_data = pd.read_csv(
    'C:/Users/User/Desktop/ml-latest-small/movies.csv',
    header=0,
    names=['movieId', 'Title', 'Genre'],
    usecols=[0, 1, 2],
    index_col=0,
    error_bad_lines=False,
)
movie_data.head()

Unnamed: 0_level_0,Title,Genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
movie_data.isnull().any()

Title    False
Genre    False
dtype: bool

In [5]:
movie_data.describe()

Unnamed: 0,Title,Genre
count,9742,9742
unique,9737,951
top,War of the Worlds (2005),Drama
freq,2,1053


In [7]:

# Load the ratings table. Only the first three CSV columns are read and
# named userId / movieId / rating.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' -- confirm which pandas version this notebook targets.
movie_ratings = pd.read_csv(
    'C:/Users/User/Desktop/ml-latest-small/ratings.csv',
    header=0,
    names=['userId', 'movieId', 'rating'],
    usecols=[0, 1, 2],
    error_bad_lines=False,
)
movie_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
movie_ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [9]:
def movieTitle(movieId):
    """Return the title of the movie with id ``movieId``.

    The value is read from the 'Title' column of the module-level
    ``movie_data`` frame, which is indexed by movie id.
    """
    return movie_data.at[movieId, 'Title']
movieTitle(1)

'Toy Story (1995)'

In [10]:
def movieGenre(movieId):
    """Return the genre string of the movie with id ``movieId``.

    The value is read from the 'Genre' column of the module-level
    ``movie_data`` frame, which is indexed by movie id.
    """
    return movie_data.at[movieId, 'Genre']
movieGenre(1)

'Adventure|Animation|Children|Comedy|Fantasy'

In [None]:
# Data preprocessing for a huge dataset (not required here):
# keep only those ratings whose movie id is present in movie_data
# movie_ratings = movie_ratings[movie_ratings['movieId'].isin(movie_data.index)]

In [11]:
def favMovie(userId, N):
    """Return the ``N`` highest-rated movies of a user with title and genre.

    Args:
        userId: Id of the user whose rows are selected from ``movie_ratings``.
        N: Maximum number of rows to return.

    Returns:
        A DataFrame holding the user's rows of ``movie_ratings`` sorted by
        'rating' in descending order and cut to ``N`` rows, with two added
        columns 'Title' and 'Genre' looked up per movie id.
    """
    userRatings = movie_ratings[movie_ratings.userId == userId]
    # Take an explicit copy of the top-N slice: the column assignments below
    # then write to an independent frame instead of a slice of another frame,
    # which is what makes pandas raise SettingWithCopyWarning.
    sortedRatings = userRatings.sort_values('rating', ascending=False)[:N].copy()
    sortedRatings['Title'] = sortedRatings['movieId'].apply(movieTitle)
    sortedRatings['Genre'] = sortedRatings['movieId'].apply(movieGenre)
    return sortedRatings
favMovie(1, 10)

Unnamed: 0,userId,movieId,rating,Title,Genre
231,1,5060,5.0,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
185,1,2872,5.0,Excalibur (1981),Adventure|Fantasy
89,1,1291,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure
90,1,1298,5.0,Pink Floyd: The Wall (1982),Drama|Musical
190,1,2948,5.0,From Russia with Love (1963),Action|Adventure|Thriller
189,1,2947,5.0,Goldfinger (1964),Action|Adventure|Thriller
188,1,2944,5.0,"Dirty Dozen, The (1967)",Action|Drama|War
186,1,2899,5.0,Gulliver's Travels (1939),Adventure|Animation|Children
184,1,2858,5.0,American Beauty (1999),Drama|Romance
179,1,2700,5.0,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical


# Setup Rating Matrix

In [12]:
movie_ratings.shape, movie_data.shape

((100836, 3), (9742, 2))

In [13]:
userPerMovieID = movie_ratings.movieId.value_counts()
userPerMovieID.head()

356     329
318     317
296     307
593     279
2571    278
Name: movieId, dtype: int64

In [14]:
userPerMovieID.shape

(9724,)

In [15]:
## Data preprocessing to obtain a less sparse matrix for a huge dataset (not required here)

    ## Keep only those movies which have been rated by more than 10 users
#movie_ratings = movie_ratings[movie_ratings['movieId'].isin(userPerMovieID[userPerMovieID > 10].index)]
#movie_ratings.shape

In [16]:
# Pivot the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, the cell holding the rating. User/movie
# pairs without a rating show up as NaN.
userMovieRatingMatrix = movie_ratings.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
)
userMovieRatingMatrix.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Find K Nearest Neighbours
# Two example users; each user's ratings are one row of the rating matrix,
# i.e. a Series indexed by movieId.
user1, user2 = 100, 200

user1_ratings = userMovieRatingMatrix.loc[user1]
user1_ratings.head()

movieId
1    NaN
2    NaN
3    3.5
4    NaN
5    NaN
Name: 100, dtype: float64

In [18]:
# Ratings of the second example user, taken as a row of the rating matrix.
user2_ratings = userMovieRatingMatrix.loc[user2]
user2_ratings.head()

movieId
1    3.5
2    NaN
3    NaN
4    NaN
5    4.0
Name: 200, dtype: float64

In [19]:
from scipy.spatial.distance import hamming
# hamming() returns the fraction of positions at which the two vectors disagree.
# Compare the two example users with each other (not user1 with itself).
hamming(user1_ratings, user2_ratings)

0.9987659399424106

In [20]:
# Wrapping it up in a function
def distance(user1, user2):
    """Return the Hamming distance between the rating vectors of two users.

    Args:
        user1: User id, a row label of ``userMovieRatingMatrix``.
        user2: User id, a row label of ``userMovieRatingMatrix``.

    Returns:
        The value of ``scipy.spatial.distance.hamming`` for the two rating
        rows, or ``np.nan`` when either user id is not in the matrix.

    Note:
        Missing ratings are NaN and NaN never compares equal, so a movie
        counts as a disagreement unless both users gave it the same rating.
    """
    try:
        # Read each user's row directly; this yields the same Series as
        # transposing the matrix and selecting a column, without building a
        # transposed copy of the whole matrix on every call.
        user1_ratings = userMovieRatingMatrix.loc[user1]
        user2_ratings = userMovieRatingMatrix.loc[user2]
    except KeyError:
        # Unknown user id: report "no distance" instead of failing.
        return np.nan
    return hamming(user1_ratings, user2_ratings)
distance(100,200)

0.9987659399424106

In [21]:

# Active user for whom the neighbours are searched.
user = 1
allusers = pd.DataFrame(userMovieRatingMatrix.index)
# Remove the active user. The explicit copy makes the result an independent
# frame, so adding columns to it later does not trigger SettingWithCopyWarning.
allusers = allusers[allusers.userId != user].copy()
allusers.head()

Unnamed: 0,userId
1,2
2,3
3,4
4,5
5,6


In [22]:
# Distance from the active user to every other user.
allusers['distance'] = allusers['userId'].apply(
    lambda otherUser: distance(user, otherUser)
)
allusers.head()

Unnamed: 0,userId,distance
1,2,1.0
2,3,0.999897
3,4,0.998149
4,5,0.999486
5,6,0.99928


In [23]:
# Keep the ids of the K users with the smallest distance to the active user.
K = 10
KNearestUsers = allusers.sort_values('distance')['userId'].head(K)
KNearestUsers

413    414
379    380
44      45
596    597
447    448
554    555
468    469
451    452
265    266
201    202
Name: userId, dtype: int64

In [24]:

# Wrapping it up in a function
def nearestNeighbours(user, K=10):
    """Return the ids of the ``K`` users closest to ``user``.

    Args:
        user: Id of the active user; it is excluded from the candidates.
        K: Number of neighbours to return (default 10).

    Returns:
        A Series named 'userId' with the ids of the ``K`` users that have the
        smallest ``distance(user, other)``, ordered by ascending distance.
    """
    allusers = pd.DataFrame(userMovieRatingMatrix.index)
    # Copy the filtered frame so that adding the 'distance' column modifies an
    # independent frame rather than a slice (avoids SettingWithCopyWarning).
    candidates = allusers[allusers.userId != user].copy()
    candidates['distance'] = candidates['userId'].apply(
        lambda other: distance(user, other)
    )
    return candidates.sort_values('distance', ascending=True)['userId'][:K]

KNearestNeighbours = nearestNeighbours(1,5)
KNearestNeighbours

413    414
379    380
44      45
596    597
447    448
Name: userId, dtype: int64

In [25]:
# Find Top N Recommendations
# Rows of the rating matrix that belong to the nearest neighbours.

isNeighbour = userMovieRatingMatrix.index.isin(KNearestNeighbours)
NNratings = userMovieRatingMatrix.loc[isNeighbour]
NNratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45,4.0,,,,3.0,4.0,3.0,,,,...,,,,,,,,,,
380,5.0,5.0,,,,5.0,,,,5.0,...,,,,,,,,,,
414,4.0,3.0,4.0,,2.0,3.0,3.0,3.0,,3.0,...,,,,,,,,,,
448,5.0,3.0,3.0,,3.0,,,,,4.0,...,,,,,,,,,,
597,4.0,,,,,3.0,1.0,,,3.0,...,,,,,,,,,,


In [26]:
# Average rating of each movie over the nearest neighbours who rated it.
# DataFrame.mean() skips NaN and gives NaN for movies none of the neighbours
# rated; those are removed by dropna(). Unlike np.nanmean applied per column,
# this does not emit a RuntimeWarning for completely empty columns.
avgRating = NNratings.mean().dropna()
avgRating.head()


movieId
1    4.400000
2    3.666667
3    3.500000
5    2.666667
6    3.750000
dtype: float64

In [27]:
# Ids of the movies the active user has already rated (non-NaN entries of
# the user's row in the rating matrix).
moviesAlreadySeen = userMovieRatingMatrix.loc[user].dropna().index
moviesAlreadySeen

Int64Index([   1,    3,    6,   47,   50,   70,  101,  110,  151,  157,
            ...
            3671, 3702, 3703, 3729, 3740, 3744, 3793, 3809, 4006, 5060],
           dtype='int64', name='movieId', length=232)

In [28]:
# Keep only the movies the active user has not rated yet.
avgRating = avgRating.loc[~avgRating.index.isin(moviesAlreadySeen)]

In [29]:
N=3
topNMovieId = avgRating.sort_values(ascending=False).index[:N]
topNMovieId

Int64Index([99813, 7099, 5617], dtype='int64', name='movieId')

In [30]:
pd.Series(topNMovieId).apply(movieTitle)

0       Batman: The Dark Knight Returns, Part 2 (2013)
1    Nausicaä of the Valley of the Wind (Kaze no ta...
2                                     Secretary (2002)
Name: movieId, dtype: object

In [31]:
pd.Series(topNMovieId).apply(movieGenre)

0                            Action|Animation
1    Adventure|Animation|Drama|Fantasy|Sci-Fi
2                        Comedy|Drama|Romance
Name: movieId, dtype: object

In [32]:
# Wrapping it up in a function
def topN(user, N=3, K=10):
    """Recommend the ``N`` best-rated unseen movies for ``user``.

    Args:
        user: Id of the active user.
        N: Number of movies to recommend (default 3).
        K: Number of nearest neighbours whose ratings are averaged
            (default 10, the value previously used implicitly).

    Returns:
        A DataFrame with the columns 'Movie' (title) and 'Genre', one row per
        recommended movie, ordered by descending average neighbour rating.
    """
    KnearestUsers = nearestNeighbours(user, K)
    NNRatings = userMovieRatingMatrix[userMovieRatingMatrix.index.isin(KnearestUsers)]
    # DataFrame.mean() skips NaN; movies that no neighbour rated come out as
    # NaN and are dropped. This avoids the RuntimeWarning np.nanmean emits
    # for all-NaN columns.
    avgRating = NNRatings.mean().dropna()
    # The user's own row gives the movies already rated; no need to transpose
    # the whole matrix to read it.
    moviesAlreadySeen = userMovieRatingMatrix.loc[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(moviesAlreadySeen)]
    topNMovieId = pd.Series(avgRating.sort_values(ascending=False).index[:N])
    return pd.DataFrame({
        'Movie': topNMovieId.apply(movieTitle),
        'Genre': topNMovieId.apply(movieGenre),
    })

In [33]:
favMovie(3,5)

Unnamed: 0,userId,movieId,rating,Title,Genre
289,3,5181,5.0,Hangar 18 (1980),Action|Sci-Fi|Thriller
298,3,70946,5.0,Troll 2 (1990),Fantasy|Horror
296,3,7991,5.0,Death Race 2000 (1975),Action|Sci-Fi
266,3,849,5.0,Escape from L.A. (1996),Action|Adventure|Sci-Fi|Thriller
294,3,6835,5.0,Alien Contamination (1980),Action|Horror|Sci-Fi


In [34]:
# Ignore RuntimeWarning (e.g. NumPy's "Mean of empty slice") only while this
# call runs, so that warnings raised elsewhere in the session stay visible.
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    recommendations = topN(3, 5)

recommendations

Unnamed: 0,Movie,Genre
0,Reform School Girls (1986),Action|Drama
1,Watership Down (1978),Adventure|Animation|Children|Drama|Fantasy
2,"Omega Man, The (1971)",Action|Drama|Sci-Fi|Thriller
3,Yojimbo (1961),Action|Adventure
4,"Mystery, Alaska (1999)",Comedy|Drama
