# Content Based Filtering
## Movies Recommendation

In [1]:
import pandas as pd
import numpy as np

### Reading Data and PreProcessing

In [2]:
movies_df = pd.read_csv('../data/movies/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('../data/movies/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


### Creating Genre Table for all movieIds

In [4]:
moviesWithGenre_df = movies_df   # Creating a Copy

moviesWithGenre_df['genresList'] = moviesWithGenre_df['genres'].str.split('|')  # Creating Genres List

for index, row in movies_df.iterrows():
    for genre in row['genresList']:
         moviesWithGenre_df.at[index, genre] = 1

moviesWithGenre_df = moviesWithGenre_df.fillna(0)  # Replacing NaN with 0's
moviesWithGenre_df = moviesWithGenre_df.drop(['title', 'genres', 'genresList'], axis=1)
moviesWithGenre_df.head()

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Picking a random User for Recommendation

In [5]:
# np.random.seed(0)
userId = np.random.randint(1, np.max(np.unique(ratings_df['userId'])))
userId

8401

### Getting Input User Ratings

In [6]:
user_ratings_df = ratings_df[ratings_df['userId'] == userId]
user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
776981,8401,50,4.0,1365373531
776982,8401,110,4.5,1365373839
776983,8401,168,3.5,1365373213
776984,8401,236,3.5,1365373220
776985,8401,296,4.5,1365373698


##### Formating the user_ratings_df for just movie id and ratings

In [7]:
user_ratings_df = user_ratings_df.drop(['userId', 'timestamp'], axis=1)
user_ratings_df = user_ratings_df.reset_index(drop=True)
user_ratings_df.head()

Unnamed: 0,movieId,rating
0,50,4.0
1,110,4.5
2,168,3.5
3,236,3.5
4,296,4.5


### Creating Movies Matrix for the user Movies

In [8]:
user_moviesM_df = moviesWithGenre_df[moviesWithGenre_df['movieId'].isin(user_ratings_df['movieId'].tolist())]
user_moviesM_df = user_moviesM_df.reset_index(drop=True)
user_moviesM_df.head()

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,110,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,168,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,236,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,296,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Removing movieId from both the tables

In [9]:
user_moviesM_df = user_moviesM_df.drop('movieId', axis=1)
user_ratings_df = user_ratings_df.drop('movieId', axis=1)

### Multiplying user_ratings_df with user_moviesM_df to get User profile

In [10]:
user_profile = user_moviesM_df.transpose().dot(user_ratings_df['rating'])
user_profile

Adventure             18.0
Animation              3.5
Children              11.5
Comedy                85.0
Fantasy                0.0
Romance               52.0
Drama                 99.0
Action                42.0
Crime                 39.0
Thriller              30.5
Horror                 0.5
Mystery               18.0
Sci-Fi                 0.0
IMAX                   0.0
Documentary            0.0
War                   21.5
Musical                4.0
Western                4.5
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

### Creating Candidate Movie Matrix

In [11]:
candidate_moviesM_df = moviesWithGenre_df
candidate_moviesM_df = candidate_moviesM_df.set_index('movieId')
candidate_moviesM_df.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Multiplying Candidate Movie Matix with user Profile to get weighted Candidate movie Matrix and finding aggregate of each row to get recomendation of Each movie

In [12]:
recommendation_vector = ((candidate_moviesM_df * user_profile).sum(axis=1)) / (user_profile.sum())
recommendation_vector[:5]

movieId
1    0.275058
2    0.068765
3    0.319347
4    0.550117
5    0.198135
dtype: float64

### Finding top 20 recommendations

In [13]:
recommendation_vector = recommendation_vector.sort_values(ascending=False).head(20)
recommendation_vector = recommendation_vector.sort_index()
recommendation_vector

movieId
1912      0.712121
3893      0.712121
4719      0.818182
4956      0.761072
26236     0.740093
27344     0.698135
27781     0.755245
64645     0.730769
75408     0.860140
76153     0.860140
81132     0.784382
83266     0.803030
106078    0.738928
121370    0.748252
122787    0.730769
124681    0.754079
127341    0.810023
144338    0.765734
144606    0.712121
150268    0.765734
dtype: float64

In [14]:
movies_recomendation_df = movies_df.drop(movies_df.columns[3:], axis=1)
movies_recomendation_df = movies_recomendation_df[movies_recomendation_df['movieId'].isin(recommendation_vector.keys())]
movies_recomendation_df = movies_recomendation_df.set_index('movieId', drop=False)
movies_recomendation_df['recommendationFactor'] = recommendation_vector
movies_recomendation_df = movies_recomendation_df.sort_values('recommendationFactor', ascending=False)
movies_recomendation_df.reset_index(drop=True)

Unnamed: 0,movieId,title,genres,recommendationFactor
0,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,Action|Animation|Comedy|Crime|Drama|Mystery|Ro...,0.86014
1,76153,Lupin III: First Contact (Rupan Sansei: Faasut...,Action|Animation|Comedy|Crime|Drama|Mystery|Ro...,0.86014
2,4719,Osmosis Jones (2001),Action|Animation|Comedy|Crime|Drama|Romance|Th...,0.818182
3,127341,Longshot (2001),Action|Comedy|Crime|Drama|Romance|Thriller,0.810023
4,83266,Kaho Naa... Pyaar Hai (2000),Action|Adventure|Comedy|Drama|Mystery|Romance|...,0.80303
5,81132,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...,0.784382
6,144338,Holiday (2006),Action|Children|Comedy|Crime|Drama|Romance,0.765734
7,150268,Dilwale (2015),Action|Children|Comedy|Crime|Drama|Romance,0.765734
8,4956,"Stunt Man, The (1980)",Action|Adventure|Comedy|Drama|Romance|Thriller,0.761072
9,27781,Svidd Neger (2003),Comedy|Crime|Drama|Horror|Mystery|Romance|Thri...,0.755245
