In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# Load the input data.

# Ratings table, read from ratings.csv in the working directory.
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')
ratings_df.head(n=5)

# Movie catalogue that the ratings refer to, read from movies.csv.
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
movies_df.head(n=5)

# DATA PREPROCESSING
#
# Titles carry the release year in parentheses, e.g. "Toy Story (1995)".
# Split that into a bare title plus a separate 'year' column, then turn the
# pipe-separated genre string into a Python list.

# Pull the four digits out from between the parentheses in a single pass: the
# parentheses are matched but sit outside the capture group, so only the
# digits are returned. Rows without a "(dddd)" group get NaN.
# The pattern is a raw string so that the regex backslashes are not parsed as
# (invalid) Python string escapes.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(dddd)" group from the title.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every title unchanged and make
# the later title lookups against user input find nothing.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)

# Trim the whitespace left behind where the year used to be. The vectorised
# .str accessor passes missing titles through as NaN instead of raising.
movies_df['title'] = movies_df['title'].str.strip()

# "Adventure|Animation|..." -> ['Adventure', 'Animation', ...]
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

# One-hot encode the genres: one 0/1 column per genre next to the movie
# metadata. The result goes into movies_copy; movies_df itself is not modified.

# Build every indicator column in one vectorised pass (re-join each genre list
# with '|' and let get_dummies split it) instead of writing cells one at a
# time inside an iterrows() loop.
genre_flags = movies_df['genres'].str.join('|').str.get_dummies(sep='|')

# get_dummies sorts its columns alphabetically; put them back in order of
# first appearance in the data, and store them as floats (0.0 / 1.0).
first_seen = movies_df['genres'].explode().dropna().unique()
genre_flags = genre_flags.reindex(columns=first_seen, fill_value=0).astype(float)

# join() returns a new frame aligned on the row index.
movies_copy = movies_df.join(genre_flags)

# Replace any remaining NaN with 0. Note this applies to every column, so a
# movie whose title had no "(dddd)" year ends up with year == 0 here.
movies_copy = movies_copy.fillna(0)
movies_copy.head()

# Back to the ratings table: the timestamp column is not needed, so discard it.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

# CONTENT BASED RECOMMENDATION SYSTEM

# Hard-coded sample of what a user might enter: a few movie titles, each with
# the rating that user gave it. Stored as a list of {'title', 'rating'} dicts.
user_input = [
    {'title': name, 'rating': score}
    for name, score in (
        ('Grand Slam', 5.6),
        ('Zero', 7),
        ('Jumanji', 8.5),
        ('Toy Story', 4.5),
    )
]

# Tabular form of the same data: one row per rated movie.
movies_input = pd.DataFrame(data=user_input)
movies_input

# Attach a movieId to each user-rated title.

# Catalogue rows whose (cleaned) title is one of the titles the user rated.
rated_titles = movies_input['title']
input_id = movies_df.loc[movies_df['title'].isin(rated_titles)]

# Join the catalogue rows with the user's ratings. No key is given, so pandas
# joins on the columns the two frames have in common.
movies_input = input_id.merge(movies_input)
movies_input

# Genres and year are not needed on the user's table; drop them.
movies_input = movies_input.drop(columns=['genres', 'year'])
movies_input

# Build the user's genre profile from the movies they rated.

# One-hot rows for exactly the rated movies, renumbered 0..n-1.
rated_ids = movies_input['movieId']
movies_user = movies_copy.loc[movies_copy['movieId'].isin(rated_ids)].reset_index(drop=True)
movies_user

# Keep only the genre indicator columns.
UserGenreTable = movies_user.drop(columns=['movieId', 'title', 'genres', 'year'])
UserGenreTable

# Genre weights: (genres x movies) dot (ratings) gives, per genre, the sum of
# the ratings of the rated movies that carry that genre.
# NOTE(review): the dot product pairs rows by index label, so this relies on
# movies_user and movies_input listing the same movies in the same order.
UserProfile = UserGenreTable.T.dot(movies_input['rating'])
UserProfile

# Score every movie in the catalogue against the user profile.

# Genre indicator matrix for the whole catalogue, indexed by movieId.
GenreTable = movies_copy.set_index(movies_copy['movieId'])
GenreTable = GenreTable.drop(['movieId', 'title', 'genres', 'year'], axis=1)
GenreTable.head()

# Score = sum of the profile weights of the movie's genres, divided by the
# total profile weight.
# NOTE(review): if no input title matched the catalogue, UserProfile sums to 0
# and every score becomes NaN.
Recommendation_df = ((GenreTable * UserProfile).sum(axis=1)) / UserProfile.sum()

# Highest scores first.
Recommendation_df = Recommendation_df.sort_values(ascending=False)
Recommendation_df.head()

# Final recommendation table: catalogue rows for the 20 best-scoring movies.
top_ids = Recommendation_df.head(20).index
top_movies = movies_df.loc[movies_df['movieId'].isin(top_ids)]

# isin() hands the rows back in catalogue order, which would throw away the
# ranking computed above. Re-order them by their position in top_ids so the
# best match comes first; rows, columns and index labels are otherwise unchanged.
rank_by_id = {movie_id: position for position, movie_id in enumerate(top_ids)}
ranking = top_movies['movieId'].map(rank_by_id).to_numpy().argsort(kind='stable')
RecommendationTable = top_movies.iloc[ranking]
RecommendationTable

  movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))', expand = True)
  movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)', expand=True)
  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995
5,6,Heat (1995),"[Action, Crime, Thriller]",1995
6,7,Sabrina (1995),"[Comedy, Romance]",1995
7,8,Tom and Huck (1995),"[Adventure, Children]",1995
8,9,Sudden Death (1995),[Action],1995
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]",1995
