In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw movie metadata table.
# NOTE(review): the path looks like a Google Drive mount inside Colab — adjust it when running elsewhere.
movies = pd.read_csv('/content/drive/MyDrive/RecomSysData/movies_metadata.csv')
# Preview the first rows to check that the file was read as expected.
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Building a knowledge-based recommender
 
A knowledge-based recommender relies on asking the user for some information, such as: which genre do they prefer? How long should the film be? Which release period are they interested in?

We will make our dataframe smaller by keeping only the columns we need.

In [3]:
# Keep only the columns the recommender needs. The explicit .copy() makes `df`
# an independent frame, so the column assignments in later cells modify `df`
# itself instead of a slice of `movies` (which raises SettingWithCopyWarning,
# currently hidden by the blanket warnings filter in the import cell).
df = movies[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   genres        45466 non-null  object 
 2   release_date  45379 non-null  object 
 3   runtime       45203 non-null  float64
 4   vote_average  45460 non-null  float64
 5   vote_count    45460 non-null  float64
dtypes: float64(3), object(3)
memory usage: 2.1+ MB


## Date:

In [5]:
# Convert release_date from string to datetime; errors='coerce' turns
# values that cannot be parsed into NaT instead of raising.
df['release_date'] = pd.to_datetime(df.release_date, errors='coerce')

In [6]:
# Derive the release year from the parsed date. The column is float because
# rows with a missing date yield NaN.
df['year'] = df['release_date'].dt.year
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,1995.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,1995.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,1995.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,1995.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,1995.0


In [7]:
df.year.isna().sum()

90

In [8]:
# Movies without a known release year get the sentinel value 0.
df['year'] = df['year'].fillna(0)

In [9]:
df.year.isna().sum()

0

In [10]:
# Drop release_date now that `year` has been extracted from it.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['release_date'], errors='ignore')

## Genres:

In [11]:
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [12]:
type(df.iloc[0]['genres'])

str

We need to extract the genres of each movie into a format we can work with directly. Before that, the `genres` strings have to be parsed into Python lists.

In [13]:
# Parse each stringified genre list into Python objects. Only strings are
# parsed, so re-running this cell on already-parsed lists is a no-op instead
# of making literal_eval raise on a non-string value.
df['genres'] = df['genres'].apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)


In [14]:
type(df.iloc[0]['genres'])

list

In [15]:
# Reduce each parsed genre list to just the genre names; any value that is
# not a list becomes an empty list.
df['genres'] = df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995.0
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995.0
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995.0
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995.0
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995.0


In [16]:
# Create a new feature by exploding genres: one entry per (movie, genre) pair,
# indexed by the movie's original row label.
# Series.explode replaces the row-wise `apply(pd.Series).stack()` construction,
# which builds one Series per movie and is very slow on large frames.
# dropna() removes the NaN placeholder that explode emits for movies with an
# empty genre list, matching what stack() produced.
s = df['genres'].explode().dropna()
# Leave the result unnamed, as the stacked version was.
s.name = None
s

0        Animation
0           Comedy
0           Family
1        Adventure
1          Fantasy
           ...    
45461       Family
45462        Drama
45463       Action
45463        Drama
45463     Thriller
Length: 91106, dtype: object

In [17]:
# Name the exploded values so the joined column is called 'genre', then
# attach them to the movie rows by index: a movie with several genres
# yields one row per genre.
s.name = 'genre'
gen_df = df.drop('genres', axis=1).join(s)
gen_df.head()

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995.0,Animation
0,Toy Story,81.0,7.7,5415.0,1995.0,Comedy
0,Toy Story,81.0,7.7,5415.0,1995.0,Family
1,Jumanji,104.0,6.9,2413.0,1995.0,Adventure
1,Jumanji,104.0,6.9,2413.0,1995.0,Fantasy


# Building the chart:

In [19]:
def build_chart(gen_def, perc = .8):
  """Interactively build a ranked chart of movies matching the user's preferences.

  Prompts on stdin for a genre, a runtime range and a release-year range,
  filters ``gen_def`` accordingly, and ranks the remaining movies by a
  weighted rating:

      score = v / (v + m) * R + m / (v + m) * C

  where ``v`` is a movie's vote_count, ``R`` its vote_average, ``m`` the
  ``perc`` quantile of vote_count among the filtered movies and ``C`` the
  mean vote_average of the filtered movies.

  Parameters
  ----------
  gen_def : pandas.DataFrame
      One row per (movie, genre) pair with the columns ``genre``, ``runtime``,
      ``year``, ``vote_average`` and ``vote_count``. It is not modified.
  perc : float, default 0.8
      Quantile of vote_count a movie must reach to qualify for the chart.

  Returns
  -------
  pandas.DataFrame
      The qualifying rows plus a ``score`` column, sorted by score in
      descending order. Empty when no movie matches the filters.

  Raises
  ------
  ValueError
      If a duration or year answer cannot be converted to an integer.
  """
  # Ask for preferred genre. The answer is normalised because it is compared
  # against the lower-cased genre column below.
  print("Input preferred genre:")
  genre = input().strip().lower()

  # Ask for lower limit of duration:
  print("Input shortest duration: ")
  low_time = int(input())

  # Ask for upper limit of duration:
  print("Input longest duration: ")
  high_time = int(input())

  # Ask for lower limit of timeline
  print('Input earliest year: ')
  low_year = int(input())

  # Ask for upper limit of timeline
  print("Input latest year: ")
  high_year = int(input())

  # Work on a copy of the frame that was passed in (not on a global), so the
  # caller's data is never modified.
  movies = gen_def.copy()

  # Filtering
  movies = movies[(movies.genre.str.lower() == genre)
                  & (movies.runtime >= low_time)
                  & (movies.runtime <= high_time)
                  & (movies.year >= low_year)
                  & (movies.year <= high_year)
                  ]

  # C: mean rating of the filtered movies; m: minimum votes needed to qualify.
  C = movies.vote_average.mean()
  m = movies.vote_count.quantile(perc)

  q_movies = movies.loc[movies.vote_count >= m].copy()

  # Weighted rating, computed column-wise; this also works when no movie
  # qualifies, where a row-wise apply would not return a single column.
  votes = q_movies['vote_count']
  rating = q_movies['vote_average']
  q_movies['score'] = (votes / (votes + m) * rating) + (m / (m + votes) * C)

  # Best-scoring movies first.
  q_movies = q_movies.sort_values(by = 'score', ascending = False)

  return q_movies




# Testing

In [20]:
build_chart(gen_df).head()

Input preferred genre:
animation
Input shortest duration: 
30
Input longest duration: 
120
Input earliest year: 
1990
Input latest year: 
2005


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
9698,Howl's Moving Castle,119.0,8.2,2049.0,2004.0,Animation,7.994823
359,The Lion King,89.0,8.0,5520.0,1994.0,Animation,7.926672
0,Toy Story,81.0,7.7,5415.0,1995.0,Animation,7.6375
6232,Finding Nemo,100.0,7.6,6292.0,2003.0,Animation,7.549423
546,The Nightmare Before Christmas,76.0,7.6,2135.0,1993.0,Animation,7.4605
