In [7]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

metadata = 'movies_metadata.csv'
ratings = 'ratings.csv'
rt = pd.read_csv(ratings)
md = pd.read_csv(metadata)


In [8]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
null = md['genres'].isnull().sum()

print(f'Null Count: ', {null})

Null Count:  {0}


In [11]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [12]:
rt.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110.0,1.0,1425942000.0
1,1,147.0,4.5,1425942000.0
2,1,858.0,5.0,1425942000.0
3,1,1221.0,5.0,1425942000.0
4,1,1246.0,5.0,1425942000.0


In [13]:
rt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416158 entries, 0 to 1416157
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1416158 non-null  int64  
 1   movieId    1416157 non-null  float64
 2   rating     1416157 non-null  float64
 3   timestamp  1416157 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 43.2 MB


In [24]:
vote_mean = md[md['vote_average'].notnull()]['vote_average'].mean()

In [15]:
vars = md[md['vote_count'].notnull()][['vote_average', 'vote_count']]

In [16]:
vars['vote_mean'] = vote_mean

In [17]:
vars.head()

Unnamed: 0,vote_average,vote_count,vote_mean
0,7.7,5415.0,5.618207
1,6.9,2413.0,5.618207
2,6.5,92.0,5.618207
3,6.1,34.0,5.618207
4,5.7,173.0,5.618207


In [18]:
percentile_95 = vars['vote_count'].quantile(0.95)

In [23]:
percentile_95

434.0

In [20]:
vars['weighted_rating'] = (vars['vote_count'] * vars['vote_average'] + percentile_95 * vars['vote_mean']) / (vars['vote_count'] + percentile_95)

In [22]:
vars.head()

Unnamed: 0,vote_average,vote_count,vote_mean,weighted_rating
0,7.7,5415.0,5.618207,7.545529
1,6.9,2413.0,5.618207,6.704602
2,6.5,92.0,5.618207,5.772437
3,6.1,34.0,5.618207,5.653209
4,5.7,173.0,5.618207,5.641519


In [29]:
topmovies = vars[(vars['vote_count'] >= percentile_95)]

In [30]:
topmovies

Unnamed: 0,vote_average,vote_count,vote_mean,weighted_rating
0,7.7,5415.0,5.618207,7.545529
1,6.9,2413.0,5.618207,6.704602
5,7.7,1886.0,5.618207,7.310561
9,6.6,1194.0,5.618207,6.338269
15,7.8,1343.0,5.618207,7.267137
...,...,...,...,...
44624,7.3,598.0,5.618207,6.592734
44632,6.1,748.0,5.618207,5.923098
44678,7.5,2712.0,5.618207,7.240401
44842,6.2,1440.0,5.618207,6.065263


In [31]:
badmovies = topmovies - vars

In [32]:
badmovies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   vote_average     2274 non-null   float64
 1   vote_count       2274 non-null   float64
 2   vote_mean        2274 non-null   float64
 3   weighted_rating  2274 non-null   float64
dtypes: float64(4)
memory usage: 1.7 MB
