In [137]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

In [29]:
df=pd.read_csv("Main dataset.csv")

In [31]:
df.isnull().sum()

name                       0
User_Id                    0
Movie_Id                   0
rating                     0
genre                      0
year                       0
released                   0
score(IMDB)                0
score( Rotten Tomatoes)    0
score                      0
votes                      0
director                   0
star                       0
country                    0
company                    0
runtime                    0
dtype: int64

In [33]:
df.iloc[0:3].transpose()

Unnamed: 0,0,1,2
name,Mad Max: Fury Road,Avengers: Age of Ultron,Crimson Peak
User_Id,1,1,1
Movie_Id,1408,31,1029
rating,R,PG-13,R
genre,Action,Action,Drama
year,2015,2015,2015
released,"May 15, 2015 (United States)","May 1, 2015 (United States)","October 16, 2015 (United States)"
score(IMDB),7.5,7.1,6.5
score( Rotten Tomatoes),7.7,6.6,5.8
score,8.1,7.3,6.5


In [34]:
df.columns

Index(['name', 'User_Id', 'Movie_Id', 'rating', 'genre', 'year', 'released',
       'score(IMDB)', 'score( Rotten Tomatoes)', 'score', 'votes', 'director',
       'star', 'country', 'company', 'runtime'],
      dtype='object')

In [35]:
df.shape


(800, 16)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     800 non-null    object 
 1   User_Id                  800 non-null    int64  
 2   Movie_Id                 800 non-null    int64  
 3   rating                   800 non-null    object 
 4   genre                    800 non-null    object 
 5   year                     800 non-null    int64  
 6   released                 800 non-null    object 
 7   score(IMDB)              800 non-null    float64
 8   score( Rotten Tomatoes)  800 non-null    float64
 9   score                    800 non-null    float64
 10  votes                    800 non-null    int64  
 11  director                 800 non-null    object 
 12  star                     800 non-null    object 
 13  country                  800 non-null    object 
 14  company                  8

In [40]:
# this is V
vote_counts = df[df['votes'].notnull()]['votes'].astype('int')

# this is R
vote_averages = df[df['score'].notnull()]['score'].astype('int')

# this is C
C = vote_averages.mean()
C

6.115

In [41]:
m = vote_counts.quantile(0.95)
m

417049.99999999994

In [42]:
qualified = df[(df['votes'] >= m) &
               (df['votes'].notnull()) &
               (df['score'].notnull())][['name',
                                                'year',
                                                'votes',
                                                'score',
                                                'star',
                                                'genre']]

qualified['votes'] = qualified['votes'].astype('int')
qualified['score'] = qualified['score'].astype('int')
qualified.shape

(40, 6)

In [43]:
def weighted_rating(x):
    v = x['votes']
    R = x['score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [44]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [45]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [46]:
qualified.head(15)

Unnamed: 0,name,year,votes,score,star,genre,wr
205,Deadpool,2016,932000,8,Ryan Reynolds,Action,7.417265
0,Mad Max: Fury Road,2015,912000,8,Tom Hardy,Action,7.408495
601,Avengers: Infinity War,2018,897000,8,Robert Downey Jr.,Action,7.401743
12,The Martian,2015,783000,8,Matt Damon,Adventure,7.344911
11,The Revenant,2015,729000,8,Leonardo DiCaprio,Action,7.314045
418,Logan,2017,674000,8,Hugh Jackman,Action,7.279465
14,Inside Out,2015,641000,8,Amy Poehler,Animation,7.256992
217,La La Land,2016,527000,8,Ryan Gosling,Comedy,7.167269
405,Blade Runner 2049,2017,486000,8,Harrison Ford,Action,7.129462
222,Hacksaw Ridge,2016,457000,8,Andrew Garfield,Biography,7.100579


In [47]:
s = df.apply(lambda x: pd.Series(x['genre']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = df.drop('genre', axis=1).join(s)
gen_md.head(3).transpose()

Unnamed: 0,0,1,2
name,Mad Max: Fury Road,Avengers: Age of Ultron,Crimson Peak
User_Id,1,1,1
Movie_Id,1408,31,1029
rating,R,PG-13,R
year,2015,2015,2015
released,"May 15, 2015 (United States)","May 1, 2015 (United States)","October 16, 2015 (United States)"
score(IMDB),7.5,7.1,6.5
score( Rotten Tomatoes),7.7,6.6,5.8
score,8.1,7.3,6.5
votes,912000,777000,136000


In [50]:
def build_chart(genre, percentile=0.85):
    dff = gen_md[gen_md['genre'] == genre]
    vote_counts = dff[dff['votes'].notnull()]['votes'].astype('int')
    vote_averages = dff[dff['score'].notnull()]['score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified = dff[(dff['votes'] >= m) & (dff['votes'].notnull()) &
                   (dff['score'].notnull())][['name', 'year', 'votes', 'score', 'star']]
    qualified['votes'] = qualified['votes'].astype('int')
    qualified['score'] = qualified['score'].astype('int')

    qualified['wr'] = qualified.apply(lambda x:
                        (x['votes']/(x['votes']+m) * x['score']) + (m/(m+x['votes']) * C),
                        axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)

    return qualified

In [52]:
build_chart('Action').head(15)

Unnamed: 0,name,year,votes,score,star,wr
205,Deadpool,2016,932000,8,Ryan Reynolds,7.469314
0,Mad Max: Fury Road,2015,912000,8,Tom Hardy,7.460923
601,Avengers: Infinity War,2018,897000,8,Robert Downey Jr.,7.454455
11,The Revenant,2015,729000,8,Leonardo DiCaprio,7.369749
418,Logan,2017,674000,8,Hugh Jackman,7.335996
405,Blade Runner 2049,2017,486000,8,Harrison Ford,7.187209
20,Star Wars: Episode VII - The Force Awakens,2015,876000,7,Daisy Ridley,6.732358
1,Avengers: Age of Ultron,2015,777000,7,Robert Downey Jr.,6.70891
203,Captain America: Civil War,2016,694000,7,Chris Evans,6.685834
604,Black Panther,2018,661000,7,Chadwick Boseman,6.67561


In [56]:
links_small = df[df['Movie_Id'].notnull()]['Movie_Id'].astype('int')

In [59]:
smd = df[df['Movie_Id'].isin(links_small)]
smd.shape

(800, 16)

In [73]:
smd['country'] = smd['country'].fillna('')
smd['company'] = smd['company'].fillna('')
smd['description'] = smd['country'] + smd['company']
smd['description'] = smd['description'].fillna('')

In [74]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [75]:
tfidf_matrix.shape

(800, 1527)

In [76]:
# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [77]:
cosine_sim[0]
#cosine_sim.shape

array([1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.19271375, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.19271375, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.19271375, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.19271375, 0.        , 0.        , 0.     

In [78]:
smd = smd.reset_index()
titles = smd['name']
indices = pd.Series(smd.index, index=smd['name'])

In [79]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [80]:
get_recommendations('Avengers: Age of Ultron').head(10)

203         Captain America: Civil War
206                     Doctor Strange
406     Guardians of the Galaxy Vol. 2
601             Avengers: Infinity War
604                      Black Panther
614               Ant-Man and the Wasp
250              Manchester by the Sea
501                       Wonder Wheel
539                   Last Flag Flying
550    The Only Living Boy in New York
Name: name, dtype: object

In [81]:
get_recommendations('Ant-Man').head(10)

69                            Get Hard
533                          The House
114         Sleeping with Other People
67                            The Gift
167                 The Lazarus Effect
350            In a Valley of Violence
597    The Resurrection of Gavin Stone
637                          Halloween
639                    The First Purge
712                      Truth or Dare
Name: name, dtype: object

In [82]:
get_recommendations('X-Men: Apocalypse').head(10)

34                                  Fantastic Four
70                                Hitman: Agent 47
205                                       Deadpool
213                              X-Men: Apocalypse
231                   Independence Day: Resurgence
233    Miss Peregrine's Home for Peculiar Children
294               Mike and Dave Need Wedding Dates
328                                       Why Him?
418                                          Logan
420                                Alien: Covenant
Name: name, dtype: object

In [94]:
smd['star'] = smd['star'].fillna('')
smd['director'] = smd['director'].fillna('')
smd['soup'] = smd['star'] + smd['director'] + smd['genre']

In [95]:
smd['soup']

0                     Tom HardyGeorge MillerAction
1               Robert Downey Jr.Joss WhedonAction
2            Mia WasikowskaGuillermo del ToroDrama
3                        Vin DieselJames WanAction
4          Samuel L. JacksonQuentin TarantinoCrime
                          ...                     
795    Kristoffer JonerJohn Andreas AndersenAction
796               Tiffany HaddishTyler PerryComedy
797       Batsheva Dance CompanyJosé PadilhaAction
798         Daiki YamashitaKenji NagasakiAnimation
799                David StrathairnPerry LangDrama
Name: soup, Length: 800, dtype: object

In [98]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [100]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [129]:
smd = smd.reset_index(drop=True)
titles = smd['name']
indices = pd.Series(smd.index, index=smd['name'])

In [130]:
get_recommendations('Ant-Man').head(10)

614       Ant-Man and the Wasp
292             Swiss Army Man
584              Journey's End
745      The Catcher Was a Spy
0           Mad Max: Fury Road
1      Avengers: Age of Ultron
2                 Crimson Peak
3                    Furious 7
4            The Hateful Eight
6         Fifty Shades of Grey
Name: name, dtype: object

In [131]:
get_recommendations('Avengers: Age of Ultron').head(10)

601    Avengers: Infinity War
29                 The Intern
120       A Walk in the Woods
286             Dirty Grandpa
445                 Good Time
692                 High Life
740     The Old Man & the Gun
620     Under the Silver Lake
53     Straight Outta Compton
94           The Longest Ride
Name: name, dtype: object

In [132]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['name', 'votes', 'score', 'year']]
    vote_counts = movies[movies['votes'].notnull()]['votes'].astype('int')
    vote_averages = movies[movies['score'].notnull()]['score'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['votes'] >= m) & (movies['votes'].notnull()) &
                       (movies['score'].notnull())]
    qualified['votes'] = qualified['votes'].astype('int')
    qualified['score'] = qualified['score'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [133]:
improved_recommendations('Avengers: Age of Ultron')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['votes'] = qualified['votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['score'] = qualified['score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,name,votes,score,year,wr
0,Mad Max: Fury Road,912000,8,2015,7.408495
601,Avengers: Infinity War,897000,8,2018,7.401743
11,The Revenant,729000,8,2015,7.314045
5,Ant-Man,595000,7,2015,6.635305
10,Jurassic World,593000,7,2015,6.634583
4,The Hateful Eight,539000,7,2015,6.613944
7,Sicario,386000,7,2015,6.540391
9,The Big Short,381000,7,2015,6.537511
3,Furious 7,370000,7,2015,6.531047
6,Fifty Shades of Grey,304000,4,2015,5.2233


In [134]:
improved_recommendations('Ant-Man')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['votes'] = qualified['votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['score'] = qualified['score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,name,votes,score,year,wr
0,Mad Max: Fury Road,912000,8,2015,7.408495
12,The Martian,783000,8,2015,7.344911
11,The Revenant,729000,8,2015,7.314045
14,Inside Out,641000,8,2015,7.256992
15,Spotlight,435000,8,2015,7.077355
13,Room,386000,8,2015,7.021058
20,Star Wars: Episode VII - The Force Awakens,876000,7,2015,6.714559
1,Avengers: Age of Ultron,777000,7,2015,6.690893
10,Jurassic World,593000,7,2015,6.634583
4,The Hateful Eight,539000,7,2015,6.613944


In [138]:
# surprise reader API to read the dataset
reader = Reader()

In [142]:
data = Dataset.load_from_df(df[['User_Id', 'Movie_Id', 'score']], reader)
from surprise.model_selection import train_test_split

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=5)  # You can adjust the test_size as needed

# Now you can use trainset and testset for training and evaluating your model


In [144]:
from surprise.model_selection import cross_validate
# Define your model (SVD in this case)
svd = SVD()
# Perform cross-validation
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Print the results
print(results)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.7581  1.8242  1.7318  1.8157  1.6957  1.7651  0.0490  
MAE (testset)     1.5938  1.6569  1.5519  1.6594  1.5269  1.5978  0.0537  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
{'test_rmse': array([1.758124  , 1.82416077, 1.73178015, 1.81571267, 1.69567464]), 'test_mae': array([1.59375 , 1.656875, 1.551875, 1.659375, 1.526875]), 'fit_time': (0.011505126953125, 0.007996559143066406, 0.007854700088500977, 0.008374691009521484, 0.009116172790527344), 'test_time': (0.0013573169708251953, 0.0007953643798828125, 0.0007731914520263672, 0.0008752346038818359, 0.0009400844573974609)}
