In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("imdb_top_1000.csv")

In [3]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


# excluding excess feature

In [4]:
new_df = df.drop(['Poster_Link', 'Runtime','Certificate', 'Overview', 'Released_Year','No_of_Votes'],axis=1)

In [5]:
new_df.dtypes

Series_Title     object
Genre            object
IMDB_Rating     float64
Meta_score      float64
Director         object
Star1            object
Star2            object
Star3            object
Star4            object
Gross            object
dtype: object

#  Checking data for null values

In [6]:
new_df.isna().sum()

Series_Title      0
Genre             0
IMDB_Rating       0
Meta_score      157
Director          0
Star1             0
Star2             0
Star3             0
Star4             0
Gross           169
dtype: int64

# Imputation (meta-score[null] = imdb-score * 10):

In [7]:
def fill_na_with_imdb(row):
    if pd.isna(row['Meta_score']):
        return row['IMDB_Rating'] * 10
    else:
        return row['Meta_score']

new_df['Meta_score'] = new_df.apply(fill_na_with_imdb, axis=1)

# # Imputation (gross[null] = gross.mean()):

In [8]:
new_df['Gross'].fillna(0)
new_df['Gross'] = new_df['Gross'].str.replace(',', '')
new_df['Gross'] = new_df['Gross'].astype(float)
gross_mean = new_df['Gross'].mean()
new_df['Gross'] = new_df['Gross'].fillna(gross_mean)
new_df['Gross'].astype(int)

0       28341469
1      134966411
2      534858444
3       57300000
4        4360000
         ...    
995     68034750
996     68034750
997     30500000
998     68034750
999     68034750
Name: Gross, Length: 1000, dtype: int32

In [9]:
new_df.isna().sum()

Series_Title    0
Genre           0
IMDB_Rating     0
Meta_score      0
Director        0
Star1           0
Star2           0
Star3           0
Star4           0
Gross           0
dtype: int64

In [10]:
new_df.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,Gross
0,The Shawshank Redemption,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,28341469.0
1,The Godfather,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,134966411.0
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,534858444.0
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,57300000.0
4,12 Angry Men,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,4360000.0


# # now we gather actors (star1, star2, ...) and director in one column named Act_Drct so that we can create a column for each actor/director and also for each genre using get_dummies method:

In [11]:
new_df["Act_Drct"] = new_df['Star1'].astype(str) + ", " + new_df['Star2'].astype(str)+ ', ' + new_df["Star3"].astype(str)+ ', ' + new_df["Star4"].astype(str) + ', ' + new_df['Director'].astype(str)
new_df.drop(['Star1','Star2','Star3','Star4','Director'], axis=1, inplace=True)
new_df.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,Gross,Act_Drct
0,The Shawshank Redemption,Drama,9.3,80.0,28341469.0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,The Godfather,"Crime, Drama",9.2,100.0,134966411.0,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,534858444.0,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,57300000.0,"Al Pacino, Robert De Niro, Robert Duvall, Dian..."
4,12 Angry Men,"Crime, Drama",9.0,96.0,4360000.0,"Henry Fonda, Lee J. Cobb, Martin Balsam, John ..."


In [12]:
genres = new_df['Genre'].str.get_dummies(', ')
new_df = pd.concat([new_df, genres], axis=1)
new_df.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,Gross,Act_Drct,Action,Adventure,Animation,Biography,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,The Shawshank Redemption,Drama,9.3,80.0,28341469.0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Godfather,"Crime, Drama",9.2,100.0,134966411.0,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,534858444.0,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,57300000.0,"Al Pacino, Robert De Niro, Robert Duvall, Dian...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12 Angry Men,"Crime, Drama",9.0,96.0,4360000.0,"Henry Fonda, Lee J. Cobb, Martin Balsam, John ...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
Act_Drct = new_df['Act_Drct'].str.get_dummies(', ')
new_df = pd.concat([new_df, Act_Drct], axis=1)
new_df.drop(['Genre', 'Act_Drct'],axis=1, inplace=True)
new_df.head()

Unnamed: 0,Series_Title,IMDB_Rating,Meta_score,Gross,Action,Adventure,Animation,Biography,Comedy,Crime,...,Zooey Deschanel,Zoya Akhtar,Zoë Kravitz,Álvaro Guerrero,Çagan Irmak,Çetin Tekindor,Émile Vallée,Éric Toledano,Ömer Faruk Sorak,Özge Özberk
0,The Shawshank Redemption,9.3,80.0,28341469.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Godfather,9.2,100.0,134966411.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight,9.0,84.0,534858444.0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,The Godfather: Part II,9.0,90.0,57300000.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12 Angry Men,9.0,96.0,4360000.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Creating a new DF with several movies with scores so that the model can give us recommendation based on this user's taste:

In [14]:
sample_user_fav = pd.DataFrame() 
sample_user_fav = new_df[new_df['Series_Title'].isin(['Inception','Interstellar','Whiplash','Coraline','Up','Soul','Shutter Island','Se7en','Fight Club','The Shawshank Redemption','The Dark Knight'])]
sample_user_fav

Unnamed: 0,Series_Title,IMDB_Rating,Meta_score,Gross,Action,Adventure,Animation,Biography,Comedy,Crime,...,Zooey Deschanel,Zoya Akhtar,Zoë Kravitz,Álvaro Guerrero,Çagan Irmak,Çetin Tekindor,Émile Vallée,Éric Toledano,Ömer Faruk Sorak,Özge Özberk
0,The Shawshank Redemption,9.3,80.0,28341470.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight,9.0,84.0,534858400.0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,Inception,8.8,74.0,292576200.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Fight Club,8.8,66.0,37030100.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,Interstellar,8.6,74.0,188020000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27,Se7en,8.6,65.0,100125600.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
34,Whiplash,8.5,88.0,13092000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
145,Shutter Island,8.2,63.0,128012900.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146,Up,8.2,88.0,293004200.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
205,Soul,8.1,83.0,68034750.0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# here's the matrix of user's fave content:

In [16]:
user_rates = sample_user_fav.drop(['IMDB_Rating','Meta_score','Gross','Series_Title'],axis=1)
user_rates.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,...,Zoya Akhtar,Zoë Kravitz,Álvaro Guerrero,Çagan Irmak,Çetin Tekindor,Émile Vallée,Éric Toledano,Ömer Faruk Sorak,Özge Özberk,Rating
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,8.7
2,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,8.9
8,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,9.3
21,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,9.5


# Using the transpose function we can evaluate the content matrix with the scores our user provided:

In [17]:
user_profile = user_rates.transpose().dot(sample_user_fav['Rating'])
user_profile.sort_values(ascending=False).head(20)

Rating                  885.71
Drama                    62.90
Adventure                36.10
Christopher Nolan        28.40
Animation                25.60
Sci-Fi                   19.50
Leonardo DiCaprio        19.00
Action                   18.90
David Fincher            18.70
Brad Pitt                18.70
Mystery                  18.40
Crime                    18.30
Morgan Freeman           18.10
Comedy                   16.60
Pete Docter              16.60
Ken Watanabe             10.00
Elliot Page              10.00
Joseph Gordon-Levitt     10.00
Mackenzie Foy             9.50
Matthew McConaughey       9.50
dtype: float64

In [19]:
ndf = new_df.drop(['IMDB_Rating','Meta_score','Gross','Series_Title'],axis=1)
ndf.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,...,Zooey Deschanel,Zoya Akhtar,Zoë Kravitz,Álvaro Guerrero,Çagan Irmak,Çetin Tekindor,Émile Vallée,Éric Toledano,Ömer Faruk Sorak,Özge Özberk
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Here we calculate movies from dataset which get the highest scores based on the user's fave contents:

In [20]:
recommendationTable = ((ndf*user_profile).sum(axis=1))/(user_profile.sum())
recommendationTable.sort_values(ascending=False).head(20)

21     0.114093
27     0.107305
2      0.101258
8      0.093730
36     0.090707
783    0.087930
343    0.084474
741    0.082747
205    0.080032
9      0.079106
329    0.078983
496    0.078921
513    0.078612
767    0.078551
768    0.078489
361    0.078366
59     0.078304
146    0.078057
332    0.076885
986    0.076885
dtype: float64

In [21]:
recommendationTable = recommendationTable.sort_values(ascending=False)
recommendationTable.head()

21    0.114093
27    0.107305
2     0.101258
8     0.093730
36    0.090707
dtype: float64

In [22]:
recommendation = new_df.loc[ndf.index.isin(recommendationTable.head(100).keys())]

In [23]:
recommendation = recommendation[~recommendation['Series_Title'].isin(sample_user_fav['Series_Title'])]

In [24]:
recommendation.head(50)

Unnamed: 0,Series_Title,IMDB_Rating,Meta_score,Gross,Action,Adventure,Animation,Biography,Comedy,Crime,...,Zooey Deschanel,Zoya Akhtar,Zoë Kravitz,Álvaro Guerrero,Çagan Irmak,Çetin Tekindor,Émile Vallée,Éric Toledano,Ömer Faruk Sorak,Özge Özberk
5,The Lord of the Rings: The Return of the King,8.9,94.0,377845900.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,The Lord of the Rings: The Fellowship of the Ring,8.8,92.0,315544800.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,The Lord of the Rings: The Two Towers,8.7,87.0,342551400.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,Seppuku,8.6,85.0,68034750.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,Shichinin no samurai,8.6,98.0,269061.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,The Prestige,8.5,66.0,53089890.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37,The Departed,8.5,85.0,132384300.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
39,Gladiator,8.5,67.0,187705400.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43,The Lion King,8.5,88.0,422783800.0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,Avengers: Endgame,8.4,78.0,858373000.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
recommendation = df[df['Series_Title'].isin(recommendation['Series_Title'])]

In [34]:
recommendation.reset_index(inplace=True,drop=True)
print('movies recommended for sample user:\n\n', recommendation[['Title','IMDB']].head(15))


movies recommended for sample user:

                                                 Title  IMDB
0       The Lord of the Rings: The Return of the King   8.9
1   The Lord of the Rings: The Fellowship of the Ring   8.8
2               The Lord of the Rings: The Two Towers   8.7
3                                             Seppuku   8.6
4                                Shichinin no samurai   8.6
5                                        The Prestige   8.5
6                                        The Departed   8.5
7                                           Gladiator   8.5
8                                       The Lion King   8.5
9                                   Avengers: Endgame   8.4
10                              The Dark Knight Rises   8.4
11                                           Oldeuboi   8.4
12                               Inglourious Basterds   8.3
13                                           Das Boot   8.3
14                                 A Clockwork Orange   8.3
