The task is to recommend movies to a user based on the movies that he/she provides.

Movies will be given by title.

In [1]:
import numpy as np
import pandas as pd

Start by inspecting our dataset

In [2]:
# links.csv maps each movieId to its imdbId and tmdbId (see the preview below).
links_df = pd.read_csv('data/links.csv')
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [3]:
# movies.csv holds, per movieId, the title (with the release year in
# parentheses) and the genres as a single '|'-separated string.
movies_df = pd.read_csv('data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# ratings.csv holds one rating per row: which userId rated which movieId,
# the rating value, and a timestamp (presumably Unix epoch seconds - TODO confirm).
ratings_df = pd.read_csv('data/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# tags.csv holds free-text tags: which userId attached which tag to which
# movieId, plus a timestamp. It is only inspected here.
tags_df = pd.read_csv('data/tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Let's assume that person $A$ likes the movies $M_0, M_1, \dots, M_i$,
and that there is a person $B$ who also likes one or more of those movies; let's call the shared ones $M_j$.

This means that $A$ and $B$ have at least one movie that they both like, so with high probability the other movies liked by either of them will be liked by both $A$ and $B$.

------------------

In [33]:
#The assumption above is the basis of a family of algorithms known as
#collaborative filtering: it makes it possible to discover comparable
#users or items and to generate results based on the ratings of those users.

In [7]:
# Attach the movie metadata (title, genres) to every rating row.
# The default inner join keeps only movies that have at least one rating.
# `validate` makes pandas raise a MergeError if a movieId occurs more than
# once in movies_df, which would otherwise silently duplicate rating rows.
df = movies_df.merge(ratings_df, on='movieId', validate='one_to_many')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [8]:
M_j = 'John Wick (2014)' # Title as input, now it's just one movie name.
N_TOP_RATERS = 5       # how many of the movie's highest raters to consult
N_PICKS_PER_RATER = 5  # how many top-rated movies to take from each of them

# All ratings of the input movie, highest rating first
movie_db = df[df['title'] == M_j]\
            .sort_values(by='rating', ascending=False)

# An unknown or misspelled title would otherwise produce an empty
# recommendation list here and an opaque IndexError in the genre-scoring
# cell further down, so fail early with a clear message.
if movie_db.empty:
    raise ValueError('No ratings found for title {!r}'.format(M_j))

recommended_movies = []

# Users behind the highest ratings of this movie. These are simply the top
# rows of the sorted table, so they can include low ratings when fewer than
# N_TOP_RATERS users rated the movie highly.
for user in movie_db.iloc[:N_TOP_RATERS]['userId'].values:

    # Everything this user has rated
    rated_movies = df[df['userId'] == user]

    # This user's highest-rated movies, excluding the input movie itself
    rated_movies = rated_movies[rated_movies['title'] != M_j]\
                    .sort_values(by='rating', ascending=False)\
                    .iloc[:N_PICKS_PER_RATER]

    # Add these to the recommendations
    recommended_movies.extend(list(rated_movies['title'].values))

# Several users may share favourites: np.unique removes the duplicates and
# returns the titles as an alphabetically sorted array.
recommended_movies = np.unique(recommended_movies)

for movie in recommended_movies:
    print(movie)

21 Jump Street (2012)
Addams Family, The (1991)
Aladdin (1992)
Batman Begins (2005)
Boondock Saints II: All Saints Day, The (2009)
Captain America: Civil War (2016)
Deadpool (2016)
Fight Club (1999)
Green Mile, The (1999)
Indiana Jones and the Temple of Doom (1984)
Jackass 2.5 (2007)
Jungle Book, The (1967)
King's Speech, The (2010)
Kingsman: The Secret Service (2015)
Opera (1987)
Pan's Labyrinth (Laberinto del fauno, El) (2006)
Predestination (2014)
Suspiria (1977)
The Godfather Trilogy: 1972-1990 (1992)
Toy Story (1995)
Visitor Q (Bizita Q) (2001)
Willow (1988)


In [9]:
# recommended_movies was already de-duplicated and sorted by np.unique at the
# end of the previous cell, so it can be printed as-is.
print(recommended_movies)

['21 Jump Street (2012)' 'Addams Family, The (1991)' 'Aladdin (1992)'
 'Batman Begins (2005)' 'Boondock Saints II: All Saints Day, The (2009)'
 'Captain America: Civil War (2016)' 'Deadpool (2016)' 'Fight Club (1999)'
 'Green Mile, The (1999)' 'Indiana Jones and the Temple of Doom (1984)'
 'Jackass 2.5 (2007)' 'Jungle Book, The (1967)'
 "King's Speech, The (2010)" 'Kingsman: The Secret Service (2015)'
 'Opera (1987)' "Pan's Labyrinth (Laberinto del fauno, El) (2006)"
 'Predestination (2014)' 'Suspiria (1977)'
 'The Godfather Trilogy: 1972-1990 (1992)' 'Toy Story (1995)'
 'Visitor Q (Bizita Q) (2001)' 'Willow (1988)']


Now weight each movie by its similarity to the input movie on the genre feature

In [10]:
# Genres of the input movie, read from the first row that matches its title.
gmovie_genres = df.loc[df['title'] == M_j, 'genres'].iloc[0].split('|')


def _shared_genre_count(title):
    """Return how many of the input movie's genres `title` also has."""
    candidate_genres = df.loc[df['title'] == title, 'genres'].iloc[0].split('|')
    return sum(1 for genre in gmovie_genres if genre in candidate_genres)


# {title: score ...}
scores = {title: _shared_genre_count(title) for title in recommended_movies}

# Sort ascending by score, then flip the list so the best matches come first.
# Flipping also reverses the order of titles that share the same score.
recommended_movies = list(reversed(sorted(scores, key=scores.get)))

The recommendations are now weighted

In [11]:
# Print the re-ranked titles, one per line, highest genre score first.
for movie in recommended_movies:
    print(movie)

Predestination (2014)
Fight Club (1999)
Captain America: Civil War (2016)
Boondock Saints II: All Saints Day, The (2009)
Willow (1988)
Pan's Labyrinth (Laberinto del fauno, El) (2006)
Kingsman: The Secret Service (2015)
Indiana Jones and the Temple of Doom (1984)
Deadpool (2016)
Batman Begins (2005)
21 Jump Street (2012)
Visitor Q (Bizita Q) (2001)
Toy Story (1995)
The Godfather Trilogy: 1972-1990 (1992)
Suspiria (1977)
Opera (1987)
King's Speech, The (2010)
Jungle Book, The (1967)
Jackass 2.5 (2007)
Green Mile, The (1999)
Aladdin (1992)
Addams Family, The (1991)


In [12]:
# Select a range of rows by position: the first ten rows of the merged table.
df.iloc[:10]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.5,1455209816
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,965705637
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5,1407618878
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27,3.0,962685262
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0,850466616


In [13]:
# Check the dtype of a single column. 'O' is NumPy's generic object dtype,
# which pandas uses here for the string titles.
df['title'].dtype

dtype('O')

In [14]:
# Check the dtypes of all columns at once (returned as a Series indexed by column name).
df.dtypes

movieId        int64
title         object
genres        object
userId         int64
rating       float64
timestamp      int64
dtype: object

In [15]:
# Summary statistics of the numeric columns. movieId and userId are
# identifiers, so only the statistics of `rating` (and the range of
# `timestamp`) are meaningful.
df.describe()

Unnamed: 0,movieId,userId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557,1205946000.0
std,35530.987199,182.618491,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,1199.0,177.0,3.0,1019124000.0
50%,2991.0,325.0,3.5,1186087000.0
75%,8122.0,477.0,4.0,1435994000.0
max,193609.0,610.0,5.0,1537799000.0


In [16]:
# Column-wise maximum. For the string columns (title, genres) this is the
# largest value in string sort order, not a "best" movie.
df.max()

movieId                                         193609
title        À nous la liberté (Freedom for Us) (1931)
genres                                         Western
userId                                             610
rating                                             5.0
timestamp                                   1537799250
dtype: object

In [17]:
# Column-wise minimum. For the string columns (title, genres) this is the
# smallest value in string sort order.
df.min()

movieId                       1
title                '71 (2014)
genres       (no genres listed)
userId                        1
rating                      0.5
timestamp             828124615
dtype: object

In [21]:
# Highest rating value present in the data.
df['rating'].max()

5.0

In [24]:
# Keep every rating row whose rating equals the highest rating in the data.
df.loc[df['rating'].eq(df['rating'].max())]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0,850466616
12,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,40,5.0,832058959
13,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,43,5.0,848993983
16,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,46,5.0,834787906
19,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,57,5.0,965796031
...,...,...,...,...,...,...
100787,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,98,5.0,1532457913
100790,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,249,5.0,1531611534
100791,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,305,5.0,1532877841
100801,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,586,5.0,1529899556


In [25]:
# Average of all individual ratings.
df['rating'].mean()

3.501556983616962

In [26]:
# Median of all individual ratings.
df['rating'].median()

3.5

In [27]:
# Standard deviation of all individual ratings (pandas uses the sample
# standard deviation, ddof=1, by default).
df['rating'].std()

1.042529239060635

In [28]:
# To select rows by their index labels (and columns by name) use .loc.
# Unlike positional slices, a label slice includes BOTH endpoints, so 10:20
# covers the labels 10 through 20. The list selects and orders the columns.
movies_df.loc[10:20,['title','movieId','genres']]

Unnamed: 0,title,movieId,genres
10,"American President, The (1995)",11,Comedy|Drama|Romance
11,Dracula: Dead and Loving It (1995),12,Comedy|Horror
12,Balto (1995),13,Adventure|Animation|Children
13,Nixon (1995),14,Drama
14,Cutthroat Island (1995),15,Action|Adventure|Romance
15,Casino (1995),16,Crime|Drama
16,Sense and Sensibility (1995),17,Drama|Romance
17,Four Rooms (1995),18,Comedy
18,Ace Ventura: When Nature Calls (1995),19,Comedy
19,Money Train (1995),20,Action|Comedy|Crime|Drama|Thriller


In [29]:
# To select rows or columns by their integer positions use .iloc.
# Positional slices exclude the end, so 10:20 yields the rows at positions
# 10..19; [0,1,2,3] picks the first four columns.
tags_df.iloc[10:20,[0,1,2,3]]

Unnamed: 0,userId,movieId,tag,timestamp
10,18,431,Al Pacino,1462138765
11,18,431,gangster,1462138749
12,18,431,mafia,1462138755
13,18,1221,Al Pacino,1461699306
14,18,1221,Mafia,1461699303
15,18,5995,holocaust,1455735472
16,18,5995,true story,1455735479
17,18,44665,twist ending,1456948283
18,18,52604,Anthony Hopkins,1457650696
19,18,52604,courtroom drama,1457650711


In [31]:
# Order all rating rows from the lowest to the highest rating and preview
# the lowest-rated ones.
df_sorted = df.sort_values('rating', ascending=True)
df_sorted.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
30953,1380,Grease (1978),Comedy|Musical|Romance,328,0.5,1494211348
62766,4621,Look Who's Talking (1989),Comedy|Romance,608,0.5,1117506916
55480,3564,"Flintstones in Viva Rock Vegas, The (2000)",Children|Comedy,517,0.5,1488398787
65364,5013,Gosford Park (2001),Comedy|Drama|Mystery,22,0.5,1268727056
82233,44972,Scary Movie 4 (2006),Comedy|Horror,111,0.5,1516151609


In [32]:
# Display the sorted frame; pandas truncates the view to the first and last rows.
df_sorted

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
30953,1380,Grease (1978),Comedy|Musical|Romance,328,0.5,1494211348
62766,4621,Look Who's Talking (1989),Comedy|Romance,608,0.5,1117506916
55480,3564,"Flintstones in Viva Rock Vegas, The (2000)",Children|Comedy,517,0.5,1488398787
65364,5013,Gosford Park (2001),Comedy|Drama|Mystery,22,0.5,1268727056
82233,44972,Scary Movie 4 (2006),Comedy|Horror,111,0.5,1516151609
...,...,...,...,...,...,...
66388,5299,My Big Fat Greek Wedding (2002),Comedy|Romance,542,5.0,1163387291
29333,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure,45,5.0,950726569
16653,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,304,5.0,891173910
47782,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery,367,5.0,997811157
