<a href="https://colab.research.google.com/github/Campeone/Learning-Deep-Learning/blob/main/IMDb_Movies_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Movie lens dataset

In [None]:
#import necessary library 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Read in the dataset 
movies = pd.read_csv('/content/drive/MyDrive/Recommender systems /ml-latest-small/movies.csv') 
ratings = pd.read_csv('/content/drive/MyDrive/Recommender systems /ml-latest-small/ratings.csv') 
links = pd.read_csv('/content/drive/MyDrive/Recommender systems /ml-latest-small/links.csv') 
tags = pd.read_csv('/content/drive/MyDrive/Recommender systems /ml-latest-small/tags.csv')

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
movies.shape

(9742, 3)

In [None]:
movies.duplicated().sum()

0

In [None]:
tags['tag'].value_counts()

In Netflix queue     131
atmospheric           36
thought-provoking     24
superhero             24
funny                 23
                    ... 
small towns            1
In Your Eyes           1
Lloyd Dobbler          1
weak plot              1
Heroic Bloodshed       1
Name: tag, Length: 1589, dtype: int64

There are different ways to carry out content-based recommendations with this dataset based on different features 
- 1) Movie title 
- 2) Movie Genre 
- 3) Movie tags. 

For collaborative based, we used the Movie ratings column.

### MOVIE RECOMMENDATION BASED ON GENRE

In [None]:

# split genre column by '|'
movies['genres'] = movies['genres'].str.split('|')

In [None]:
# convert the genre array into string type
movies['genres'] = movies['genres'].fillna("").astype('str')

In [None]:
movies['genres'].head()

0    ['Adventure', 'Animation', 'Children', 'Comedy...
1                 ['Adventure', 'Children', 'Fantasy']
2                                ['Comedy', 'Romance']
3                       ['Comedy', 'Drama', 'Romance']
4                                           ['Comedy']
Name: genres, dtype: object

### Vectorize the text

In [None]:
# import the necessary module 
from sklearn.feature_extraction.text import TfidfVectorizer 

# instantiate 
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 

# fit transform 
tf_matrix = tf.fit_transform(movies['genres']) 

tf_matrix.shape

(9742, 177)

In [None]:
tf_matrix[:5]

<5x177 sparse matrix of type '<class 'numpy.float64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [None]:
tf_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

### Compute cosine similarity

In [None]:
# import necessary library 
from sklearn.metrics.pairwise import cosine_similarity 

# compute similarity 
cos_sim = cosine_similarity(tf_matrix, tf_matrix) 

cos_sim[:6, :6]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111, 0.16761358,
        0.        ],
       [0.31379419, 1.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407, 0.36454626,
        0.        ],
       [0.05271111, 0.        , 0.35172407, 1.        , 0.31447995,
        0.        ],
       [0.16761358, 0.        , 0.36454626, 0.31447995, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        ]])

### Get recommendation

In [None]:
titles = movies['title'] 
indices = pd.Series(movies.index, index = movies['title'])

In [87]:
title.tail()

9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, dtype: object

In [81]:
def get_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:13]
    movie_indices = [i[0] for i in sim_scores]
    return pd.DataFrame(titles.iloc[movie_indices])

In [82]:
get_recommendation('Jumanji (1995)')

Unnamed: 0,title
53,"Indian in the Cupboard, The (1995)"
109,"NeverEnding Story III, The (1994)"
767,Escape to Witch Mountain (1975)
1514,Darby O'Gill and the Little People (1959)
1556,Return to Oz (1985)
1617,"NeverEnding Story, The (1984)"
1618,"NeverEnding Story II: The Next Chapter, The (1..."
1799,Santa Claus: The Movie (1985)
3574,Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075,"Chronicles of Narnia: The Lion, the Witch and ..."


In [88]:
get_recommendation('Bungo Stray Dogs: Dead Apple (2018)').head()

Unnamed: 0,title
7380,Batman: Under the Red Hood (2010)
7896,Superman/Doomsday (2007)
8080,"Batman: The Dark Knight Returns, Part 2 (2013)"
8931,Mortal Kombat: The Journey Begins (1995)
9740,Bungo Stray Dogs: Dead Apple (2018)


In [89]:
get_recommendation('Santa Claus: The Movie (1985)').head()

Unnamed: 0,title
53,"Indian in the Cupboard, The (1995)"
109,"NeverEnding Story III, The (1994)"
767,Escape to Witch Mountain (1975)
1514,Darby O'Gill and the Little People (1959)
1556,Return to Oz (1985)


In [90]:
get_recommendation('Waiting to Exhale (1995)').head()

Unnamed: 0,title
10,"American President, The (1995)"
47,Mighty Aphrodite (1995)
52,"Postman, The (Postino, Il) (1994)"
83,Beautiful Girls (1996)
165,Something to Talk About (1995)


### MOVIE RECOMMENDATION BASED ON TITLE

In [91]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [99]:
# remove the year and the parentheses from the movie name 
import re 
movies['title'] = movies['title'].str.replace(r'[^\w\s]|\d', '')

  movies['title'] = movies['title'].str.replace(r'[^\w\s]|\d', '')


In [100]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men,"['Comedy', 'Romance']"
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II,['Comedy']


### Vectorize the text

In [102]:
# import necessary modules 
from sklearn.feature_extraction.text import TfidfVectorizer 

# instantiate 
tf2 = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 
  
tf2_mat = tf2.fit_transform(movies['title'])

# shape
tf2_mat.shape

(9742, 20343)

### Compute Cosine similarity

In [106]:
# import necessary modules 
from sklearn.metrics.pairwise import cosine_similarity 

similarity = cosine_similarity(tf2_mat, tf2_mat) 

similarity[22:26, 23:26]

array([[0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [107]:
titles_t = movies['title']
indices_t = pd.Series(movies.index, index = movies['title'])

In [109]:
titles_t.tail()

9737    Black Butler Book of the Atlantic 
9738                 No Game No Life Zero 
9739                                Flint 
9740          Bungo Stray Dogs Dead Apple 
9741          Andrew Dice Clay Dice Rules 
Name: title, dtype: object

In [129]:
def get_recommendation_based_on_title(titlle):
    idx_t = indices_t[titlle]
    sim_scores_t = list(enumerate(similarity[idx_t]))
    sim_scores_t = sorted(sim_scores_t, key=lambda x: x[1], reverse=True)[1:13]
    movie_indices_t = [i[0] for i in sim_scores_t]
    return pd.DataFrame(titles.iloc[movie_indices_t])

In [130]:
get_recommendation_based_on_title('Toy Story ')

Unnamed: 0,title
2355,Toy Story 2 (1999)
7355,Toy Story 3 (2010)
3595,"Toy, The (1982)"
2227,"Story of Us, The (1999)"
4089,Toy Soldiers (1991)
3187,Love Story (1970)
1570,L.A. Story (1991)
2110,"Christmas Story, A (1983)"
4047,Ghost Story (1981)
8736,True Story (2015)


In [132]:
get_recommendation_based_on_title('Jumanji ')

Unnamed: 0,title
9636,Jumanji: Welcome to the Jungle (2017)
0,Toy Story (1995)
2,Grumpier Old Men (1995)
3,Waiting to Exhale (1995)
4,Father of the Bride Part II (1995)
5,Heat (1995)
6,Sabrina (1995)
7,Tom and Huck (1995)
8,Sudden Death (1995)
9,GoldenEye (1995)
