In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [20]:
# Load Movies Metadata
movies = pd.read_csv('dataset/Movies.csv', low_memory=False)

# Print the first three rows
movies.head(10)

Unnamed: 0,id,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count
0,851644,ko,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,170.54,2022-10-06,20th Century Girl,8.7,290
1,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988
2,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748
3,240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,44.957,1974-12-20,The Godfather Part II,8.6,10293
4,667257,es,Cosas imposibles,"Matilde is a woman who, after the death of her...",32.859,2021-06-17,Impossible Things,8.6,299
5,19404,hi,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",23.31,1995-10-19,Dilwale Dulhania Le Jayenge,8.6,3961
6,424,en,Schindler's List,The true story of how businessman Oskar Schind...,52.121,1993-12-15,Schindler's List,8.6,13486
7,620249,zh,罗小黑战记,"In the bustling human world, spirits live peac...",18.207,2019-08-27,The Legend of Hei,8.6,215
8,372754,ja,同級生,"Rihito Sajo, an honor student with a perfect s...",11.288,2016-02-20,Dou kyu sei – Classmates,8.5,263
9,129,ja,千と千尋の神隠し,"A young girl, Chihiro, becomes trapped in a st...",64.948,2001-07-20,Spirited Away,8.5,13595


In [21]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(10000, 27921)

In [22]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

array(['coincidental', 'coincidentally', 'coincides', 'coins',
       'coinvolgere', 'coinvolgerlo', 'coke', 'col', 'cola', 'colardo'],
      dtype=object)

In [23]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
cosine_sim.shape

(10000, 10000)

In [25]:
cosine_sim[1]

array([0.        , 1.        , 0.00483686, ..., 0.01099216, 0.        ,
       0.        ])

In [26]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [27]:
indices[:10]

title
20th Century Girl              0
The Godfather                  1
The Shawshank Redemption       2
The Godfather Part II          3
Impossible Things              4
Dilwale Dulhania Le Jayenge    5
Schindler's List               6
The Legend of Hei              7
Dou kyu sei – Classmates       8
Spirited Away                  9
dtype: int64

In [29]:
# Function that takes in movie title as input and outputs most similar movies
def make_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indices], sim_scores

In [30]:
make_recommendations('The Shawshank Redemption')

(6118                   In Hell
 186               Sherlock Jr.
 4671               Escape Plan
 3124                  Brubaker
 4517                Demolition
 799                 The Chorus
 4367                  One Shot
 6479    The 40 Year Old Virgin
 728             Cool Hand Luke
 634                Toy Story 3
 Name: title, dtype: object,
 [(6118, 0.14127834256457766),
  (186, 0.12628000964650968),
  (4671, 0.12067020534709928),
  (3124, 0.12061270641622489),
  (4517, 0.11895454741554504),
  (799, 0.1139567490391571),
  (4367, 0.11210338235638342),
  (6479, 0.11187685352483144),
  (728, 0.10404630593308992),
  (634, 0.09726826201205088)])

In [31]:
make_recommendations('The Godfather')

(3        The Godfather Part II
 1660    The Godfather Part III
 7699                Blood Ties
 9759                Proud Mary
 233          The Best of Youth
 254                 On My Skin
 155                      Joker
 516                Shoplifters
 5590                       Joe
 9001                  3 Ninjas
 Name: title, dtype: object,
 [(3, 0.431266117890593),
  (1660, 0.16298687443623486),
  (7699, 0.15581651544672043),
  (9759, 0.12932637511845338),
  (233, 0.1269216718204816),
  (254, 0.10257824136574865),
  (155, 0.1015232934232453),
  (516, 0.10114142160299325),
  (5590, 0.09954459145191885),
  (9001, 0.09635894379926283)])

In [34]:
import pandas as pd
import numpy as np

movies = pd.read_csv('dataset/movies.csv')
ratings = pd.read_csv('dataset/ratings.csv')

movies_ratings = movies.merge(ratings, on='movieId', how='left')
movies_ratings.head(5)

Unnamed: 0,movieId,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count,userId,rating,timestamp
0,851644,ko,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,170.54,2022-10-06,20th Century Girl,8.7,290,,,
1,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988,27.0,3.0,962685525.0
2,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988,43.0,5.0,848994876.0
3,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748,294.0,1.0,966596695.0
4,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748,603.0,2.0,963178727.0


In [35]:
user_ratings = movies_ratings.pivot_table(index='userId', columns=['title'], values='rating')
user_ratings.head(5)

title,10 Things I Hate About You,"10,000 BC",12 Angry Men,1408,15 Minutes,16 Blocks,1900,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,...,X-Men Origins: Wolverine,Y Tu Mamá También,Yamakasi,You Only Live Twice,Young Frankenstein,Zatoichi,Zodiac,Zombie Flesh Eaters,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,5.0,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,0.5,,,,,,,,,
4.0,,,,,,,,,,,...,,1.0,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


In [37]:
rated_movie = user_ratings['Pulp Fiction']
rated_movie.head(5)

userId
1.0   NaN
2.0   NaN
3.0   NaN
4.0   NaN
5.0   NaN
Name: Pulp Fiction, dtype: float64

In [38]:
sim_movies = user_ratings.corrwith(rated_movie)
sim_movies.dropna(inplace=True)
sim_movies = pd.DataFrame(sim_movies, columns=['similarities'])
sim_movies.head(5)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,similarities
title,Unnamed: 1_level_1
A Bridge Too Far,1.0
A River Runs Through It,-1.0
American Pie,0.560612
Barry Lyndon,1.0
Breakdown,-1.0


In [39]:
sim_movies.sort_values(by='similarities', ascending=False).head(5)

Unnamed: 0_level_0,similarities
title,Unnamed: 1_level_1
Tomorrow Never Dies,1.0
Pulp Fiction,1.0
The Thomas Crown Affair,1.0
The Talented Mr. Ripley,1.0
Galaxy Quest,1.0


In [40]:
movies_ratings['total_ratings'] = movies_ratings.groupby('movieId')['rating'].transform('count')
movies_ratings['mean_rating'] = movies_ratings.groupby('movieId')['rating'].transform('mean')
movies_ratings.head(5)

Unnamed: 0,movieId,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count,userId,rating,timestamp,total_ratings,mean_rating
0,851644,ko,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,170.54,2022-10-06,20th Century Girl,8.7,290,,,,0,
1,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988,27.0,3.0,962685525.0,2,4.0
2,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988,43.0,5.0,848994876.0,2,4.0
3,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748,294.0,1.0,966596695.0,2,1.5
4,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748,603.0,2.0,963178727.0,2,1.5


In [41]:
movie_stats = movies_ratings[['movieId', 'title', 'total_ratings', 'mean_rating']]
movie_stats.drop_duplicates('movieId', keep='first', inplace=True)

movie_stats.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_stats.drop_duplicates('movieId', keep='first', inplace=True)


Unnamed: 0,movieId,title,total_ratings,mean_rating
0,851644,20th Century Girl,0,
1,238,The Godfather,2,4.0
3,278,The Shawshank Redemption,2,1.5
5,240,The Godfather Part II,4,2.375
9,667257,Impossible Things,0,


In [42]:
pop_movies = movie_stats['total_ratings'] >= 50
pop_movies = movie_stats[pop_movies].sort_values(['total_ratings', 
                                                    'mean_rating'], ascending=False)
pop_movies.head()

Unnamed: 0,movieId,title,total_ratings,mean_rating
31855,318,The Million Dollar Hotel,317,4.429022
29444,296,Terminator 3: Rise of the Machines,307,4.197068
4021,593,Solaris,279,4.16129
10054,260,The 39 Steps,251,4.231076
2716,110,Three Colors: Red,237,4.031646


In [43]:
pop_movies.sort_values(by='total_ratings', ascending=True).head()

Unnamed: 0,movieId,title,total_ratings,mean_rating
24677,2105,American Pie,50,3.34
6454,910,The Big Sleep,50,4.01
8469,145,Breaking the Waves,51,3.245098
15394,2100,The Last Castle,51,3.117647
11341,1645,A Time to Kill,51,3.411765
