# 1. Load data và thư viện

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import time
from functools import wraps

import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv('../data/engineered_data.csv')
md = df
df.columns

Index(['belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'tagline', 'title', 'vote_average', 'vote_count',
       'cast', 'return', 'year', 'cast_size', 'crew_size', 'director',
       'is_Animation', 'is_Comedy', 'is_Family', 'is_Adventure', 'is_Fantasy',
       'is_Romance', 'is_Drama', 'is_Action', 'is_Crime', 'is_Thriller',
       'is_Horror', 'is_History', 'is_Science Fiction', 'is_Mystery', 'is_War',
       'is_Foreign', 'is_Music', 'is_Documentary', 'is_Western', 'is_TV Movie',
       'is_english', 'is_Friday', 'is_Holiday'],
      dtype='object')

# 2. HỆ THỐNG GỢI Ý ĐƠN GIẢN (SIMPLE RECOMMENDER)

Hệ thống gợi ý đơn giản dựa trên điểm đánh giá có trọng số (IMDB Weighted Rating) để xếp hạng các bộ phim phổ biến và chất lượng cao.

**Công thức:** `WR = (v/(v+m)) × R + (m/(v+m)) × C`

**Trong đó:**
- **WR**: Weighted Rating (điểm đánh giá có trọng số)
- **v**: Số phiếu bầu cho bộ phim
- **R**: Điểm trung bình của bộ phim  
- **m**: Số phiếu bầu tối thiểu để được tính điểm
- **C**: Điểm trung bình của tất cả các bộ phim

Công thức này giúp cân bằng giữa chất lượng (điểm cao) và độ phổ biến (nhiều vote).

In [49]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')

C = vote_averages.mean()

m = vote_counts.quantile(0.95)

print('Mean vote: ', C)
print('Minimum vote: ', m)

Mean vote:  5.239001265578413
Minimum vote:  425.29999999999563


In [4]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & 
               (md['vote_average'].notnull())][['title','overview', 'release_date', 
                                                'vote_count', 'vote_average', 'popularity', 'genres']]

qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2282, 7)

In [5]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

In [6]:
qualified['wr'] = qualified.apply(weighted_rating, m=m, C=C, axis=1)

In [7]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [8]:
qualified.head(10)

Unnamed: 0,title,overview,release_date,vote_count,vote_average,popularity,genres,wr
15510,Inception,"Cobb, a skilled thief who commits corporate es...",2010-07-14,14075,8,29.108149,5,7.91778
12501,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,12269,8,123.167259,4,7.90609
22921,Interstellar,Interstellar chronicles the adventures of a gr...,2014-11-05,11187,8,32.213481,3,7.897346
2848,Fight Club,A ticking-time-bomb insomniac and a slippery s...,1999-10-15,9678,8,63.869599,1,7.882025
4870,The Lord of the Rings: The Fellowship of the Ring,"Young hobbit Frodo Baggins, after inheriting a...",2001-12-18,8892,8,32.070725,3,7.872081
292,Pulp Fiction,"A burger-loving hit man, his philosophical par...",1994-09-10,8670,8,140.950236,2,7.868961
314,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,8358,8,51.645403,2,7.864311
7011,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,2003-12-01,8226,8,29.324358,3,7.862242
351,Forrest Gump,A man with a low IQ has accomplished great thi...,1994-07-06,8147,8,48.307194,3,7.860974
5824,The Lord of the Rings: The Two Towers,Frodo and Sam are trekking to Mordor to destro...,2002-12-18,7641,8,29.423537,3,7.852261


In [9]:
def build_chart(genre_column, percentile=0.85):
    df = md[md[genre_column] == 1]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & 
                   (df['vote_average'].notnull())][['title','overview', 'release_date', 'vote_count', 
                                                    'vote_average', 'popularity']]
    
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + 
                                      (m/(m+x['vote_count']) * C), axis=1)
    
    qualified = qualified.sort_values('wr', ascending=False).head(15)
    
    return qualified

In [10]:
build_chart('is_Romance').head(10)

Unnamed: 0,title,overview,release_date,vote_count,vote_average,popularity,wr
10325,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,661,9,34.457024,8.564746
351,Forrest Gump,A man with a low IQ has accomplished great thi...,1994-07-06,8147,8,48.307194,7.971314
878,Vertigo,A retired San Francisco detective suffering fr...,1958-05-09,1162,8,18.20822,7.811404
40314,Your Name.,High schoolers Mitsuha and Taki are complete s...,2016-08-26,1030,8,34.461252,7.789196
885,Some Like It Hot,Two musicians witness a mob hit and struggle t...,1959-03-18,835,8,11.845107,7.744804
1135,Cinema Paradiso,"A filmmaker recalls his childhood, when he fel...",1988-11-17,834,8,14.177005,7.744528
19936,Paperman,An urban office worker finds that paper airpla...,2012-11-02,734,8,7.198633,7.713562
37926,Sing Street,A boy growing up in Dublin during the 1980s es...,2016-03-11,669,8,10.672862,7.689064
884,The Apartment,Bud Baxter is a minor clerk in a huge New York...,1960-06-15,498,8,11.994281,7.598792
38781,The Handmaiden,"1930s Korea, in the period of Japanese occupat...",2016-06-01,453,8,16.727405,7.565604


In [11]:
build_chart('is_Animation').head(10)

Unnamed: 0,title,overview,release_date,vote_count,vote_average,popularity,wr
359,The Lion King,A young lion cub named Simba can't wait to be ...,1994-06-23,5520,8,21.605761,7.910398
5489,Spirited Away,A ten year old girl who wanders away from her ...,2001-07-20,3968,8,41.048867,7.877358
9714,Howl's Moving Castle,"When Sophie, a shy young woman, is cursed with...",2004-11-19,2049,8,16.136048,7.774586
2889,Princess Mononoke,"Ashitaka, a prince of the disappearing Ainu tr...",1997-07-12,2041,8,17.166725,7.773796
5843,My Neighbor Totoro,Two sisters move to the country with their fat...,1988-04-16,1730,8,13.507299,7.738103
40314,Your Name.,High schoolers Mitsuha and Taki are complete s...,2016-08-26,1030,8,34.461252,7.593864
5562,Grave of the Fireflies,"In the latter part of World War II, a boy and ...",1988-04-16,974,8,0.010902,7.575145
19936,Paperman,An urban office worker finds that paper airpla...,2012-11-02,734,8,7.198633,7.470566
39450,Piper,A mother bird tries to teach her little one ho...,2016-06-16,487,8,11.243161,7.290936
20815,Wolf Children,"Hana, a nineteen-year-old college student, fal...",2012-07-21,483,8,10.249498,7.287019


In [12]:
build_chart('is_Action').head(10)

Unnamed: 0,title,overview,release_date,vote_count,vote_average,popularity,wr
15510,Inception,"Cobb, a skilled thief who commits corporate es...",2010-07-14,14075,8,29.108149,7.955096
12501,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,12269,8,123.167259,7.948607
4870,The Lord of the Rings: The Fellowship of the Ring,"Young hobbit Frodo Baggins, after inheriting a...",2001-12-18,8892,8,32.070725,7.929574
7011,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,2003-12-01,8226,8,29.324358,7.924026
5824,The Lord of the Rings: The Two Towers,Frodo and Sam are trekking to Mordor to destro...,2002-12-18,7641,8,29.423537,7.918377
256,Star Wars,Princess Leia is captured and held hostage by ...,1977-05-25,6778,8,42.149697,7.908321
1157,The Empire Strikes Back,"The epic saga continues as Luke Skywalker, in ...",1980-05-17,5998,8,19.470959,7.896834
4141,Scarface,After getting a green card in exchange for ass...,1983-12-08,3017,8,11.299673,7.802031
9445,Oldboy,"With no clue how he came to be imprisoned, dru...",2003-01-01,2000,8,10.616859,7.711625
1914,Seven Samurai,A samurai answers a village's request for prot...,1954-04-26,892,8,15.01777,7.426085


# 3. Content Based Recommender

In [13]:
# Tạo subset dataframe
md_subset = md.sample(n=10000, random_state=42)
md_subset = md_subset[md_subset['id'].notnull()]['id'].astype('int')

md['id'] = md['id'].astype('int')
md = md[md['id'].isin(md_subset)]

print(f"Subset shape: {md.shape}")

Subset shape: (10052, 47)


## 3.1. Description-Based Content Filtering
Sử dụng ma trận TF-IDF để tạo vector từ nội dung mô tả phim (overview + tagline)

In [14]:
# Chuẩn bị dữ liệu text cho content-based filtering
# Kết hợp overview và tagline để tăng thông tin
md['description'] = md['overview'].fillna('') + ' ' + md['tagline'].fillna('')
md['description'] = md['description'].fillna('')

In [15]:
# Khởi tạo TF-IDF vectorizer
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(md['description'])

In [16]:
print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")

TF-IDF Matrix shape: (10052, 291890)


In [17]:

print("Computing cosine similarity matrix...")
start_time = time.time()

# Sử dụng linear_kernel cho hiệu suất tốt hơn với sparse matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

end_time = time.time()
print(f"Cosine similarity matrix computed in {end_time - start_time:.2f} seconds")
print(f"Similarity matrix shape: {cosine_sim.shape}")

Computing cosine similarity matrix...
Cosine similarity matrix computed in 0.61 seconds
Similarity matrix shape: (10052, 10052)


In [18]:
md = md.reset_index()
titles = md['title']
indices = pd.Series(md.index, index=md['title'])

print("Created title to index mapping")
print(f"Total unique movie titles: {len(indices)}")
print("\nSample mappings:")
print(indices.head())

Created title to index mapping
Total unique movie titles: 10052

Sample mappings:
title
Jumanji                        0
Father of the Bride Part II    1
Tom and Huck                   2
Dracula: Dead and Loving It    3
Nixon                          4
dtype: int64


In [19]:
# Function chính cho Content-Based Filtering
def content_based_filtering(title, cosine_sim=cosine_sim, top_n=25):
    try:
        idx = indices[title]
        
        sim_scores = list(enumerate(cosine_sim[idx]))
        
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        sim_scores = sim_scores[1:top_n+1]
        
        movie_indices = [i[0] for i in sim_scores]
        
        recommendations = md.iloc[movie_indices][['title', 'vote_average', 'vote_count', 'year', 'id', 'overview']].copy()
        recommendations['similarity_score'] = [score[1] for score in sim_scores]
        
        return recommendations
        
    except KeyError:
        print(f"Movie '{title}' not found in the dataset.")
        print("Available movies (first 10):")
        print(indices.head(10).index.tolist())
        return pd.DataFrame()

print("Content-based filtering function created successfully!")

Content-based filtering function created successfully!


In [20]:
content_based_filtering('Batman').head(10)

Unnamed: 0,title,vote_average,vote_count,year,id,overview,similarity_score
4755,Batman: Mystery of the Batwoman,6.6,87.0,2003.0,21683,"A new vigilante, Batwoman, is wreaking havoc i...",0.164201
5586,Batman vs Dracula,6.5,58.0,2005.0,20077,Gotham City is terrorized not only by recent e...,0.14107
28,Batman Forever,5.2,1529.0,1995.0,414,The Dark Knight of Gotham City confronts a das...,0.107033
3471,Batman: Under the Red Hood,7.6,459.0,2010.0,40662,Batman faces his ultimate challenge as the mys...,0.099774
9155,LEGO DC Comics Super Heroes: Justice League - ...,7.0,22.0,2016.0,396330,The caped crusader reluctantly agrees to let B...,0.090344
1758,Masterminds,4.1,13.0,1997.0,25224,Trapped in a Elite School which a gang of crim...,0.069281
302,Batman Returns,6.6,1706.0,1992.0,364,"Having defeated the Joker, Batman now faces th...",0.066773
6890,Batman v Superman: Dawn of Justice,5.7,7189.0,2016.0,209112,Fearing the actions of a god-like Super Hero l...,0.060297
9276,The Lego Batman Movie,7.2,1473.0,2017.0,324849,In the irreverent spirit of fun that made “The...,0.050473
9515,The Batman Shootings,2.5,1.0,2012.0,220890,"On July 20, 2012, a mass shooting occurred ins...",0.041445


In [21]:
content_based_filtering('The Matrix').head(10)

Movie 'The Matrix' not found in the dataset.
Available movies (first 10):
['Jumanji', 'Father of the Bride Part II', 'Tom and Huck', 'Dracula: Dead and Loving It', 'Nixon', 'Powder', 'Babe', 'Richard III', 'Mortal Kombat', 'The Usual Suspects']


## 3.2. Keyword Based Recommender
Thiết lập ma trận TF - IDF dựa trên keywords

In [22]:
md = df
keywords_df = pd.read_csv('../data/keywords.csv')
keywords_df.head(10)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
5,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
6,11860,"[{'id': 90, 'name': 'paris'}, {'id': 380, 'nam..."
7,45325,[]
8,9091,"[{'id': 949, 'name': 'terrorist'}, {'id': 1562..."
9,710,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."


In [23]:
md['id'] = md['id'].astype('int')
keywords_df['id'] = keywords_df['id'].astype('int')

In [24]:
md = md.merge(keywords_df, on='id')

In [25]:
md_subset = md.head(10000)

In [26]:
md_subset['keywords'] = md_subset['keywords'].fillna('')
md_subset['keywords'] = md_subset['keywords'].apply(lambda x: [i['name'] for i in eval(x)] 
                                                    if isinstance(x, str) else [])

md_subset['keywords_str'] = md_subset['keywords'].apply(lambda x: ' '.join(x))

In [27]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(md_subset['keywords_str'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [54]:
def key_based_recom(title, cosine_sim=cosine_sim, md=md_subset):
    try:
        idx = md[md['title'] == title].index[0]

        
        sim_scores = list(enumerate(cosine_sim[idx]))
        
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        sim_scores = sim_scores[1:11]
        
        movie_indices = [i[0] for i in sim_scores]
        
        recommendations = md.iloc[movie_indices][['title', 'vote_average', 'vote_count', 'year', 'id', 'overview']].copy()
        recommendations['similarity_score'] = [score[1] for score in sim_scores]

        return recommendations
    
    except KeyError:
        print(f"Movie '{title}' not found in the dataset.")
        print("Available movies (first 10):")
        print(indices.head(10).index.tolist())
        return pd.DataFrame()

In [55]:
key_based_recom('Batman').head(10)

Unnamed: 0,title,vote_average,vote_count,year,id,overview,similarity_score
1503,Batman & Robin,4.2,1447.0,1997.0,415,Along with crime-fighting partner Robin and ne...,0.879818
1337,Batman Returns,6.6,1706.0,1992.0,364,"Having defeated the Joker, Batman now faces th...",0.548796
3110,Batman: Mask of the Phantasm,7.4,218.0,1993.0,14919,An old flame of Bruce Wayne's strolls into tow...,0.464481
2540,Superman III,5.3,500.0,1983.0,9531,"Aiming to defeat the Man of Steel, wealthy exe...",0.422161
5074,Hero at Large,6.4,10.0,1980.0,44004,An idealistic but struggling actor finds his l...,0.391029
9272,Batman Beyond: Return of the Joker,7.5,152.0,2000.0,16234,"The Joker is back with a vengeance, and Gotham...",0.357012
9211,The Batman Superman Movie: World's Finest,7.1,53.0,1998.0,17074,Joker goes to Metropolis with an offer and pla...,0.354504
4778,The One,5.7,445.0,2001.0,10796,A sheriff's deputy fights an alternate univers...,0.332184
2538,Superman,6.9,1042.0,1978.0,1924,Mild-mannered Clark Kent works as a reporter a...,0.318164
3768,Supergirl,4.7,108.0,1984.0,9651,"After losing a powerful orb, Kara, Superman's ...",0.318017


# 4. Collaborative Filtering Recommender System

In [32]:
ratings_df = pd.read_csv('../data/ratings_small.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [33]:
reader = Reader()

data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

algo = SVD()

results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

for key, value in results.items():
    print(key, ':', value)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8998  0.8944  0.8948  0.9002  0.8944  0.8967  0.0027  
MAE (testset)     0.6938  0.6896  0.6893  0.6948  0.6862  0.6907  0.0031  
Fit time          0.58    0.62    0.58    0.74    0.61    0.63    0.06    
Test time         0.04    0.04    0.05    0.05    0.04    0.05    0.00    
test_rmse : [0.89981612 0.89435738 0.8948224  0.90016047 0.89436553]
test_mae : [0.69382893 0.68958385 0.68930899 0.69476548 0.68622844]
fit_time : (0.5791630744934082, 0.6243481636047363, 0.5834951400756836, 0.7356259822845459, 0.6116838455200195)
test_time : (0.04214596748352051, 0.04400038719177246, 0.04998779296875, 0.04851341247558594, 0.04499983787536621)


In [34]:
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e29f9c1950>

In [35]:
ratings_df[ratings_df['userId'] == 4]

Unnamed: 0,userId,movieId,rating,timestamp
147,4,10,4.0,949810645
148,4,34,5.0,949919556
149,4,112,5.0,949810582
150,4,141,5.0,949919681
151,4,153,4.0,949811346
...,...,...,...,...
346,4,3251,5.0,949918970
347,4,3255,4.0,949919738
348,4,3263,3.0,949919845
349,4,3265,5.0,949895732


In [36]:
algo.predict(1, 1371)

Prediction(uid=1, iid=1371, r_ui=None, est=2.236362354966043, details={'was_impossible': False})

In [37]:
start_time = time.time()

estimated_rating = round(algo.predict(5, 1371).est,2)

movie_title_1371 = md.loc[md['id'] == 1371, 'title'].values[0]

end_time = time.time()

processing_time_colab = end_time - start_time

print(f"The user 1 would rate the movie '{movie_title_1371}' as {estimated_rating} our of 5.0\n")
print("Processing Time:", round(processing_time_colab,3), "seconds")

The user 1 would rate the movie 'Rocky III' as 3.33 our of 5.0

Processing Time: 0.001 seconds


In [38]:
start_time = time.time()

estimated_rating = round(algo.predict(343, 631).est,2)

movie_title_1371 = md.loc[md['id'] == 631, 'title'].values[0]

end_time = time.time()

processing_time_colab = end_time - start_time

print(f"The user 343 would rate the movie '{movie_title_1371}' as {estimated_rating} our of 5.0\n")
print("Processing Time:", round(processing_time_colab,3), "seconds")

The user 343 would rate the movie 'Sunrise: A Song of Two Humans' as 3.26 our of 5.0

Processing Time: 0.0 seconds


In [39]:
start_time = time.time()

estimated_rating = round(algo.predict(1, 631).est,2)

movie_title_1371 = md.loc[md['id'] == 631, 'title'].values[0]

end_time = time.time()

processing_time_colab = end_time - start_time

print(f"The user 1 would rate the movie '{movie_title_1371}' as {estimated_rating} our of 5.0\n")
print("Processing Time:", round(processing_time_colab,3), "seconds")

The user 1 would rate the movie 'Sunrise: A Song of Two Humans' as 2.45 our of 5.0

Processing Time: 0.001 seconds


# 5. HYBRID RECOMMENDER SYSTEM

In [40]:
def content_based_filtering(title):
    """
    Perform content-based filtering to recommend similar movies based on a given movie title.

    Parameters:
        title (str): The title of the movie for which recommendations are sought.

    Returns:
        pd.DataFrame: DataFrame containing recommended movies and their relevant features.
    """
    idx = md_subset[md_subset['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    return md_subset.iloc[movie_indices][['title', 'vote_average', 'vote_count', 'year', 'id']]

In [41]:
def collaborative_filtering(userId, content_based_df):
    """
    Perform collaborative filtering to predict user ratings for movies.

    Parameters:
        userId (int): The ID of the user for whom ratings are to be predicted.
        content_based_df (pd.DataFrame): DataFrame containing movies recommended by content-based filtering.

    Returns:
        pd.DataFrame: DataFrame containing collaborative filtering estimates for recommended movies.
    """
    collab_estimates = []
    for id_ in content_based_df['id']:
        est = algo.predict(userId, id_).est
        collab_estimates.append(est)
    content_based_df['collab_est'] = collab_estimates
    return content_based_df

In [42]:
def calculate_hybrid_score(df):
    """
    Calculate the hybrid score for recommended movies using a combination of content-based and collaborative filtering.

    Parameters:
        df (pd.DataFrame): DataFrame containing recommended movies with content-based and collaborative filtering estimates.

    Returns:
        pd.DataFrame: DataFrame containing recommended movies with hybrid scores.
    """
    df['hybrid_score'] = df['vote_average'] * df['collab_est']
    return df

In [43]:
def hybrid_recommendations(userId, title):
    """
    Generate hybrid recommendations for a user based on a given movie title.

    Parameters:
        userId (int): The ID of the user for whom recommendations are to be generated.
        title (str): The title of the movie for which recommendations are sought.

    Returns:
        pd.DataFrame: DataFrame containing hybrid recommendations and their relevant features.
    """
    start_time = time.time()
    
    # Content-based filtering
    content_based_df = content_based_filtering(title)
    
    # Collaborative filtering
    collaborative_df = collaborative_filtering(userId, content_based_df)
    
    # Combine both approaches
    hybrid_df = calculate_hybrid_score(collaborative_df)
    
    # Select top recommendations
    hybrid_recommendations = hybrid_df.sort_values('hybrid_score', ascending=False).head(10)
    
    end_time = time.time()
    processing_time = end_time - start_time

    return hybrid_recommendations, processing_time

In [44]:
recommendations, processing_time_hyb = hybrid_recommendations(1, 'Superman')
print("Processing Time:", round(processing_time_hyb,3), "seconds\n")
print(recommendations[['title',"year","vote_average"]])

Processing Time: 0.005 seconds

                                          title    year  vote_average
585                                      Batman  1989.0           7.0
9272         Batman Beyond: Return of the Joker  2000.0           7.5
3110               Batman: Mask of the Phantasm  1993.0           7.4
9211  The Batman Superman Movie: World's Finest  1998.0           7.1
3884                                Unbreakable  2000.0           6.9
1337                             Batman Returns  1992.0           6.6
3686                                      X-Men  2000.0           6.8
6228                                         X2  2003.0           6.8
8594                              The Music Man  1962.0           6.7
2539                                Superman II  1980.0           6.5


In [57]:
recommendations, processing_time_hyb = hybrid_recommendations(671, 'Superman')
print("Processing Time:", round(processing_time_hyb,3), "seconds\n")
print(recommendations[['title',"year","vote_average"]])

Processing Time: 0.004 seconds

                                          title    year  vote_average
1337                             Batman Returns  1992.0           6.6
9272         Batman Beyond: Return of the Joker  2000.0           7.5
3110               Batman: Mask of the Phantasm  1993.0           7.4
585                                      Batman  1989.0           7.0
9211  The Batman Superman Movie: World's Finest  1998.0           7.1
3884                                Unbreakable  2000.0           6.9
3686                                      X-Men  2000.0           6.8
6228                                         X2  2003.0           6.8
8594                              The Music Man  1962.0           6.7
2539                                Superman II  1980.0           6.5


In [46]:
recommendations, processing_time_hyb = hybrid_recommendations(555, 'Batman')
print("Processing Time:", round(processing_time_hyb,3), "seconds\n")
print(recommendations[['title',"year","vote_average"]])

Processing Time: 0.004 seconds

                                          title    year  vote_average
9272         Batman Beyond: Return of the Joker  2000.0           7.5
3110               Batman: Mask of the Phantasm  1993.0           7.4
151                               Belle de Jour  1967.0           7.3
1337                             Batman Returns  1992.0           6.6
9211  The Batman Superman Movie: World's Finest  1998.0           7.1
2538                                   Superman  1978.0           6.9
3884                                Unbreakable  2000.0           6.9
2539                                Superman II  1980.0           6.5
6875                               The Hard Way  1991.0           6.1
8642                                     Batman  1966.0           6.1
