In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
pd.set_option('display.max_columns', 200)

### Reading Datasets

In [4]:
movies = pd.read_csv('movies.csv')
print("movies.csv")
display(movies.head(1))

reviews = pd.read_csv('reviews.csv')
print("reviews.csv")
display(reviews.head(1))

search_logs = pd.read_csv('search_logs.csv')
print("search_logs.csv")
display(search_logs.head(1))

users = pd.read_csv('users.csv')
print("users.csv")
display(users.head(1))

watch_history = pd.read_csv('watch_history.csv')
print("watch_history.csv")
display(watch_history.head(1))



movies.csv


Unnamed: 0,movie_id,title,content_type,genre_primary,genre_secondary,release_year,duration_minutes,rating,language,country_of_origin,imdb_rating,production_budget,box_office_revenue,number_of_seasons,number_of_episodes,is_netflix_original,added_to_platform,content_warning
0,movie_0001,Dragon Legend,Stand-up Comedy,History,Thriller,2014,35.0,TV-Y,French,Japan,,,,,,False,2023-08-07,False


reviews.csv


Unnamed: 0,review_id,user_id,movie_id,rating,review_date,device_type,is_verified_watch,helpful_votes,total_votes,review_text,sentiment,sentiment_score
0,review_000001,user_07066,movie_0360,4,2025-03-29,Mobile,False,3.0,5.0,Fantastic cinematography and plot twists.,positive,0.711


search_logs.csv


Unnamed: 0,search_id,user_id,search_query,search_date,results_returned,clicked_result_position,device_type,search_duration_seconds,had_typo,used_filters,location_country
0,search_000001,user_09864,classic movies,2024-03-22,20,2.0,Tablet,12.4,False,False,Canada


users.csv


Unnamed: 0,user_id,email,first_name,last_name,age,gender,country,state_province,city,subscription_plan,subscription_start_date,is_active,monthly_spend,primary_device,household_size,created_at
0,user_00001,figueroajohn@example.org,Erica,Garza,43.0,Male,USA,Massachusetts,North Jefferyhaven,Basic,2024-04-08,True,36.06,Laptop,1.0,2023-04-01 14:40:50.540242


watch_history.csv


Unnamed: 0,session_id,user_id,movie_id,watch_date,device_type,watch_duration_minutes,progress_percentage,action,quality,location_country,is_download,user_rating
0,session_000001,user_07271,movie_0511,2025-11-13,Tablet,63.9,34.6,completed,HD,USA,False,


In [5]:
display(movies.shape)
display(reviews.shape)
display(search_logs.shape)
display(users.shape)
display(watch_history.shape)

(1040, 18)

(15450, 12)

(26500, 11)

(10300, 16)

(105000, 12)

### Content based

In [6]:
#Getting what user has seen
df = watch_history.merge(movies, on = 'movie_id', how = 'left')
print(df.shape)

(109237, 29)


In [7]:
df1 = df.drop_duplicates()
print(df1.shape)

(100000, 29)


In [8]:
df1

Unnamed: 0,session_id,user_id,movie_id,watch_date,device_type,watch_duration_minutes,progress_percentage,action,quality,location_country,is_download,user_rating,title,content_type,genre_primary,genre_secondary,release_year,duration_minutes,rating,language,country_of_origin,imdb_rating,production_budget,box_office_revenue,number_of_seasons,number_of_episodes,is_netflix_original,added_to_platform,content_warning
0,session_000001,user_07271,movie_0511,2025-11-13,Tablet,63.9,34.6,completed,HD,USA,False,,Dragon Princess,Movie,Music,,2018,92.0,G,Spanish,South Korea,5.2,1893934.0,1115568.0,,,False,2022-06-04,False
1,session_000002,user_00861,movie_0588,2025-02-26,Laptop,120.1,44.2,started,HD,USA,False,,Queen Queen,Movie,Sci-Fi,,1997,122.0,TV-Y,English,India,2.5,447581.0,391734641.0,,,False,2023-09-15,False
2,session_000003,user_05391,movie_0694,2024-12-15,Desktop,572.1,84.7,started,HD,Canada,False,1.0,Kingdom Day,Movie,Action,,1996,89.0,PG,English,UK,5.9,6072218.0,37239804.0,,,False,2023-01-14,False
3,session_000004,user_05192,movie_0234,2024-09-30,Desktop,395.3,89.9,completed,SD,USA,False,5.0,An Fire,TV Series,Horror,Music,1991,58.0,TV-Y7,Japanese,South Korea,8.5,,,,178.0,False,2021-01-11,False
4,session_000005,user_05735,movie_0390,2024-08-04,Tablet,14.6,6.2,completed,HD,USA,False,,Old Night,Movie,Fantasy,Sci-Fi,2004,94.0,TV-Y7,Spanish,USA,2.7,1231347.0,2483539.0,,,False,2024-07-05,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104018,session_099996,user_09970,movie_0637,2024-08-02,Laptop,4.0,20.0,completed,4K,USA,False,,Old Battle,TV Series,Family,History,2006,58.0,TV-MA,English,USA,5.5,,,1.0,83.0,True,2021-10-03,False
104019,session_099997,user_00534,movie_0386,2024-01-15,Laptop,48.0,,paused,HD,Canada,False,3.0,Hero Hero,Stand-up Comedy,War,Adventure,2015,42.0,PG,English,USA,8.6,,,,,False,2020-10-11,False
104020,session_099998,user_08894,movie_0639,2024-07-18,Smart TV,15.3,34.3,started,SD,USA,False,,Quest Fire,Limited Series,Sport,War,1996,68.0,TV-Y7,English,Canada,4.8,,,11.0,37.0,True,2021-05-03,False
104021,session_099999,user_02303,movie_0581,2024-08-06,Desktop,32.2,40.6,completed,HD,USA,False,3.0,Ice Kingdom,Documentary,Sci-Fi,War,2016,111.0,TV-Y,English,USA,2.7,,,,,False,2023-07-19,True


### Getting Features

In [9]:
df1.language.value_counts()

language
English     58465
Spanish      9853
French       6601
Hindi        5746
Japanese     5277
Italian      5162
Korean       4877
German       4019
Name: count, dtype: int64

### Processing Text Features

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

#applying tf-idf - Term Frequency Inverse Document Frequency which basically gives more score to rare words as they are rare and more discriminative power for 
#similarity search

tf = TfidfVectorizer()

ohe = OneHotEncoder()
df1['device_type'] = df1['device_type'].fillna('')
device_type_ft = ohe.fit_transform(df1[['device_type']])

df1['quality'] = df1['quality'].fillna('')
quality_type_ft = ohe.fit_transform(df1[['quality']])

df1['genre_primary'] = df1['genre_primary'].fillna('')
genre_primary_ft = tf.fit_transform(df1['genre_primary'])

df1['genre_secondary'] = df1['genre_secondary'].fillna('')
genre_secondary_ft = tf.fit_transform(df1['genre_secondary'])

df1['language'] = df1['language'].fillna('')
language_ft = ohe.fit_transform(df1[['language']])

print(device_type_ft.shape)
print(quality_type_ft.shape)
print(genre_primary_ft.shape)
print(genre_secondary_ft.shape)
print(language_ft.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['device_type'] = df1['device_type'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['quality'] = df1['quality'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['genre_primary'] = df1['genre_primary'].fillna('')
A value is trying to be set on a copy of a slice from a 

(100000, 5)
(100000, 4)
(100000, 21)
(100000, 21)
(100000, 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['language'] = df1['language'].fillna('')


### Binary Feautures True False

In [11]:
df1['is_netflix_original'] = df1['is_netflix_original'].fillna(False)
df1['is_netflix_original_ft'] = np.where(df1['is_netflix_original'] == True, 1, 0)

df1['content_warning'] = df1['content_warning'].fillna(False)
df1['content_warning_ft'] = np.where(df1['content_warning'] == True, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['is_netflix_original'] = df1['is_netflix_original'].fillna(False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['is_netflix_original_ft'] = np.where(df1['is_netflix_original'] == True, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from 

### Processing Numeric Features

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
sc = MaxAbsScaler()


is_netflix_original_ft = sc.fit_transform(df1[['is_netflix_original_ft']])
content_warning_ft = sc.fit_transform(df1[['content_warning_ft']])

df1['duration_minutes'] = df1['duration_minutes'].fillna(np.mean(df1['duration_minutes']))
duration_minutes_ft = sc.fit_transform(df1[['duration_minutes']])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['duration_minutes'] = df1['duration_minutes'].fillna(np.mean(df1['duration_minutes']))


### Creating Feature Matrix

In [32]:
#csr for effective row slicing

#giving more weights to genres
from scipy.sparse import hstack

feature_col = hstack([
    0.5*device_type_ft,
quality_type_ft,
5*genre_primary_ft,
genre_secondary_ft,
language_ft,4*is_netflix_original_ft,1*content_warning_ft,2*duration_minutes_ft
]).tocsr()

In [33]:
feature_col.shape

(100000, 62)

### User Profile

In [34]:
def content_based_recommend(user_id): 
    #filtering for a particular user
    user_data = df1[df1['user_id'] == user_id]

    liked_movies = user_data[user_data['user_rating'] >=3]
    
    movie_ids = list(set(user_data['movie_id']))

    #create feature_matrix
    liked = liked_movies.index
    feature_matrix = feature_col[liked,:]
    
    #getting rating matrix
    rating_matrix = liked_movies['user_rating'].values

    #weighted sum
    weighted_sum = feature_matrix.T @ rating_matrix

    user_profile = weighted_sum / rating_matrix.sum()
    print('User Profile', user_profile)

    #creating 2D vector for cosine similarity
    user_vector = user_profile.reshape(1,-1)
    print('User Vector', user_vector)

    similarity = cosine_similarity(user_vector, feature_col).flatten()

    df1['Sim_Score'] = similarity

    df2 = df1.sort_values(by = ['Sim_Score'], ascending = [False])

    #removing all the movies for the the user
    related_movies = df2[~df2['movie_id'].isin(movie_ids)]

    return related_movies.drop_duplicates(subset = ['movie_id']).head()
    


In [35]:
user_id = 'user_06554'
content_based_recommend(user_id)

User Profile [0.         0.         0.13636364 0.13636364 0.22727273 0.
 1.         0.         0.         0.         2.72727273 0.
 0.         0.         0.         2.27272727 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.27272727 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.54545455 0.45454545 0.
 0.         0.         0.         0.         0.         1.09090909
 0.         0.45423518]
User Vector [[0.         0.         0.13636364 0.13636364 0.22727273 0.
  1.         0.         0.         0.         2.72727273 0.
  0.         0.         0.         2.27272727 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.27272727 0.         0.         0.     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Sim_Score'] = similarity


Unnamed: 0,session_id,user_id,movie_id,watch_date,device_type,watch_duration_minutes,progress_percentage,action,quality,location_country,is_download,user_rating,title,content_type,genre_primary,genre_secondary,release_year,duration_minutes,rating,language,country_of_origin,imdb_rating,production_budget,box_office_revenue,number_of_seasons,number_of_episodes,is_netflix_original,added_to_platform,content_warning,is_netflix_original_ft,content_warning_ft,Sim_Score
43358,session_041688,user_03213,movie_0676,2025-01-30,Tablet,35.4,49.3,paused,HD,USA,True,,Dream Hero,Movie,Adventure,Adventure,2004,174.0,TV-Y7,English,France,9.0,416301.0,268961079.0,,,True,2024-02-04,False,1,0,0.764207
10131,session_009746,user_02887,movie_0905,2025-08-09,Tablet,37.1,44.9,started,HD,USA,False,,Dark Quest,Movie,Adventure,,2020,153.0,TV-Y,English,USA,6.3,,52461237.0,,,True,2024-02-17,False,1,0,0.761939
82681,session_079465,user_00392,movie_0660,2024-06-10,Tablet,50.3,73.6,paused,HD,Canada,False,,Fire Secret,Movie,Adventure,,1990,121.0,PG-13,English,South Korea,8.6,,3300259.0,,,True,2023-12-04,False,1,0,0.760933
88949,session_085518,user_03747,movie_0018,2025-01-09,Tablet,23.9,96.3,completed,HD,Canada,False,,Princess Phoenix,Movie,Adventure,,2011,116.0,PG,English,France,7.6,10555440.0,7278095.0,,,True,2021-04-04,False,1,0,0.760756
64731,session_062215,user_05019,movie_0406,2024-12-26,Tablet,69.0,73.7,paused,HD,USA,False,,Mystery Empire,Movie,Adventure,,2015,95.0,G,English,UK,7.2,3408376.0,,,,True,2023-07-14,False,1,0,0.75996


In [36]:
def evaluate_recommender(user_id, k=10):
    """
    Evaluate content-based recommender performance for a given user.
    """
    # Get recommendations for this user
    recs = content_based_recommend(user_id).head(k)

    # Movies this user has interacted with
    user_data = df1[df1['user_id'] == user_id]

    # Movies the user actually liked (rating >= 3)
    liked_movies = set(user_data[user_data['user_rating'] >= 3]['movie_id'])

    # Movies recommended
    recommended_movies = set(recs['movie_id'])

    # True positives (correct hits)
    tp = len(liked_movies & recommended_movies)
    fp = len(recommended_movies - liked_movies)
    fn = len(liked_movies - recommended_movies)

    # Precision@k
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    # Recall@k
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    return {
        "user_id": user_id,
        f"precision@{k}": precision,
        f"recall@{k}": recall,
        "true_positives": tp,
        "recommended": len(recommended_movies),
        "relevant": len(liked_movies)
    }

# Example evaluation
user_id = 'user_06554'
print(evaluate_recommender(user_id, k=5))

User Profile [0.         0.         0.13636364 0.13636364 0.22727273 0.
 1.         0.         0.         0.         2.72727273 0.
 0.         0.         0.         2.27272727 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.27272727 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.54545455 0.45454545 0.
 0.         0.         0.         0.         0.         1.09090909
 0.         0.45423518]
User Vector [[0.         0.         0.13636364 0.13636364 0.22727273 0.
  1.         0.         0.         0.         2.72727273 0.
  0.         0.         0.         2.27272727 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.27272727 0.         0.         0.     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Sim_Score'] = similarity
