In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from joblib import load
import random

# Function 1 Evaluation

In [None]:
from hybrid_recommender_rework import get_top_n_hybrid_recommendations

In [None]:
"""
Evaluate Precision@N for the hybrid recommendation pipeline.
Args:
    cf_model: Trained Surprise SVD model.
    ratings_df: DataFrame with UserID, MovieID, Rating columns.
    genres_dict: Dict mapping MovieID to genre vector.
    n: Number of recommendations to evaluate.
    test_size: Fraction of data for test set.
    threshold: Rating threshold for relevance (e.g., 4.0).
Returns:
    float: Mean Precision@N across test users.
"""

In [None]:
genres_dict = load(r'genres_dict_small.pkl')
ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")

In [None]:
cf_model = load(r'models/cf_model.pkl')

In [None]:
n=25 
test_size=0.2 
threshold=4.0

In [None]:
# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
# Split into train and test
_, testset = train_test_split(data, test_size=test_size, random_state=42)

In [None]:
# Convert testset to DataFrame for easier handling
test_df = pd.DataFrame(testset, columns=['UserID', 'MovieID', 'Rating'])

In [None]:
# Get relevant movies (ratings >= threshold) for each test user
relevant_movies = test_df[test_df['Rating'] >= threshold].groupby('UserID')['MovieID'].apply(set).to_dict()

In [None]:
print(relevant_movies.keys())

In [None]:
test_users = list(relevant_movies.keys())

In [None]:
# Sample test users (to speed up evaluation)
if len(test_users) > 1000:  # Limit for efficiency
    test_users = random.sample(test_users, 1000)

In [None]:
precision_scores = []
for user_id in test_users:
    # Get top-N recommendations
    top_n_movies = get_top_n_hybrid_recommendations(
        user_id=user_id,
        cf_model=cf_model,
        dataset=ratings_df,  # Use full data for profile, but predictions are on trainset
        genres_dict=genres_dict,
        k=200,
        n=n
    )
    
    # Get relevant movies for this user
    user_relevant = relevant_movies.get(user_id, set())
    
    # Compute Precision@N
    relevant_in_top_n = len(set(top_n_movies) & user_relevant)
    precision = relevant_in_top_n / n if n > 0 else 0.0
    precision_scores.append(precision)

In [None]:
print(f"Mean Precision@{n}: {np.mean(precision_scores)}")

# Function 2 Evaluation

In [2]:
from hybrid_recommender_rework import get_top_movies_for_demographic

In [3]:
top_n = 20
min_ratings = 5

In [4]:
ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")
clusters_df = pd.read_csv(r"ml_data/user_clusters.csv")
cluster_genres_df = pd.read_csv(r"ml_data/cluster_genres.csv")
movies_df = pd.read_csv(r"ml_data/movies_1m.csv")

In [5]:
# Step 1: Merge ratings with cluster assignments
ratings_with_clusters = pd.merge(ratings_df, clusters_df, on='UserID')

In [6]:
ratings_with_clusters.head(-1)

Unnamed: 0,UserID,MovieID,Rating,ClusterID
0,1,1193,5,-1
1,1,661,3,-1
2,1,914,3,-1
3,1,3408,4,-1
4,1,2355,5,-1
...,...,...,...,...
943465,6040,1090,3,-1
943466,6040,1091,1,-1
943467,6040,1094,5,-1
943468,6040,562,5,-1


In [7]:
ratings_with_clusters["ClusterID"].value_counts()

ClusterID
-1     200334
 8     124074
 7      98172
 9      92117
 6      80081
 5      67885
 3      58309
 4      54360
 1      47611
 2      46384
 0      43889
 10     30255
Name: count, dtype: int64

In [8]:
# Step 2: Calculate average ratings and counts per movie per cluster
cluster_movie_stats = ratings_with_clusters.groupby(['ClusterID', 'MovieID']).agg(
    avg_rating=('Rating', 'mean'),
    rating_count=('Rating', 'count')
).reset_index()

In [9]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count
0,-1,1,4.171084,415
1,-1,2,3.254777,157
2,-1,3,2.990741,108
3,-1,4,2.888889,27
4,-1,5,3.058824,68
...,...,...,...,...
37966,10,3944,5.000000,1
37967,10,3946,1.333333,3
37968,10,3948,3.869565,23
37969,10,3949,3.444444,9


In [10]:
# Step 3: Filter movies with sufficient ratings
cluster_movie_stats = cluster_movie_stats[cluster_movie_stats['rating_count'] >= min_ratings]

In [11]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count
0,-1,1,4.171084,415
1,-1,2,3.254777,157
2,-1,3,2.990741,108
3,-1,4,2.888889,27
4,-1,5,3.058824,68
...,...,...,...,...
37950,10,3926,3.400000,5
37951,10,3927,3.300000,10
37952,10,3928,3.363636,11
37968,10,3948,3.869565,23


In [12]:
# Step 4: Prepare genre data for matching
movies_df['genres_list'] = movies_df['Genres'].str.split('|')
cluster_genres_df['top_genres_list'] = cluster_genres_df['Genres Ranked by Score'].str.split(',')

In [13]:
cluster_genres_df.head(-1)

Unnamed: 0,ClusterID,Genres Ranked by Score,top_genres_list
0,-1,,
1,0,"Animation, Crime, Mystery, Musical, Romance","[Animation, Crime, Mystery, Musical, Romance]"
2,1,"Mystery, Crime, Animation, Western, Musical","[Mystery, Crime, Animation, Western, Musical]"
3,2,"Animation, Crime, Musical, Western, Mystery","[Animation, Crime, Musical, Western, Mystery]"
4,3,"Animation, Musical, Crime, Western, Mystery","[Animation, Musical, Crime, Western, Mystery]"
5,4,"Western, Crime, Mystery, Animation, Romance","[Western, Crime, Mystery, Animation, Romance]"
6,5,"Animation, Crime, Musical, Romance, Western","[Animation, Crime, Musical, Romance, Western]"
7,6,"Musical, Animation, Mystery, Crime, Western","[Musical, Animation, Mystery, Crime, Western]"
8,7,"Crime, Animation, Musical, Mystery, Thriller","[Crime, Animation, Musical, Mystery, Thril..."
9,8,"Crime, Animation, Musical, Mystery, Romance","[Crime, Animation, Musical, Mystery, Romance]"


In [14]:
# Merge movie genres into cluster_movie_stats
cluster_movie_stats = pd.merge(cluster_movie_stats, movies_df[['MovieID', 'genres_list']], on='MovieID')

In [15]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count,genres_list
0,-1,1,4.171084,415,"[Animation, Children's, Comedy]"
1,-1,2,3.254777,157,"[Adventure, Children's, Fantasy]"
2,-1,3,2.990741,108,"[Comedy, Romance]"
3,-1,4,2.888889,27,"[Comedy, Drama]"
4,-1,5,3.058824,68,[Comedy]
...,...,...,...,...,...
27052,10,3926,3.400000,5,"[Adventure, Sci-Fi]"
27053,10,3927,3.300000,10,"[Adventure, Sci-Fi]"
27054,10,3928,3.363636,11,"[Comedy, Horror]"
27055,10,3948,3.869565,23,[Comedy]


In [16]:
# Step 5: Check genre overlap and adjust scores
def has_top_genre(row, cluster_genres):
    cluster_top_genres = cluster_genres.get(row['ClusterID'], [])
    return any(genre in cluster_top_genres for genre in row['genres_list'])

In [17]:
cluster_genres_dict = cluster_genres_df.set_index('ClusterID')['top_genres_list'].to_dict()

In [18]:
print(cluster_genres_dict)

{-1: nan, 0: ['Animation', ' Crime', ' Mystery', ' Musical', ' Romance'], 1: ['Mystery', ' Crime', ' Animation', ' Western', ' Musical'], 2: ['Animation', ' Crime', ' Musical', ' Western', ' Mystery'], 3: ['Animation', ' Musical', ' Crime', ' Western', ' Mystery'], 4: ['Western', ' Crime', ' Mystery', ' Animation', ' Romance'], 5: ['Animation', ' Crime', ' Musical', ' Romance', ' Western'], 6: ['Musical', ' Animation', ' Mystery', ' Crime', ' Western'], 7: ['Crime', ' Animation', ' Musical', ' Mystery', ' Thriller'], 8: ['Crime', ' Animation', ' Musical', ' Mystery', ' Romance'], 9: ['Western', ' Animation', ' Crime', ' Musical', ' Mystery'], 10: ['Animation', ' Musical', ' Crime', " Children's", ' Mystery']}


In [19]:
cluster_genres_dict.pop(-1, None)  # Remove the entry for ClusterID -1

nan

In [20]:
print(cluster_genres_dict)

{0: ['Animation', ' Crime', ' Mystery', ' Musical', ' Romance'], 1: ['Mystery', ' Crime', ' Animation', ' Western', ' Musical'], 2: ['Animation', ' Crime', ' Musical', ' Western', ' Mystery'], 3: ['Animation', ' Musical', ' Crime', ' Western', ' Mystery'], 4: ['Western', ' Crime', ' Mystery', ' Animation', ' Romance'], 5: ['Animation', ' Crime', ' Musical', ' Romance', ' Western'], 6: ['Musical', ' Animation', ' Mystery', ' Crime', ' Western'], 7: ['Crime', ' Animation', ' Musical', ' Mystery', ' Thriller'], 8: ['Crime', ' Animation', ' Musical', ' Mystery', ' Romance'], 9: ['Western', ' Animation', ' Crime', ' Musical', ' Mystery'], 10: ['Animation', ' Musical', ' Crime', " Children's", ' Mystery']}


In [21]:
cluster_movie_stats['matches_top_genre'] = cluster_movie_stats.apply(
    lambda row: has_top_genre(row, cluster_genres_dict), axis=1
)

In [22]:
# Boost rating by 20% if genres match
cluster_movie_stats['adjusted_score'] = cluster_movie_stats.apply(
    lambda row: row['avg_rating'] * 1.2 if row['matches_top_genre'] else row['avg_rating'], axis=1
)

In [23]:
# Step 6: Rank and select top movies per cluster
top_movies = cluster_movie_stats.groupby('ClusterID').apply(
    lambda x: x.nlargest(top_n, 'adjusted_score')[['MovieID', 'adjusted_score']]
).reset_index()

  top_movies = cluster_movie_stats.groupby('ClusterID').apply(


In [24]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,-1,2148,2905,4.733333
1,-1,1440,2019,4.606299
2,-1,2240,3030,4.606061
3,-1,509,649,4.600000
4,-1,516,665,4.600000
...,...,...,...,...
234,10,26313,2078,4.916129
235,10,25680,595,4.885714
236,10,26214,1907,4.863158
237,10,26358,2137,4.863158


In [25]:
noiseIndexes = top_movies[top_movies['ClusterID'] == -1].index
top_movies = top_movies.drop(noiseIndexes).reset_index(drop=True)

In [26]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,0,3528,1223,5.431579
1,0,3285,720,5.400000
2,0,3481,1148,5.341935
3,0,3294,745,5.250000
4,0,4525,3114,5.165217
...,...,...,...,...
214,10,26313,2078,4.916129
215,10,25680,595,4.885714
216,10,26214,1907,4.863158
217,10,26358,2137,4.863158


In [27]:
top_movies['adjusted_score'] = top_movies.adjusted_score.round(3)

In [28]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,0,3528,1223,5.432
1,0,3285,720,5.400
2,0,3481,1148,5.342
3,0,3294,745,5.250
4,0,4525,3114,5.165
...,...,...,...,...
214,10,26313,2078,4.916
215,10,25680,595,4.886
216,10,26214,1907,4.863
217,10,26358,2137,4.863


In [29]:
# Convert to dictionary format
top_movies_dict = top_movies.groupby('ClusterID').apply(
    lambda x: list(zip(x['MovieID'], x['adjusted_score']))
).to_dict()

  top_movies_dict = top_movies.groupby('ClusterID').apply(


In [30]:
print(top_movies_dict)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

In [31]:
#top_movies = get_top_movies_for_demographic(
#    ratings_df= ratings_df,
#    clusters_df=clusters_df,
#    movies_df=movies_df,
#    cluster_genres_df=cluster_genres_df,
#)