# Default Imports

In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from joblib import load
import random
from sklearn.metrics.pairwise import cosine_similarity

# Function 1 Testing

In [None]:
from hybrid_recommender_rework import get_top_n_hybrid_recommendations

In [None]:
"""
Evaluate Precision@N for the hybrid recommendation pipeline.
Args:
    cf_model: Trained Surprise SVD model.
    ratings_df: DataFrame with UserID, MovieID, Rating columns.
    genres_dict: Dict mapping MovieID to genre vector.
    n: Number of recommendations to evaluate.
    test_size: Fraction of data for test set.
    threshold: Rating threshold for relevance (e.g., 4.0).
Returns:
    float: Mean Precision@N across test users.
"""

In [None]:
genres_dict = load(r'genres_dict_small.pkl')
ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")

In [None]:
cf_model = load(r'models/cf_model.pkl')

In [None]:
n=25 
test_size=0.2 
threshold=4.0

In [None]:
# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
# Split into train and test
_, testset = train_test_split(data, test_size=test_size, random_state=42)

In [None]:
# Convert testset to DataFrame for easier handling
test_df = pd.DataFrame(testset, columns=['UserID', 'MovieID', 'Rating'])

In [None]:
# Get relevant movies (ratings >= threshold) for each test user
relevant_movies = test_df[test_df['Rating'] >= threshold].groupby('UserID')['MovieID'].apply(set).to_dict()

In [None]:
print(relevant_movies.keys())

In [None]:
test_users = list(relevant_movies.keys())

In [None]:
# Sample test users (to speed up evaluation)
if len(test_users) > 1000:  # Limit for efficiency
    test_users = random.sample(test_users, 1000)

In [None]:
precision_scores = []
for user_id in test_users:
    # Get top-N recommendations
    top_n_movies = get_top_n_hybrid_recommendations(
        user_id=user_id,
        cf_model=cf_model,
        dataset=ratings_df,  # Use full data for profile, but predictions are on trainset
        genres_dict=genres_dict,
        k=200,
        n=n
    )
    
    # Get relevant movies for this user
    user_relevant = relevant_movies.get(user_id, set())
    
    # Compute Precision@N
    relevant_in_top_n = len(set(top_n_movies) & user_relevant)
    precision = relevant_in_top_n / n if n > 0 else 0.0
    precision_scores.append(precision)

In [None]:
print(f"Mean Precision@{n}: {np.mean(precision_scores)}")

# Function 2 Validation

In [2]:
from hybrid_recommender_rework import get_top_movies_for_cluster

In [4]:
top_n = 20
min_ratings = 5

In [3]:
ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")
clusters_df = pd.read_csv(r"ml_data/user_clusters.csv")
cluster_genres_df = pd.read_csv(r"ml_data/cluster_genres.csv")
movies_df = pd.read_csv(r"ml_data/movies_1m.csv")

In [6]:
# Step 1: Merge ratings with cluster assignments
ratings_with_clusters = pd.merge(ratings_df, clusters_df, on='UserID')

In [7]:
ratings_with_clusters.head(-1)

Unnamed: 0,UserID,MovieID,Rating,ClusterID
0,1,1193,5,-1
1,1,661,3,-1
2,1,914,3,-1
3,1,3408,4,-1
4,1,2355,5,-1
...,...,...,...,...
943465,6040,1090,3,-1
943466,6040,1091,1,-1
943467,6040,1094,5,-1
943468,6040,562,5,-1


In [8]:
ratings_with_clusters["ClusterID"].value_counts()

ClusterID
-1     200334
 8     124074
 7      98172
 9      92117
 6      80081
 5      67885
 3      58309
 4      54360
 1      47611
 2      46384
 0      43889
 10     30255
Name: count, dtype: int64

In [9]:
# Step 2: Calculate average ratings and counts per movie per cluster
cluster_movie_stats = ratings_with_clusters.groupby(['ClusterID', 'MovieID']).agg(
    avg_rating=('Rating', 'mean'),
    rating_count=('Rating', 'count')
).reset_index()

In [10]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count
0,-1,1,4.171084,415
1,-1,2,3.254777,157
2,-1,3,2.990741,108
3,-1,4,2.888889,27
4,-1,5,3.058824,68
...,...,...,...,...
37966,10,3944,5.000000,1
37967,10,3946,1.333333,3
37968,10,3948,3.869565,23
37969,10,3949,3.444444,9


In [11]:
# Step 3: Filter movies with sufficient ratings
cluster_movie_stats = cluster_movie_stats[cluster_movie_stats['rating_count'] >= min_ratings]

In [12]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count
0,-1,1,4.171084,415
1,-1,2,3.254777,157
2,-1,3,2.990741,108
3,-1,4,2.888889,27
4,-1,5,3.058824,68
...,...,...,...,...
37950,10,3926,3.400000,5
37951,10,3927,3.300000,10
37952,10,3928,3.363636,11
37968,10,3948,3.869565,23


In [13]:
# Step 4: Prepare genre data for matching
movies_df['genres_list'] = movies_df['Genres'].str.split('|')
cluster_genres_df['top_genres_list'] = cluster_genres_df['Genres Ranked by Score'].str.split(',')

In [14]:
cluster_genres_df.head(-1)

Unnamed: 0,ClusterID,Genres Ranked by Score,top_genres_list
0,-1,,
1,0,"Animation, Crime, Mystery, Musical, Romance","[Animation, Crime, Mystery, Musical, Romance]"
2,1,"Mystery, Crime, Animation, Western, Musical","[Mystery, Crime, Animation, Western, Musical]"
3,2,"Animation, Crime, Musical, Western, Mystery","[Animation, Crime, Musical, Western, Mystery]"
4,3,"Animation, Musical, Crime, Western, Mystery","[Animation, Musical, Crime, Western, Mystery]"
5,4,"Western, Crime, Mystery, Animation, Romance","[Western, Crime, Mystery, Animation, Romance]"
6,5,"Animation, Crime, Musical, Romance, Western","[Animation, Crime, Musical, Romance, Western]"
7,6,"Musical, Animation, Mystery, Crime, Western","[Musical, Animation, Mystery, Crime, Western]"
8,7,"Crime, Animation, Musical, Mystery, Thriller","[Crime, Animation, Musical, Mystery, Thril..."
9,8,"Crime, Animation, Musical, Mystery, Romance","[Crime, Animation, Musical, Mystery, Romance]"


In [15]:
# Merge movie genres into cluster_movie_stats
cluster_movie_stats = pd.merge(cluster_movie_stats, movies_df[['MovieID', 'genres_list']], on='MovieID')

In [16]:
cluster_movie_stats.head(-1)

Unnamed: 0,ClusterID,MovieID,avg_rating,rating_count,genres_list
0,-1,1,4.171084,415,"[Animation, Children's, Comedy]"
1,-1,2,3.254777,157,"[Adventure, Children's, Fantasy]"
2,-1,3,2.990741,108,"[Comedy, Romance]"
3,-1,4,2.888889,27,"[Comedy, Drama]"
4,-1,5,3.058824,68,[Comedy]
...,...,...,...,...,...
27052,10,3926,3.400000,5,"[Adventure, Sci-Fi]"
27053,10,3927,3.300000,10,"[Adventure, Sci-Fi]"
27054,10,3928,3.363636,11,"[Comedy, Horror]"
27055,10,3948,3.869565,23,[Comedy]


In [17]:
# Step 5: Check genre overlap and adjust scores
def has_top_genre(row, cluster_genres):
    cluster_top_genres = cluster_genres.get(row['ClusterID'], [])
    return any(genre in cluster_top_genres for genre in row['genres_list'])

In [18]:
cluster_genres_dict = cluster_genres_df.set_index('ClusterID')['top_genres_list'].to_dict()

In [19]:
print(cluster_genres_dict)

{-1: nan, 0: ['Animation', ' Crime', ' Mystery', ' Musical', ' Romance'], 1: ['Mystery', ' Crime', ' Animation', ' Western', ' Musical'], 2: ['Animation', ' Crime', ' Musical', ' Western', ' Mystery'], 3: ['Animation', ' Musical', ' Crime', ' Western', ' Mystery'], 4: ['Western', ' Crime', ' Mystery', ' Animation', ' Romance'], 5: ['Animation', ' Crime', ' Musical', ' Romance', ' Western'], 6: ['Musical', ' Animation', ' Mystery', ' Crime', ' Western'], 7: ['Crime', ' Animation', ' Musical', ' Mystery', ' Thriller'], 8: ['Crime', ' Animation', ' Musical', ' Mystery', ' Romance'], 9: ['Western', ' Animation', ' Crime', ' Musical', ' Mystery'], 10: ['Animation', ' Musical', ' Crime', " Children's", ' Mystery']}


In [20]:
cluster_genres_dict.pop(-1, None)  # Remove the entry for ClusterID -1

nan

In [21]:
print(cluster_genres_dict)

{0: ['Animation', ' Crime', ' Mystery', ' Musical', ' Romance'], 1: ['Mystery', ' Crime', ' Animation', ' Western', ' Musical'], 2: ['Animation', ' Crime', ' Musical', ' Western', ' Mystery'], 3: ['Animation', ' Musical', ' Crime', ' Western', ' Mystery'], 4: ['Western', ' Crime', ' Mystery', ' Animation', ' Romance'], 5: ['Animation', ' Crime', ' Musical', ' Romance', ' Western'], 6: ['Musical', ' Animation', ' Mystery', ' Crime', ' Western'], 7: ['Crime', ' Animation', ' Musical', ' Mystery', ' Thriller'], 8: ['Crime', ' Animation', ' Musical', ' Mystery', ' Romance'], 9: ['Western', ' Animation', ' Crime', ' Musical', ' Mystery'], 10: ['Animation', ' Musical', ' Crime', " Children's", ' Mystery']}


In [22]:
cluster_movie_stats['matches_top_genre'] = cluster_movie_stats.apply(
    lambda row: has_top_genre(row, cluster_genres_dict), axis=1
)

In [23]:
# Boost rating by 20% if genres match
cluster_movie_stats['adjusted_score'] = cluster_movie_stats.apply(
    lambda row: row['avg_rating'] * 1.2 if row['matches_top_genre'] else row['avg_rating'], axis=1
)

In [24]:
# Step 6: Rank and select top movies per cluster
top_movies = cluster_movie_stats.groupby('ClusterID').apply(
    lambda x: x.nlargest(top_n, 'adjusted_score')[['MovieID', 'adjusted_score']]
).reset_index()

  top_movies = cluster_movie_stats.groupby('ClusterID').apply(


In [25]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,-1,2148,2905,4.733333
1,-1,1440,2019,4.606299
2,-1,2240,3030,4.606061
3,-1,509,649,4.600000
4,-1,516,665,4.600000
...,...,...,...,...
234,10,26313,2078,4.916129
235,10,25680,595,4.885714
236,10,26214,1907,4.863158
237,10,26358,2137,4.863158


In [26]:
noiseIndexes = top_movies[top_movies['ClusterID'] == -1].index
top_movies = top_movies.drop(noiseIndexes).reset_index(drop=True)

In [27]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,0,3528,1223,5.431579
1,0,3285,720,5.400000
2,0,3481,1148,5.341935
3,0,3294,745,5.250000
4,0,4525,3114,5.165217
...,...,...,...,...
214,10,26313,2078,4.916129
215,10,25680,595,4.885714
216,10,26214,1907,4.863158
217,10,26358,2137,4.863158


In [28]:
top_movies['adjusted_score'] = top_movies.adjusted_score.round(3)

In [29]:
top_movies.head(-1)

Unnamed: 0,ClusterID,level_1,MovieID,adjusted_score
0,0,3528,1223,5.432
1,0,3285,720,5.400
2,0,3481,1148,5.342
3,0,3294,745,5.250
4,0,4525,3114,5.165
...,...,...,...,...
214,10,26313,2078,4.916
215,10,25680,595,4.886
216,10,26214,1907,4.863
217,10,26358,2137,4.863


In [30]:
# Convert to dictionary format
top_movies_dict = top_movies.groupby('ClusterID').apply(
    lambda x: list(zip(x['MovieID'], x['adjusted_score']))
).to_dict()

  top_movies_dict = top_movies.groupby('ClusterID').apply(


In [31]:
print(top_movies_dict)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

In [4]:
top_movies = get_top_movies_for_cluster(
    ratings_df= ratings_df,
    clusters_df=clusters_df,
    movies_df=movies_df,
    cluster_genres_df=cluster_genres_df,
)

In [5]:
print(top_movies)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

# Function 3 Validation

In [6]:
from hybrid_recommender_rework import get_top_movies_for_cluster

ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")
clusters_df = pd.read_csv(r"ml_data/user_clusters.csv")
cluster_genres_df = pd.read_csv(r"ml_data/cluster_genres.csv")
movies_df = pd.read_csv(r"ml_data/movies_1m.csv")
cluster_details_df = pd.read_csv(r"ml_data/cluster_details.csv")

In [3]:
top_movies = get_top_movies_for_cluster(
    ratings_df= ratings_df,
    clusters_df=clusters_df,
    movies_df=movies_df,
    cluster_genres_df=cluster_genres_df,
)

In [4]:
print(top_movies)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

In [None]:
"""
Recommend movies to a new user based on their demographic information by mapping them to the most appropriate cluster.

Parameters:
- user_gender (str): Gender of the new user ('Male' or 'Female').
- user_age (int): Age of the new user.
- user_profession (str): Profession of the new user.
- clusters_df (pd.DataFrame): DataFrame with columns 'ClusterID', 'Male/Female', 'Average Age', 'Profession'.
- top_movies_dict (dict): Dictionary mapping 'ClusterID' to a list of top movies [(movie_id, score), ...].
- gender_weight (int): Weight for gender mismatch in distance calculation (default=100).
- profession_weight (int): Weight for profession mismatch in distance calculation (default=10).

Returns:
- list: List of recommended movie IDs.
"""

In [7]:
#user_gender = 'Male'
user_age = 30
#user_profession = 0
clusters_df = cluster_details_df
top_movies_dict = top_movies
gender_weight=100
profession_weight=10

In [None]:
### To do:
 # Calculate mismatch and difference columns
 #   clusters_df['gender_mismatch'] = (clusters_df['Male/Female'] != user_gender).astype(int)
 #   clusters_df['age_diff'] = abs(clusters_df['Average Age'] - user_age)
 #   clusters_df['profession_mismatch'] = (clusters_df['Profession'] != user_profession).astype(int)
    
 #   # Calculate distance with gender as the most crucial factor
 #   clusters_df['distance'] = (gender_weight * clusters_df['gender_mismatch'] +
 #                              clusters_df['age_diff'] +
 #                              profession_weight * clusters_df['profession_mismatch'])

In [None]:
clusters_df['age_diff'] = abs(clusters_df['Average Age Group in Cluster'] - user_age)

In [10]:
# Calculate distance with gender as the most crucial factor
clusters_df['distance'] = (clusters_df['age_diff'])

In [11]:
# Find the cluster with the smallest distance
best_cluster = clusters_df.loc[clusters_df['distance'].idxmin()]
best_cluster_id = best_cluster['ClusterID']

In [12]:
print(f"Best cluster for user: {best_cluster_id}")

Best cluster for user: 10


In [13]:
# Get the top movies for the best cluster
recommendations = [movie_id for movie_id, score in top_movies_dict[best_cluster_id]]

In [14]:
print(f"Best recommendations for user: {recommendations}")

Best recommendations for user: [3429, 745, 1223, 720, 1, 1148, 3114, 1023, 2761, 3745, 2096, 594, 2138, 1022, 2078, 595, 1907, 2137, 3910, 2018]


# Alternative Function 3 Validation

In [131]:
def parse_distribution(dist_str):
    """Parse distribution string into proportions (e.g., 'M:168, F:72' -> {'M': 0.7, 'F': 0.3})."""
    counts = {k.strip(): int(v) for k, v in (pair.split(':') for pair in dist_str.split(','))}
    total = sum(counts.values())
    return {k: v / total for k, v in counts.items()}

In [132]:
from hybrid_recommender_rework import get_top_movies_for_cluster

ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")
clusters_users_df = pd.read_csv(r"ml_data/user_clusters.csv")
cluster_genres_df = pd.read_csv(r"ml_data/cluster_genres.csv")
movies_df = pd.read_csv(r"ml_data/movies_1m.csv")
clusters_df = pd.read_csv(r"ml_data/cluster_details.csv")

In [133]:
user_gender = 'Male'
user_age = 31
user_profession = 'programmer'

In [134]:
top_movies = get_top_movies_for_cluster(
    ratings_df= ratings_df,
    clusters_df=clusters_users_df,
    movies_df=movies_df,
    cluster_genres_df=cluster_genres_df,
)
print(top_movies)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

In [135]:
"""
Assign a new user to a cluster and recommend movies using bias-based scoring.

Parameters:
- user_gender (str): 'Male' or 'Female'.
- user_age (int): User's age.
- user_profession (str): User's profession.
- clusters_df (pd.DataFrame): Cluster data with distribution columns.
- top_movies_dict (dict): {cluster_id: [(movie_id, score), ...]}.

Returns:
- list: Recommended movie IDs.
"""

"\nAssign a new user to a cluster and recommend movies using bias-based scoring.\n\nParameters:\n- user_gender (str): 'Male' or 'Female'.\n- user_age (int): User's age.\n- user_profession (str): User's profession.\n- clusters_df (pd.DataFrame): Cluster data with distribution columns.\n- top_movies_dict (dict): {cluster_id: [(movie_id, score), ...]}.\n\nReturns:\n- list: Recommended movie IDs.\n"

In [136]:
# Define age groups
age_groups = {
    '18-25': (18, 25),
    '26-35': (26, 35),
    '36-45': (36, 45),
    '46-55': (46, 55),
    '56+': (56, 100)
}

In [137]:
user_age_group = next(group for group, (low, high) in age_groups.items() if low <= user_age <= high)

In [138]:
# Parse distributions into proportions
clusters_df['gender_props'] = clusters_df['Male-Female Distribution'].apply(parse_distribution)
#clusters_df['age_props'] = clusters_df['Age Group Distribution'].apply(parse_distribution)
clusters_df['occupation_props'] = clusters_df['Occupation Ranking by Number'].apply(parse_distribution)

In [139]:
# Calculate biases
clusters_df['gender_bias'] = clusters_df['gender_props'].apply(
    lambda props: props.get(user_gender[0], 0) - 0.5  # M or F, baseline 50%
)
#clusters_df['age_proportion'] = clusters_df['age_props'].apply(
#    lambda props: props.get(user_age_group, 0)
#)
clusters_df['profession_bias'] = clusters_df['occupation_props'].apply(
    lambda props: props.get(user_profession, 0) - 0.2  # Baseline 20% assuming 20 professions
)

In [140]:
# Compute final score: Adjust age proportion with gender bias, add weighted profession bias
profession_weight = 0.2  # Profession has less influence
clusters_df['score'] = ((1 + clusters_df['gender_bias'])) + \
                        (profession_weight * clusters_df['profession_bias'])

In [141]:
# Select cluster with highest score
best_cluster = clusters_df.loc[clusters_df['score'].idxmax()]
best_cluster_id = best_cluster['ClusterID']

In [142]:
print(f"Best cluster for user: {best_cluster_id}")

Best cluster for user: 4


# .py Function 3 Testing

In [None]:
from hybrid_recommender_rework import recommend_for_new_user

In [137]:
from hybrid_recommender_rework import get_top_movies_for_cluster

ratings_df = pd.read_csv(r"ml_data/ratings_1m.csv")
clusters_users_df = pd.read_csv(r"ml_data/user_clusters.csv")
cluster_genres_df = pd.read_csv(r"ml_data/cluster_genres.csv")
movies_df = pd.read_csv(r"ml_data/movies_1m.csv")
clusters_df = pd.read_csv(r"ml_data/cluster_details.csv")

In [134]:
user_gender = 'Male'
user_age = 31
user_profession = 'programmer'

In [138]:
top_movies = get_top_movies_for_cluster(
    ratings_df= ratings_df,
    clusters_df=clusters_users_df,
    movies_df=movies_df,
    cluster_genres_df=cluster_genres_df,
)
print(top_movies)

{0: [(1223, 5.432), (720, 5.4), (1148, 5.342), (745, 5.25), (3114, 5.165), (3000, 5.1), (1, 5.037), (2102, 4.971), (595, 4.897), (594, 4.883), (2018, 4.839), (596, 4.8), (1907, 4.8), (2137, 4.8), (2138, 4.8), (3429, 4.8), (3435, 4.783), (3089, 4.778), (3751, 4.773), (364, 4.768)], 1: [(904, 5.544), (903, 5.427), (3730, 5.378), (1284, 5.374), (1264, 5.36), (906, 5.333), (1950, 5.236), (913, 5.216), (2208, 5.2), (1252, 5.18), (1212, 5.172), (1269, 5.138), (1086, 5.0), (1617, 4.989), (123, 4.95), (924, 4.938), (950, 4.92), (800, 4.912), (911, 4.904), (931, 4.8)], 2: [(745, 5.64), (2857, 5.6), (720, 5.538), (1223, 5.52), (1148, 5.512), (2099, 5.314), (2810, 5.28), (1023, 5.236), (3000, 5.1), (1022, 5.061), (2102, 5.0), (3022, 5.0), (3114, 4.983), (1, 4.922), (741, 4.92), (2139, 4.9), (3034, 4.892), (926, 4.875), (2080, 4.832), (1192, 4.8)], 3: [(1148, 5.324), (741, 5.28), (1023, 5.28), (720, 5.187), (2761, 5.113), (3000, 5.1), (745, 5.061), (3429, 5.0), (1223, 4.982), (2857, 4.971), (594, 

In [140]:
test_list = recommend_for_new_user(user_gender, user_age, user_profession, clusters_df, top_movies)

In [142]:
print("Test List for New User:")
for movie_id in test_list:
    title = movies_df[movies_df['MovieID'] == movie_id]['Title'].values[0]
    print(f"MovieID: {movie_id}, Title: {title}")

Test List for New User:
MovieID: 1283, Title: High Noon (1952)
MovieID: 714, Title: Dead Man (1995)
MovieID: 3030, Title: Yojimbo (1961)
MovieID: 599, Title: Wild Bunch, The (1969)
MovieID: 1209, Title: Once Upon a Time in the West (1969)
MovieID: 1201, Title: Good, The Bad and The Ugly, The (1966)
MovieID: 2951, Title: Fistful of Dollars, A (1964)
MovieID: 1266, Title: Unforgiven (1992)
MovieID: 553, Title: Tombstone (1993)
MovieID: 3365, Title: Searchers, The (1956)
MovieID: 3634, Title: Seven Days in May (1964)
MovieID: 3801, Title: Anatomy of a Murder (1959)
MovieID: 3671, Title: Blazing Saddles (1974)
MovieID: 3037, Title: Little Big Man (1970)
MovieID: 1304, Title: Butch Cassidy and the Sundance Kid (1969)
MovieID: 590, Title: Dances with Wolves (1990)
MovieID: 2921, Title: High Plains Drifter (1972)
MovieID: 2922, Title: Hang 'em High (1967)
MovieID: 3681, Title: For a Few Dollars More (1965)
MovieID: 955, Title: Bringing Up Baby (1938)


# Function 4 Validation

In [114]:
from sklearn.preprocessing import MultiLabelBinarizer

In [115]:
new_movie_genres = ['Adventure', 'Fantasy']
top_n = 10

In [116]:
# Load existing movie cluster data
clusters_df = pd.read_csv('ml_data/movie_clusters.csv')  # Columns: MovieID, cluster

# Load feature vectors for all movies (example: binary genre vectors)
features_df = pd.read_csv('ml_data/movies_1m.csv')  # Columns: MovieID, feature1, feature2, ...

In [117]:
# Merge cluster and feature data
movie_data = pd.merge(clusters_df, features_df, on='MovieID')

In [118]:
# Extract and binarize genres
mlb = MultiLabelBinarizer()
movie_data['genres_list'] = movie_data['Genres'].str.split('|')
genre_vectors = mlb.fit_transform(movie_data['genres_list'])
movie_data = pd.concat([movie_data, pd.DataFrame(genre_vectors, columns=mlb.classes_)], axis=1)

In [119]:
# Compute cluster centroids
cluster_centroids = movie_data.groupby('cluster')[mlb.classes_].mean()

In [120]:
"""
Assign a new movie to a cluster based on genre similarity to centroids.

Args:
new_movie_genres (list): List of genres (e.g., ['Action', 'Comedy'])

Returns:
int: Assigned cluster ID
"""
# Create genre vector for the new movie
new_movie_vec = mlb.transform([new_movie_genres])[0].reshape(1, -1)

# Compute similarity to each cluster centroid
similarities = cosine_similarity(new_movie_vec, cluster_centroids.values)

# Assign to the cluster with the highest similarity
assigned_cluster = cluster_centroids.index[np.argmax(similarities)]

In [121]:
"""
Find the top N similar movies within the assigned cluster.

Args:
    new_movie_genres (list): List of genres (e.g., ['Action', 'Comedy'])
    assigned_cluster (int): Cluster ID assigned to the new movie
    top_n (int): Number of similar movies to return

Returns:
    list: List of (MovieID, similarity_score) tuples
"""
# Create genre vector for the new movie
new_movie_vec = mlb.transform([new_movie_genres])[0].reshape(1, -1)

# Filter movies in the assigned cluster
cluster_movies = movie_data[movie_data['cluster'] == assigned_cluster]
movie_ids = cluster_movies['MovieID'].values
feature_vectors = cluster_movies[mlb.classes_].values

# Compute similarities
similarities = cosine_similarity(new_movie_vec, feature_vectors)[0]

# Get top N similar movies
top_indices = similarities.argsort()[-top_n:][::-1]
top_similar = [(movie_ids[i], similarities[i]) for i in top_indices]

In [122]:
print(f"Assigned Cluster: {assigned_cluster}")

Assigned Cluster: 49


In [123]:
print("Top 10 Similar Movies:")
for movie_id, similarity in top_similar:
    title = features_df[features_df['MovieID'] == movie_id]['Title'].values[0]
    print(f"MovieID: {movie_id}, Title: {title}, Similarity: {similarity:.4f}")

Top 10 Similar Movies:
MovieID: 3489, Title: Hook (1991), Similarity: 1.0000
MovieID: 2173, Title: Navigator: A Mediaeval Odyssey, The (1988), Similarity: 0.8165
MovieID: 1967, Title: Labyrinth (1986), Similarity: 0.8165
MovieID: 3807, Title: Sinbad and the Eye of the Tiger (1977), Similarity: 0.5000
MovieID: 2140, Title: Dark Crystal, The (1982), Similarity: 0.4082
MovieID: 2872, Title: Excalibur (1981), Similarity: 0.3536
MovieID: 3052, Title: Dogma (1999), Similarity: 0.0000
MovieID: 1136, Title: Monty Python and the Holy Grail (1974), Similarity: 0.0000


# Alternative Function 4 Validation

In [104]:
from sklearn.preprocessing import MultiLabelBinarizer
import faiss

In [105]:
new_movie_genres = ['Adventure', 'Fantasy']
top_n = 10

In [106]:
# Load movie data
movies_df = pd.read_csv('ml_data/movies_1m.csv')

# Load cluster assignments (assumed to exist from prior clustering)
clusters_df = pd.read_csv('ml_data/movie_clusters.csv')  # Columns: MovieID, cluster

# Merge data
movie_data = pd.merge(movies_df, clusters_df, on='MovieID')

In [107]:
# Extract and binarize genres
mlb = MultiLabelBinarizer()
movie_data['genres_list'] = movie_data['Genres'].str.split('|')
genre_vectors = mlb.fit_transform(movie_data['genres_list'])
movie_data = pd.concat([movie_data, pd.DataFrame(genre_vectors, columns=mlb.classes_)], axis=1)

In [125]:
# Compute cluster centroids
cluster_centroids = movie_data.groupby('cluster')[mlb.classes_].mean()

In [109]:
# Prepare FAISS index for ANN search
# Normalize vectors for cosine similarity (FAISS uses inner product)
genre_vectors = genre_vectors.astype(np.float32)
norms = np.linalg.norm(genre_vectors, axis=1, keepdims=True)
norms[norms == 0] = 1  # Avoid division by zero
normalized_vectors = genre_vectors / norms
index = faiss.IndexFlatIP(normalized_vectors.shape[1])  # Inner product index
index.add(normalized_vectors)  # Add vectors to index

In [126]:
"""
Assign a new movie to a cluster based on genre similarity to centroids.

Args:
    new_movie_genres (list): List of genres (e.g., ['Action', 'Comedy'])

Returns:
    int: Assigned cluster ID
"""
# Create genre vector for the new movie
new_movie_vec = mlb.transform([new_movie_genres])[0].reshape(1, -1)

# Normalize for cosine similarity
norm = np.linalg.norm(new_movie_vec)
if norm == 0:
    norm = 1
new_movie_vec_normalized = new_movie_vec / norm

# Compute similarity to each cluster centroid
similarities = np.dot(new_movie_vec_normalized, cluster_centroids.values.T)[0]

# Assign to the cluster with the highest similarity
assigned_cluster = cluster_centroids.index[np.argmax(similarities)]

In [111]:
"""
Find the top N similar movies using FAISS ANN search.

Args:
    new_movie_genres (list): List of genres (e.g., ['Action', 'Comedy'])
    top_n (int): Number of similar movies to return

Returns:
    list: List of (MovieID, similarity_score) tuples
"""
# Create genre vector for the new movie
new_movie_vec = mlb.transform([new_movie_genres])[0].astype(np.float32)

# Normalize for cosine similarity
norm = np.linalg.norm(new_movie_vec)
if norm == 0:
    norm = 1
new_movie_vec_normalized = (new_movie_vec / norm).reshape(1, -1)

# Search FAISS index
similarities, indices = index.search(new_movie_vec_normalized, top_n)

# Retrieve MovieIDs and similarities
movie_ids = movie_data['MovieID'].values
top_similar = [(movie_ids[i], similarities[0][j]) for j, i in enumerate(indices[0])]

In [127]:
print(f"Assigned Cluster: {assigned_cluster}")

Assigned Cluster: 51


In [113]:
print("Top 10 Similar Movies:")
for movie_id, similarity in top_similar:
    title = features_df[features_df['MovieID'] == movie_id]['Title'].values[0]
    print(f"MovieID: {movie_id}, Title: {title}, Similarity: {similarity:.4f}")

Top 10 Similar Movies:
MovieID: 3489, Title: Hook (1991), Similarity: 1.0000
MovieID: 2161, Title: NeverEnding Story, The (1984), Similarity: 0.8165
MovieID: 2143, Title: Legend (1985), Similarity: 0.8165
MovieID: 2043, Title: Darby O'Gill and the Little People (1959), Similarity: 0.8165
MovieID: 2005, Title: Goonies, The (1985), Similarity: 0.8165
MovieID: 1967, Title: Labyrinth (1986), Similarity: 0.8165
MovieID: 1009, Title: Escape to Witch Mountain (1975), Similarity: 0.8165
MovieID: 653, Title: Dragonheart (1996), Similarity: 0.8165
MovieID: 126, Title: NeverEnding Story III, The (1994), Similarity: 0.8165
MovieID: 60, Title: Indian in the Cupboard, The (1995), Similarity: 0.8165


# .py Function 4 Testing

In [129]:
from hybrid_recommender_rework import assign_new_movie_to_cluster, find_top_similar_movies_with_ann

In [130]:
# Example usage
new_movie_genres = ['Action', 'Adventure', 'Sci-Fi']
assigned_cluster = assign_new_movie_to_cluster(new_movie_genres)
print(f"Assigned Cluster: {assigned_cluster}")

Assigned Cluster: 21


In [131]:
top_similar_movies = find_top_similar_movies_with_ann(new_movie_genres, top_n=10)
print("Top 10 Similar Movies (ANN):")
for movie_id, similarity in top_similar_movies:
    title = movies_df[movies_df['MovieID'] == movie_id]['Title'].values[0]
    print(f"MovieID: {movie_id}, Title: {title}, Similarity: {similarity:.4f}")

Top 10 Similar Movies (ANN):
MovieID: 1375, Title: Star Trek III: The Search for Spock (1984), Similarity: 1.0000
MovieID: 1374, Title: Star Trek: The Wrath of Khan (1982), Similarity: 1.0000
MovieID: 1373, Title: Star Trek V: The Final Frontier (1989), Similarity: 1.0000
MovieID: 1372, Title: Star Trek VI: The Undiscovered Country (1991), Similarity: 1.0000
MovieID: 1371, Title: Star Trek: The Motion Picture (1979), Similarity: 1.0000
MovieID: 1356, Title: Star Trek: First Contact (1996), Similarity: 1.0000
MovieID: 480, Title: Jurassic Park (1993), Similarity: 1.0000
MovieID: 329, Title: Star Trek: Generations (1994), Similarity: 1.0000
MovieID: 316, Title: Stargate (1994), Similarity: 1.0000
MovieID: 173, Title: Judge Dredd (1995), Similarity: 1.0000
