# Model Training

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from joblib import dump, load
from tqdm import tqdm
from CBClass import ContentBasedRecommender, genre_tokenizer

In [20]:
# Example data (replace with your actual data)
movies_metadata = pd.read_csv("../../preprocessing/movies.dat", sep='::', engine='python', names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
movies_metadata['Genres'] = movies_metadata['Genres'].fillna('Unknown')
movies_metadata.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
#movies_metadata = movies_metadata.drop(columns=["AvgRating", "RatingCount"])
#movies_metadata = movies_metadata.rename(columns={"GenresText": "Genres"})

In [None]:
#movies_metadata.head()

Unnamed: 0,AgeGroup,MovieID,Title,Genres,Year
0,Under 18,2010,Metropolis,['Sci-Fi'],1926
1,Under 18,1213,GoodFellas,"['Crime', 'Drama']",1990
2,Under 18,1212,"Third Man, The","['Mystery', 'Thriller']",1949
3,Under 18,945,Top Hat,"['Comedy', 'Musical', 'Romance']",1935
4,Under 18,3435,Double Indemnity,"['Crime', 'Film-Noir']",1944


In [21]:
# Option 1: Default TF-IDF
recommender = ContentBasedRecommender()
recommender.fit(movies_metadata)

# Option 2: Customize Vectorizer (e.g., change tokenizer or weighting)
#from sklearn.feature_extraction.text import TfidfVectorizer
#custom_vectorizer = TfidfVectorizer(
#    tokenizer=lambda x: x.split(','),
#    max_features=100  # Limit vocabulary size
#)
#recommender = ContentBasedRecommender(vectorizer=custom_vectorizer)
#recommender.fit(movies_metadata)



<CBClass.ContentBasedRecommender at 0x1cd16eace90>

In [None]:
# Get similar movies for Movie ID 1
recommender.recommend(movie_id=1)

Unnamed: 0,MovieID,Title,Genres
3685,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Animation|Children's|Comedy
2286,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy
3542,3611,Saludos Amigos (1943),Animation|Children's|Comedy
2073,2142,"American Tail: Fievel Goes West, An (1991)",Animation|Children's|Comedy
3682,3751,Chicken Run (2000),Animation|Children's|Comedy


In [24]:
print(recommender.vectorizer.get_feature_names_out())  # See extracted genres

['action' 'action|adventure' 'action|adventure|animation'
 "action|adventure|animation|children's|fantasy"
 'action|adventure|animation|horror|sci-fi' "action|adventure|children's"
 "action|adventure|children's|comedy"
 "action|adventure|children's|fantasy"
 "action|adventure|children's|sci-fi" 'action|adventure|comedy'
 'action|adventure|comedy|crime' 'action|adventure|comedy|horror'
 'action|adventure|comedy|horror|sci-fi' 'action|adventure|comedy|romance'
 'action|adventure|comedy|sci-fi' 'action|adventure|comedy|war'
 'action|adventure|crime' 'action|adventure|crime|drama'
 'action|adventure|crime|thriller' 'action|adventure|drama'
 'action|adventure|drama|romance' 'action|adventure|drama|sci-fi|war'
 'action|adventure|drama|thriller' 'action|adventure|fantasy'
 'action|adventure|fantasy|sci-fi' 'action|adventure|horror'
 'action|adventure|horror|thriller' 'action|adventure|mystery'
 'action|adventure|mystery|sci-fi' 'action|adventure|romance'
 'action|adventure|romance|sci-fi|war'

In [38]:
# Save
ContentBasedRecommender.save(recommender, '../models/cb_model.joblib')

# Load later
recommender_loaded = ContentBasedRecommender.load('../models/cb_model.joblib')
recommender_loaded.recommend(movie_id=945)

Unnamed: 0,MovieID,Title,Genres
1052,1066,Shall We Dance? (1937),Comedy|Musical|Romance
1359,1380,Grease (1978),Comedy|Musical|Romance
933,945,Top Hat (1935),Comedy|Musical|Romance
1053,1067,"Damsel in Distress, A (1937)",Comedy|Musical|Romance
1043,1057,Everyone Says I Love You (1996),Comedy|Musical|Romance


# Accuracy Testing

In [26]:
ratings_df = pd.read_csv("../../preprocessing/merged_data.csv")
ratings_df = ratings_df.drop(columns=["Gender", "Age", "Year", "Title", "Genres"])


# Filter high-rated movies (ground truth for user preferences)
high_rated = ratings_df[ratings_df['Rating'] >= 4][['UserID', 'MovieID']]

In [39]:
ratings_df.to_csv("../../preprocessing/ratings.csv", index=False)

In [36]:
high_rated.head(-1)

Unnamed: 0,UserID,MovieID
0,1,1193
3,1,3408
4,1,2355
6,1,1287
7,1,2804
...,...,...
1000201,6040,1080
1000202,6040,1089
1000205,6040,1094
1000206,6040,562


In [None]:
# Parameters
k = 5  # Number of recommendations to evaluate
n_users = 1000  # Number of users to test (adjust for speed)

# Sample users with enough high-rated movies
user_counts = high_rated['UserID'].value_counts()
valid_users = user_counts[user_counts >= 2].index  # Users with at least 2 high ratings
test_users = np.random.choice(valid_users, size=min(n_users, len(valid_users)), replace=False)

# Evaluate precision@k
precisions = []
for user_id in tqdm(test_users, desc="Evaluating users"):
    # Get user's high-rated movies
    user_likes = high_rated[high_rated['UserID'] == user_id]['MovieID'].values
    
    # Pick one movie as query (randomly)
    query_movie = np.random.choice(user_likes)
    
    # Get recommendations
    try:
        recs_df = recommender.recommend(query_movie, top_n=k)
        rec_movie_ids = recs_df['MovieID'].values
        
        # Compute precision: proportion of recommended movies that are in user_likes
        relevant = len(set(rec_movie_ids) & set(user_likes))
        precision = relevant / k
        precisions.append(precision)
    except ValueError as e:
        # Skip if movie_id not found
        continue

# Average precision@k
avg_precision = np.mean(precisions) if precisions else 0
print(f"Average Precision@{k}: {avg_precision:.4f}")

Evaluating users: 100%|██████████| 1000/1000 [00:02<00:00, 441.07it/s]

Average Precision@5: 0.0860





In [31]:
# Test genre overlap for a sample of movies
test_movies = movies_metadata['MovieID'].sample(100, random_state=42)
overlap_scores = []

for movie_id in tqdm(test_movies, desc="Evaluating genre overlap"):
    try:
        query_genres = set(movies_metadata[movies_metadata['MovieID'] == movie_id]['Genres'].iloc[0].split(','))
        recs_df = recommender.recommend(movie_id, top_n=k)
        
        # Check genre overlap for each recommendation
        overlaps = 0
        for _, rec in recs_df.iterrows():
            rec_genres = set(rec['Genres'].split(','))
            if query_genres & rec_genres:  # At least one common genre
                overlaps += 1
        overlap_score = overlaps / k
        overlap_scores.append(overlap_score)
    except ValueError:
        continue

avg_overlap = np.mean(overlap_scores) if overlap_scores else 0
print(f"Average Genre Overlap@{k}: {avg_overlap:.4f}")

Evaluating genre overlap: 100%|██████████| 100/100 [00:00<00:00, 466.36it/s]

Average Genre Overlap@5: 0.9060



