---
Load data

In [2]:
import pandas as pd

# Load data
anime_df = pd.read_parquet('Data/preprocessed_anime.parquet')
ratings_df = pd.read_parquet('Data/preprocessed_ratings.parquet')

---
Split and Evaluation

In [3]:
import numpy as np

def sample_users(ratings_df, n_users=100, min_anime_rated=20, seed=42):
    np.random.seed(seed)
    counts = ratings_df.groupby('userID').size()
    pool = counts[counts >= min_anime_rated].index.values

    return np.random.choice(pool, size=n_users, replace=False)

In [4]:
from sklearn.preprocessing import normalize

def evaluate_ratings(test_df, recommendations_df, anime_df, anime_feature_matrix, anime_id_to_index, rating_threshold=7, k=10):

    recommended_ids = recommendations_df.index.tolist()[:k]
    relevant_set = set(test_df.loc[test_df['rating'] >= rating_threshold, 'idMal'].values)

    # Precision and Recall
    hits = len(set(recommended_ids) & relevant_set)
    precision = hits / k
    recall = hits / len(relevant_set) if relevant_set else 0

    # MAP (AP)
    avg_prec = 0
    hits_so_far = 0
    for i, rid in enumerate(recommended_ids, start=1):
        if rid in relevant_set:
            hits_so_far += 1
            avg_prec += hits_so_far / i
    map = avg_prec / hits if hits > 0 else 0

    # MRR
    mrr = 0
    for i, id in enumerate(recommended_ids, start=1):
        if id in relevant_set:
            mrr = 1 / i
            break

    # Diversity
    rec_indexes = [anime_id_to_index[rid] for rid in recommended_ids if rid in anime_id_to_index]
    rec_vectors = anime_feature_matrix[rec_indexes]
    rec_vectors = normalize(rec_vectors)
    if len(rec_vectors) > 1:
        sim_matrix = rec_vectors @ rec_vectors.T
        diversity = 1 - np.mean(sim_matrix[np.triu_indices(len(rec_vectors), 1)])
    else:
        diversity = 0
        
    # Novelty
    pop_values = anime_df.set_index('idMal').loc[recommended_ids, 'popularity'].values
    novelty = np.mean(1 - pop_values)

    return {
        'Precision': precision,
        'Recall': recall,
        'MAP': map,
        'MRR': mrr,
        'Diversity': diversity,
        'Novelty': novelty,
        'Relevant animes': len(relevant_set),
        'Test-set size': len(test_df)
    }

---
CBF Algorithm

In [5]:
import faiss

# Prepare columns
anime_df_ids = anime_df['idMal'].values
anime_df_titles = anime_df['title'].values

cols_prefixes = {
    'genres_': 'genres',
    'tags_': 'tags',
    'desc_': 'desc',
    'duration_': 'cat',
    'year_': 'cat',
    'format_': 'cat',
    'source_': 'cat'
}

group_cols = {group: [] for group in set(cols_prefixes.values())}

for col in anime_df.columns:
    for prefix, group in cols_prefixes.items():
        if col.startswith(prefix):
            group_cols[group].append(col)
            break

X = pd.concat([anime_df[cols] for cols in group_cols.values()], axis=1)

group_indexes = {group: [X.columns.get_loc(c) for c in cols] for group, cols in group_cols.items()}

weights = {'genres': 0.5, 'tags': 0.1,'cat': 0.4, 'desc': 0}

# Create items vectors
X_np = np.ascontiguousarray(X.values.astype('float32'))
X_weighted = np.zeros_like(X_np)

for group, ids in group_indexes.items():
    block = X_np[:, ids]
    norm = np.linalg.norm(block, axis=1, keepdims=True)
    norm[norm == 0] = 1
    X_weighted[:, ids] = (block / norm) * weights[group]

faiss.normalize_L2(X_weighted)

index = faiss.IndexFlatIP(X_np.shape[1])
index.add(X_weighted)

id_to_idx = {id: i for i, id in enumerate(anime_df_ids)}

In [6]:
from sklearn.preprocessing import MinMaxScaler

def recommend_for_user_cbf(user_train, X_weighted, id_to_idx, anime_df_ids, anime_df_titles, index):
    
    # Create user vector
    user_ratings = user_train['rating'].values

    # if no ratings
    if np.all(user_ratings == 0):
        user_ratings_norm = np.ones_like(user_ratings)
    else:
        user_ratings_norm = user_ratings / user_ratings.max()
    
    seen_idx = [id_to_idx[idm] for idm in user_train['idMal'].values if idm in id_to_idx]
    seen_vectors = X_weighted[seen_idx]
    user_profile = np.average(seen_vectors, axis=0, weights=user_ratings_norm)
    user_profile = np.expand_dims(user_profile, axis=0).astype('float32')
    faiss.normalize_L2(user_profile)

    # Similarity
    distances, indexes = index.search(user_profile, index.ntotal)
    recommended_ids = anime_df_ids[indexes.ravel()]
    recommended_scores = distances.ravel()
    recommended_titles = anime_df_titles[indexes.ravel()]

    # Filter
    seen_set = set(user_train['idMal'].values)
    recs_filtered = [(rid, score, title) for rid, score, title in zip(recommended_ids, recommended_scores, recommended_titles) if rid not in seen_set]

    # Result
    recommendations_cbf_df = pd.DataFrame(recs_filtered, columns=['idMal', 'score', 'title']).set_index('idMal')
    scaler = MinMaxScaler()
    recommendations_cbf_df['score'] = scaler.fit_transform(recommendations_cbf_df[['score']])

    return recommendations_cbf_df

---
CF Algorithm

In [None]:
# ALS
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import pickle

anime_ids = ratings_df['idMal'].unique()
anime_map = {aid: idm for idm, aid in enumerate(anime_ids)}
user_ids = ratings_df['userID'].unique()
user_map = {uid: idm for idm, uid in enumerate(user_ids)}

rows = ratings_df['userID'].map(user_map)
cols = ratings_df['idMal'].map(anime_map)
vals = ratings_df['rating']
user_item_matrix = csr_matrix((vals, (rows, cols)), shape=(len(user_map), len(anime_map)))

als_model = AlternatingLeastSquares(factors=150, regularization=0.01, iterations=10)
als_model.fit(user_item_matrix)

with open('CF/als_model.pkl', 'wb') as f:
    pickle.dump(als_model, f)
with open('CF/anime_ids.pkl', 'wb') as f:
    pickle.dump(anime_ids, f)
with open('CF/anime_map.pkl', 'wb') as f:
    pickle.dump(anime_map, f)
with open('CF/user_map.pkl', 'wb') as f:
    pickle.dump(user_map, f)

In [8]:
import pickle

with open('CF/als_model.pkl','rb') as f:
    als_model = pickle.load(f)
with open('CF/anime_ids.pkl','rb') as f:
    anime_ids = pickle.load(f)
with open('CF/anime_map.pkl','rb') as f:
    anime_map = pickle.load(f)
with open('CF/user_map.pkl','rb') as f:
    user_map = pickle.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from scipy.sparse import csr_matrix

def recommend_for_user_cf(user_train, anime_df, als_model, anime_ids, anime_map, user_id, user_map, ratings_df):
   
    # Create user vector
    user_idx = user_map[user_id]
    user_ratings = ratings_df[ratings_df['userID'] == user_id]
    rows = np.zeros(len(user_ratings))
    cols = user_ratings['idMal'].map(anime_map).values
    vals = user_ratings['rating'].values
    user_item_vector = csr_matrix((vals, (rows, cols)), shape=(1, len(anime_ids)))

    # Similarity
    recommended_indexes, recommended_scores = als_model.recommend(
        user_idx,
        user_items=user_item_vector,
        N=len(anime_ids),
        filter_already_liked_items=False
    )
    recommended_ids = [anime_ids[idx] for idx in recommended_indexes]

    # Filter
    seen_train_set = set(user_train['idMal'])
    recs_filtered = [(rid, score) for rid, score in zip(recommended_ids, recommended_scores) if rid not in seen_train_set]

    # Results
    recommendations_cf_df = pd.DataFrame(recs_filtered, columns=['idMal', 'score']).set_index('idMal')
    recommendations_cf_df = recommendations_cf_df.join(anime_df.set_index('idMal')['title'], how='left')
    scaler = MinMaxScaler()
    recommendations_cf_df['score'] = scaler.fit_transform(recommendations_cf_df[['score']])

    return recommendations_cf_df

---
Hybrid Algorithm

In [10]:
def recommend_for_user_hybrid(recs_cbf, recs_cf, anime_df, alpha=0.5, boost_pop=0.0, boost_score=0.0, k=20):

    merged = recs_cf[['score']].rename(columns={'score': 'score_cf'}).join(
        recs_cbf[['score']].rename(columns={'score': 'score_cbf'}),
        how='outer'
    ).fillna(0)

    merged['score'] = alpha * merged['score_cbf'] + (1 - alpha) * merged['score_cf']

    boost_df = anime_df.set_index('idMal')[['popularity', 'averageScore']]
    merged = merged.join(boost_df, how='left').fillna(0)
    merged['score'] += boost_pop * merged['popularity'] + boost_score * merged['averageScore']

    titles = recs_cf[['title']].combine_first(recs_cbf[['title']])
    merged = merged.join(titles, how='left')
    
    recommendations_hybrid_df = merged[['score', 'title']].sort_values('score', ascending=False).head(k)

    return recommendations_hybrid_df

---
K-Fold Validation - one User

In [11]:
import numpy as np

def evaluate_one_user_kfold_with_recs(user_id, ratings_df, als_model, user_map, anime_map, anime_ids, anime_df, X_weighted, 
                                  id_to_idx, index=None, anime_df_ids=None, anime_df_titles=None, rating_threshold=6,
                                  alpha=0.5, boost_pop=0, boost_score=0, k_folds=5, top_k=10, seed=42):

    user_ratings = ratings_df[ratings_df['userID'] == user_id].copy()
    n_ratings = len(user_ratings)

    np.random.seed(seed)
    indexes = np.random.permutation(n_ratings)

    fold_sizes = np.full(k_folds, n_ratings // k_folds, dtype=int)
    fold_sizes[:n_ratings % k_folds] += 1

    metrics_cbf = []
    metrics_cf = []
    metrics_hybrid = []
    last_recommendations = None

    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        test_idx = indexes[start:stop]
        train_idx = np.concatenate([indexes[:start], indexes[stop:]])

        user_train = user_ratings.iloc[train_idx].reset_index(drop=True)
        user_test  = user_ratings.iloc[test_idx].reset_index(drop=True)

        recs_cbf = recommend_for_user_cbf(user_train, X_weighted, id_to_idx, anime_df_ids, anime_df_titles, index=index)
        recs_cf = recommend_for_user_cf(user_train, anime_df, als_model, anime_ids, anime_map, user_id, user_map, ratings_df)
        recs_hybrid = recommend_for_user_hybrid(recs_cbf, recs_cf, anime_df, alpha, boost_pop, boost_score, top_k)

        metrics_cbf.append(evaluate_ratings(user_test, recs_cbf, anime_df, X_weighted, id_to_idx, rating_threshold=rating_threshold, k=top_k))
        metrics_cf.append(evaluate_ratings(user_test, recs_cf, anime_df, X_weighted, id_to_idx, rating_threshold=rating_threshold, k=top_k))
        metrics_hybrid.append(evaluate_ratings(user_test, recs_hybrid, anime_df, X_weighted, id_to_idx, rating_threshold=rating_threshold, k=top_k))

        last_recommendations = recs_hybrid
        current = stop

    avg_metrics = {
        'CBF': {metric: np.mean([m[metric] for m in metrics_cbf]) for metric in metrics_cbf[0]},
        'CF': {metric: np.mean([m[metric] for m in metrics_cf]) for metric in metrics_cf[0]},
        'Hybrid': {metric: np.mean([m[metric] for m in metrics_hybrid]) for metric in metrics_hybrid[0]}
    }

    return avg_metrics, last_recommendations


---
K-Fold Validation - Many Users

In [12]:
def evaluate_multiple_users_with_kfold(user_ids, ratings_df, als_model, user_map, anime_map, anime_ids, anime_df, X_weighted, id_to_idx, 
                                       index, rating_threshold=7, alpha=0.5, boost_pop=0, boost_score=0, k_folds=5, top_k=10, seed=42):
    
    anime_df_ids = anime_df['idMal'].values
    anime_df_titles = anime_df['title'].values
    
    all_metrics_cbf = []
    all_metrics_cf = []
    all_metrics_hybrid = []
    
    for user_id in user_ids:
        avg_metrics, _ = evaluate_one_user_kfold_with_recs(user_id, ratings_df, als_model, user_map, anime_map, anime_ids, anime_df, 
                                                       X_weighted, id_to_idx, index, anime_df_ids, anime_df_titles, rating_threshold,
                                                       alpha, boost_pop, boost_score, k_folds, top_k, seed)
        
        all_metrics_cbf.append(avg_metrics['CBF'])
        all_metrics_cf.append(avg_metrics['CF'])
        all_metrics_hybrid.append(avg_metrics['Hybrid'])
    
    def average_metrics(metrics_list):
        return {metric: np.mean([m[metric] for m in metrics_list]) for metric in metrics_list[0]}
    
    global_avg_metrics = {
        'CBF': average_metrics(all_metrics_cbf),
        'CF': average_metrics(all_metrics_cf),
        'Hybrid': average_metrics(all_metrics_hybrid)
    }
    
    return global_avg_metrics

In [None]:
# Cross validation for multiple users

users = 500
min_anime_rated = 50
test_size = 0.2 
rating_threshold = 7 
alpha = 0.5
boost_pop = 0.1
boost_score = 0.1
k_folds = 5 
top_k = 10


user_ids_sample = sample_users(ratings_df, users, min_anime_rated)

global_results = evaluate_multiple_users_with_kfold(user_ids_sample, ratings_df, als_model, user_map, anime_map, anime_ids,anime_df, X_weighted,
                                                    id_to_idx, index, rating_threshold, alpha, boost_pop, boost_score, k_folds, top_k)

for model_name, metrics in global_results.items():
    print(f'\n{model_name}')
    for metric, value in metrics.items():
        print(f'{metric}: {value:.2f}')


CBF
Precision: 0.06
Recall: 0.02
MAP: 0.12
MRR: 0.13
Diversity: 0.15
Novelty: 0.38
Relevant animes: 31.59
Test-set size: 41.99

CF
Precision: 0.58
Recall: 0.31
MAP: 0.70
MRR: 0.75
Diversity: 0.50
Novelty: 0.20
Relevant animes: 31.59
Test-set size: 41.99

Hybrid
Precision: 0.52
Recall: 0.26
MAP: 0.71
MRR: 0.81
Diversity: 0.37
Novelty: 0.17
Relevant animes: 31.59
Test-set size: 41.99


---
Recommendations for User

In [15]:
user_id = 1774523

recs_hybrid = recommend_for_user_hybrid(
    recommend_for_user_cbf(ratings_df[ratings_df['userID'] == user_id], X_weighted, id_to_idx, anime_df_ids, anime_df_titles, index),
    recommend_for_user_cf(ratings_df[ratings_df['userID'] == user_id], anime_df, als_model, anime_ids, anime_map, user_id, user_map, ratings_df),
    anime_df,
    alpha=alpha,
    boost_pop=boost_pop,
    boost_score=boost_score,
    k=top_k 
)

seen_ids = set(ratings_df[ratings_df['userID'] == user_id]['idMal'])
recs_hybrid_unseen = recs_hybrid.loc[~recs_hybrid.index.isin(seen_ids)].head(top_k)

print("\nRecommendations:")
for title in recs_hybrid_unseen['title'].values:
    print(title)


Recommendations:
Kage no Jitsuryokusha ni Naritakute!
Kaguya-sama wa Kokurasetai: Ultra Romantic
Jujutsu Kaisen 2nd Season
Kage no Jitsuryokusha ni Naritakute! 2nd season
Fruits Basket: The Final
VINLAND SAGA SEASON 2
Mob Psycho 100
Mob Psycho 100 II
Kaguya-sama wa Kokurasetai: First Kiss wa Owaranai
Akatsuki no Yona
