---
Load data

In [1]:
import pandas as pd

# Load data
anime_df = pd.read_parquet('Data/preprocessed_anime.parquet')
ratings_df = pd.read_parquet('Data/preprocessed_ratings.parquet')

---
Split and Evaluation

In [3]:
import numpy as np

def sample_users(ratings_df, n_users=100, min_anime_rated=20, seed=42):
    np.random.seed(seed)
    counts = ratings_df.groupby('userID').size()
    pool = counts[counts >= min_anime_rated].index.values

    return np.random.choice(pool, size=n_users, replace=False)

In [4]:
from sklearn.preprocessing import normalize

def evaluate_ratings(test_df, recommendations_df, anime_df, anime_feature_matrix, anime_id_to_index, rating_threshold=7, k=10):

    recommended_ids = recommendations_df.index.tolist()[:k]
    relevant_set = set(test_df.loc[test_df['rating'] >= rating_threshold, 'idMal'].values)

    # Precision and Recall
    hits = len(set(recommended_ids) & relevant_set)
    precision = hits / k
    recall = hits / len(relevant_set) if relevant_set else 0

    # MAP (AP)
    avg_prec = 0
    hits_so_far = 0
    for i, rid in enumerate(recommended_ids, start=1):
        if rid in relevant_set:
            hits_so_far += 1
            avg_prec += hits_so_far / i
    map = avg_prec / hits if hits > 0 else 0

    # MRR
    mrr = 0
    for i, id in enumerate(recommended_ids, start=1):
        if id in relevant_set:
            mrr = 1 / i
            break

    # Diversity
    rec_indexes = [anime_id_to_index[rid] for rid in recommended_ids if rid in anime_id_to_index]
    rec_vectors = anime_feature_matrix[rec_indexes]
    rec_vectors = normalize(rec_vectors)
    if len(rec_vectors) > 1:
        sim_matrix = rec_vectors @ rec_vectors.T
        diversity = 1 - np.mean(sim_matrix[np.triu_indices(len(rec_vectors), 1)])
    else:
        diversity = 0
        
    # Novelty
    pop_values = anime_df.set_index('idMal').loc[recommended_ids, 'popularity'].values
    novelty = np.mean(1 - pop_values)

    return {
        'Precision': precision,
        'Recall': recall,
        'MAP': map,
        'MRR': mrr,
        'Diversity': diversity,
        'Novelty': novelty,
        'Relevant animes': len(relevant_set),
        'Test-set size': len(test_df)
    }

---
For Diversity

In [5]:
import faiss

# Prepare columns
anime_df_ids = anime_df['idMal'].values
anime_df_titles = anime_df['title'].values

cols_prefixes = {
    'genres_': 'genres',
    'tags_': 'tags',
    'desc_': 'desc',
    'duration_': 'cat',
    'year_': 'cat',
    'format_': 'cat',
    'source_': 'cat'
}

group_cols = {group: [] for group in set(cols_prefixes.values())}

for col in anime_df.columns:
    for prefix, group in cols_prefixes.items():
        if col.startswith(prefix):
            group_cols[group].append(col)
            break

X = pd.concat([anime_df[cols] for cols in group_cols.values()], axis=1)

group_indexes = {group: [X.columns.get_loc(c) for c in cols] for group, cols in group_cols.items()}

weights = {'genres': 0.6, 'tags': 0.1, 'desc': 0.1, 'cat': 0.2}

# Create items vectors
X_np = np.ascontiguousarray(X.values.astype('float32'))
X_weighted = np.zeros_like(X_np)

for group, ids in group_indexes.items():
    block = X_np[:, ids]
    norm = np.linalg.norm(block, axis=1, keepdims=True)
    norm[norm == 0] = 1
    X_weighted[:, ids] = (block / norm) * weights[group]

faiss.normalize_L2(X_weighted)

index = faiss.IndexFlatIP(X_np.shape[1])
index.add(X_weighted)

id_to_idx = {id: i for i, id in enumerate(anime_df_ids)}

---
CF Algorithm

In [6]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler

def recommend_for_user_cf(user_train, anime_df, als_model, anime_ids, anime_map, user_id, user_map, ratings_df):
   
    # Create user vector
    user_idx = user_map[user_id]
    user_ratings = ratings_df[ratings_df['userID'] == user_id]
    rows = np.zeros(len(user_ratings))
    cols = user_ratings['idMal'].map(anime_map).values
    vals = user_ratings['rating'].values
    user_item_vector = csr_matrix((vals, (rows, cols)), shape=(1, len(anime_ids)))

    # Similarity
    recommended_indexes, recommended_scores = als_model.recommend(
        user_idx,
        user_items=user_item_vector,
        N=len(anime_ids),
        filter_already_liked_items=False
    )
    recommended_ids = [anime_ids[idx] for idx in recommended_indexes]

    # Filter
    seen_train_set = set(user_train['idMal'])
    recs_filtered = [(rid, score) for rid, score in zip(recommended_ids, recommended_scores) if rid not in seen_train_set]

    # Results
    recommendations_cf_df = pd.DataFrame(recs_filtered, columns=['idMal', 'score']).set_index('idMal')
    recommendations_cf_df = recommendations_cf_df.join(anime_df.set_index('idMal')['title'], how='left')
    scaler = MinMaxScaler()
    recommendations_cf_df['score'] = scaler.fit_transform(recommendations_cf_df[['score']])

    return recommendations_cf_df

In [None]:
import itertools
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

def tune_als(
    ratings_df,
    anime_df,
    anime_feature_matrix,
    anime_id_to_index,
    users=5,
    min_anime_rated=80,
    test_size=0.2,
    top_k=10,
    rating_threshold=7,
    param_grid=None,
    seed=42
):
    np.random.seed(seed)
    results = []

    if param_grid is None:
        param_grid = {
            "factors": [50, 100, 150],
            "regularization": [0.01, 0.1, 1],
            "iterations": [10, 20],
        }

    anime_ids = ratings_df['idMal'].unique()
    anime_map = {aid: i for i, aid in enumerate(anime_ids)}
    user_ids = ratings_df['userID'].unique()
    user_map = {uid: i for i, uid in enumerate(user_ids)}

    for factors, reg, iters in itertools.product(
        param_grid["factors"], param_grid["regularization"], param_grid["iterations"]
    ):
        rows = ratings_df['userID'].map(user_map)
        cols = ratings_df['idMal'].map(anime_map)
        vals = ratings_df['rating']
        user_item_matrix = csr_matrix((vals, (rows, cols)), shape=(len(user_map), len(anime_map)))

        als_model = AlternatingLeastSquares(factors=factors, regularization=reg, iterations=iters)
        als_model.fit(user_item_matrix)

        sampled_users = sample_users(ratings_df, n_users=users, min_anime_rated=min_anime_rated, seed=seed)

        metrics_list = []
        for u in sampled_users:
            user_ratings = ratings_df[ratings_df['userID'] == u]
            n_test = max(1, int(len(user_ratings) * test_size))
            test_df = user_ratings.sample(n=n_test, random_state=seed)
            train_df = user_ratings.drop(test_df.index)

            recs = recommend_for_user_cf(train_df, anime_df, als_model, anime_ids, anime_map, u, user_map, ratings_df)
            metrics = evaluate_ratings(test_df, recs, anime_df, anime_feature_matrix, anime_id_to_index, rating_threshold=rating_threshold, k=top_k)
            metrics_list.append(metrics)


        avg_metrics = {k: np.mean([m[k] for m in metrics_list]) for k in metrics_list[0]}
        avg_metrics.update({"factors": factors, "reg": reg, "iterations": iters})
        results.append(avg_metrics)

    return pd.DataFrame(results)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df_results = tune_als(
    ratings_df,
    anime_df,
    X_weighted,
    id_to_idx,
    users=50,
    min_anime_rated=50,
    top_k=10,
    rating_threshold=7
)

df_results.sort_values('MAP', ascending=False)

  check_blas_config()
100%|██████████| 10/10 [03:41<00:00, 22.12s/it]
100%|██████████| 20/20 [06:46<00:00, 20.31s/it]
100%|██████████| 10/10 [03:22<00:00, 20.25s/it]
100%|██████████| 20/20 [07:14<00:00, 21.72s/it]
100%|██████████| 10/10 [03:19<00:00, 19.98s/it]
100%|██████████| 20/20 [06:54<00:00, 20.73s/it]
100%|██████████| 10/10 [04:03<00:00, 24.32s/it]
100%|██████████| 20/20 [08:02<00:00, 24.10s/it]
100%|██████████| 10/10 [03:56<00:00, 23.68s/it]
100%|██████████| 20/20 [08:01<00:00, 24.07s/it]
100%|██████████| 10/10 [03:56<00:00, 23.67s/it]
100%|██████████| 20/20 [08:11<00:00, 24.57s/it]
100%|██████████| 10/10 [05:48<00:00, 34.81s/it]
100%|██████████| 20/20 [11:40<00:00, 35.03s/it]
100%|██████████| 10/10 [05:53<00:00, 35.33s/it]
100%|██████████| 20/20 [12:57<00:00, 38.89s/it]
100%|██████████| 10/10 [06:01<00:00, 36.16s/it]
100%|██████████| 20/20 [12:32<00:00, 37.62s/it]


Unnamed: 0,Precision,Recall,MAP,MRR,Diversity,Novelty,Relevant animes,Test-set size,factors,reg,iterations
12,0.598,0.264613,0.72048,0.793333,0.568534,0.199179,38.18,64.78,150,0.01,10
16,0.594,0.261301,0.717857,0.773667,0.55916,0.202648,38.18,64.78,150,1.0,10
17,0.586,0.253477,0.709801,0.768333,0.559037,0.204758,38.18,64.78,150,1.0,20
14,0.598,0.262252,0.701546,0.7615,0.568782,0.201426,38.18,64.78,150,0.1,10
13,0.6,0.266405,0.690601,0.7725,0.561126,0.202691,38.18,64.78,150,0.01,20
15,0.59,0.261198,0.689549,0.757238,0.563842,0.207937,38.18,64.78,150,0.1,20
7,0.55,0.234371,0.688105,0.773,0.553185,0.198342,38.18,64.78,100,0.01,20
1,0.5,0.205503,0.686325,0.8,0.540735,0.178577,38.18,64.78,50,0.01,20
2,0.524,0.207877,0.684956,0.770857,0.546465,0.174992,38.18,64.78,50,0.1,10
10,0.546,0.230276,0.684945,0.724524,0.550762,0.192516,38.18,64.78,100,1.0,10
