# Local C.F. with cold Start

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
# load the dataset
ratings_df = pd.read_csv('../datasets/ml-100k/u.data', sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings_df.drop(columns='timestamp', inplace=True)

# Conta il numero di rating per ogni utente
user_rating_counts = ratings_df['user_id'].value_counts()

# Tieni solo gli utenti con almeno 20 valutazioni
active_users = user_rating_counts[user_rating_counts >= 20].index

# Filtra il dataset originale
filtered_df = ratings_df[ratings_df['user_id'].isin(active_users)]
print(f"Numero di utenti attivi (almeno 20 valutazioni): {len(active_users)} (total: {len(ratings_df['user_id'].unique())})")
ratings_df.head()

Numero di utenti attivi (almeno 20 valutazioni): 943 (total: 943)


Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
users_df = pd.read_csv('../datasets/ml-100k/u.user', sep='|', header=None, names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
all_in = users_df['user_id'].isin(ratings_df['user_id']).all()
print("Tutti users presenti in rating_df e users_df:", all_in)
print("Utenti:", len(users_df), "Rating:", len(ratings_df))

Tutti users presenti in rating_df e users_df: True
Utenti: 943 Rating: 100000


In [4]:
# Split
ratings_train, ratings_test = train_test_split(ratings_df, test_size=0.3, random_state=42)
ratings_train = pd.DataFrame(ratings_train, columns=['user_id', 'movie_id', 'rating'])
ratings_test = pd.DataFrame(ratings_test, columns=['user_id', 'movie_id', 'rating'])

In [5]:
# Scegli 5 utenti attivi da usare come cold start
np.random.seed(42)  # per riproducibilità
cold_start_users = np.random.choice(active_users, size=15, replace=False)
print(f"Utenti cold start selezionati: {cold_start_users}")

# Separa le valutazioni dei cold start users
ratings_cold_users = filtered_df[filtered_df['user_id'].isin(cold_start_users)]

# Resto del dataset (senza i cold start users)
rest_df = filtered_df[~filtered_df['user_id'].isin(cold_start_users)]

# Split 70/30 sul resto del dataset
rest_train, rest_test = train_test_split(rest_df, test_size=0.3, random_state=42)

# Costruzione dei set finali
ratings_train = rest_train.copy()
ratings_test = pd.concat([rest_test, ratings_cold_users], ignore_index=True)

# Controlli
print("Utenti nel training:", ratings_train['user_id'].nunique())
print("Utenti nel test:", ratings_test['user_id'].nunique())
print("Cold start users nel training?", any(u in ratings_train['user_id'].values for u in cold_start_users))
print("Cold start users nel test:", all(u in ratings_test['user_id'].values for u in cold_start_users))


Utenti cold start selezionati: [932 634 598 435 758 763 929  72 938 921 610 329 381 729 282]
Utenti nel training: 928
Utenti nel test: 943
Cold start users nel training? False
Cold start users nel test: True


In [6]:
rating_matrix = ratings_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1674,1675,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [7]:
# 2. Prepara “sentences” per Word2Vec includendo anche zip_code
sentences = users_df[['gender', 'occupation', 'zip_code']].astype(str).values.tolist()

# 3. Allena il modello Word2Vec
w2v = Word2Vec(
    sentences,
    vector_size=8,    # dimensione del vettore di embedding
    window=2,
    min_count=1,
    epochs=100,
    seed=42
)

# 4. Definisci funzione per calcolare embedding medio per riga
def embed_row(row):
    vecs = [w2v.wv[row['gender']], 
            w2v.wv[row['occupation']], 
            w2v.wv[row['zip_code']]]
    # media vettoriale
    return sum(vecs) / len(vecs)

# 5. Applica embedding a tutto il DataFrame
embeddings = users_df.apply(embed_row, axis=1)
df_emb = pd.DataFrame(
    embeddings.tolist(),
    columns=[f'emb_{i}' for i in range(w2v.vector_size)]
)
# Aggiungiamo eventualmente altre feature numeriche (es. age)
df_emb['age'] = users_df['age']

# 6. Clustering sui vettori di embedding
kmeans = KMeans(n_clusters=4, random_state=42)
users_df['cluster'] = kmeans.fit_predict(df_emb)

In [8]:
# Matrice di similarita per utenti dello stesso cluster

def create_similarity_matrix_local(rating_matrix, users_df, cluster_label, similarity='cosine'):

    # Estrai gli ID utenti nel cluster
    cluster_user_ids = users_df.loc[users_df['cluster'] == cluster_label, 'user_id']
    
    # Sottocampiona la matrice di rating solo per quegli utenti
    R_cluster = rating_matrix.loc[rating_matrix.index.intersection(cluster_user_ids)]

    # Centra i voti rispetto alla media utente
    R_centered = R_cluster.sub(R_cluster.mean(axis=1), axis=0).fillna(0)

    # Calcola la similarità
    if similarity == "pearson":
        similarity_matrix = 1 - pairwise_distances(R_centered, metric="correlation")
    elif similarity == "cosine":
        similarity_matrix = cosine_similarity(R_centered)
    else:
        raise ValueError("Unsupported similarity metric. Use 'pearson' or 'cosine'.")

    return pd.DataFrame(similarity_matrix, index=R_cluster.index, columns=R_cluster.index)


def predict_cluster_ratings(rating_matrix, users_df, cluster_label, similarity='cosine'):

    # 1. Estrai la sotto-matrice di rating e la matrice di similarità locale
    cluster_user_ids = users_df.loc[users_df['cluster'] == cluster_label, 'user_id']
    R_cluster = rating_matrix.loc[rating_matrix.index.intersection(cluster_user_ids)]

    if R_cluster.shape[0] < 2:
        return pd.DataFrame(index=cluster_user_ids, columns=rating_matrix.columns)

    # 2. Crea la matrice di similarità
    similarity_matrix = create_similarity_matrix_local(rating_matrix, users_df, cluster_label, similarity)

    # 3. Predizioni user-based sul cluster
    user_means = R_cluster.replace(0, np.nan).mean(axis=1)
    user_centered = R_cluster.sub(user_means, axis=0).fillna(0)

    numerator_user = similarity_matrix.dot(user_centered)
    denominator_user = np.abs(similarity_matrix).sum(axis=1)

    prediction_matrix = numerator_user.div(denominator_user, axis=0).add(user_means, axis=0)
    prediction_matrix = prediction_matrix.map(lambda x: 0 if x < 0.5 else 5 if x >= 5.5 else math.floor(x + 0.5))

    return prediction_matrix


In [9]:
all_predictions = []

for label in users_df['cluster'].unique():
    pred = predict_cluster_ratings(rating_matrix, users_df, cluster_label=label, similarity='cosine')
    print(pred.head())
    all_predictions.append(pred)

# Ricostruzione della matrice predizioni globale
final_prediction_matrix = pd.concat(all_predictions)
final_prediction_matrix = final_prediction_matrix.sort_index()



movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1            4     3     4     4     4     4     4     3     4     4  ...   
3            3     3     3     3     3     3     3     3     3     3  ...   
4            4     4     4     4     4     4     4     4     4     4  ...   
16           5     4     4     4     4     4     5     4     4     4  ...   
22           3     3     3     3     3     3     3     3     3     3  ...   

movie_id  1671  1672  1674  1675  1676  1677  1679  1680  1681  1682  
user_id                                                               
1            4     4     4     4     4     4     4     4     4     4  
3            3     3     3     3     3     3     3     3     3     3  
4            4     4     4     4     4     4     4     4     4     4  
16           4     4     4     4     4     4     4     4     4     4  
22           3     3     3     3  

In [10]:
def predict_all_user_ratings_clustered(rating_matrix, users_df, similarity='cosine'):
    
    # --- STEP 1: Predizioni per utenti noti ---
    all_predictions = []
    for label in users_df['cluster'].unique():
        pred = predict_cluster_ratings(rating_matrix, users_df, cluster_label=label, similarity=similarity)
        all_predictions.append(pred)

    final_prediction_matrix = pd.concat(all_predictions)
    final_prediction_matrix = final_prediction_matrix.sort_index()
    known_users = set(final_prediction_matrix.index)

    # --- STEP 2: Predizioni per utenti cold-start ---
    all_users = set(users_df['user_id'])
    cold_start_users = list(all_users - known_users)
    cold_preds = {}

    for uid in cold_start_users:
        cluster_row = users_df[users_df['user_id'] == uid]
        if cluster_row.empty:
            print(f"Utente {uid} non trovato.")
            continue

        cluster = cluster_row['cluster'].values[0]
        cluster_user_ids = users_df[users_df['cluster'] == cluster]['user_id']
        cluster_ratings = rating_matrix.loc[rating_matrix.index.intersection(cluster_user_ids)]

        if cluster_ratings.empty:
            print(f"Cluster {cluster} vuoto per cold-start {uid}")
            continue

        cold_preds[uid] = cluster_ratings.mean()

    cold_df = pd.DataFrame.from_dict(cold_preds, orient='index')

    # --- STEP 3: Unione ---
    full_matrix = pd.concat([final_prediction_matrix, cold_df])
    full_matrix = full_matrix.sort_index().reindex(columns=rating_matrix.columns).fillna(0)
    full_matrix.index.name = 'user_id'

    return full_matrix

In [11]:
pred_matrix = predict_all_user_ratings_clustered(rating_matrix, users_df, similarity='cosine')

In [12]:
prediction_df = pred_matrix.reset_index().melt(
    id_vars='user_id', 
    var_name='movie_id', 
    value_name='predicted_rating'
)
print(prediction_df.head())

   user_id movie_id  predicted_rating
0        1        1               4.0
1        2        1               4.0
2        3        1               3.0
3        4        1               4.0
4        5        1               3.0


In [13]:
def evaluate_ratings(ratings_test, prediction_df, thresholds=[3.0]):
    # Unione dei dataframe test e predizioni
    valutazione_df = ratings_test.merge(prediction_df, on=["user_id", "movie_id"], how="inner")

    # Calcolo RMSE e MAE (rimangono invariati per tutte le soglie)
    rmse = np.sqrt(mean_squared_error(valutazione_df["rating"], valutazione_df["predicted_rating"]))
    mae = mean_absolute_error(valutazione_df["rating"], valutazione_df["predicted_rating"])

    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print("")

    # Lista per salvare i risultati
    risultati = []

    for threshold in thresholds:
        # Binarizzazione in base alla soglia
        valutazione_df["true_label"] = (valutazione_df["rating"] >= threshold).astype(int)
        valutazione_df["predicted_label"] = (valutazione_df["predicted_rating"] >= threshold).astype(int)

        # Calcolo metriche di classificazione
        accuracy = accuracy_score(valutazione_df["true_label"], valutazione_df["predicted_label"])
        precision = precision_score(valutazione_df["true_label"], valutazione_df["predicted_label"], zero_division=0)
        recall = recall_score(valutazione_df["true_label"], valutazione_df["predicted_label"], zero_division=0)

        # Stampa risultati
        print(f"Threshold: {threshold}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print("")

        # Salva risultati in lista
        risultati.append({
            "threshold": threshold,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        })

    return pd.DataFrame(risultati)

In [14]:
thresholds = [2, 3, 4.0]
# Valutazione delle predizioni con le metriche di classificazione per la Similarità Coseno
evaluate_ratings(ratings_test, prediction_df, thresholds=thresholds)

RMSE: 1.0499
MAE: 0.7774

Threshold: 2
Accuracy: 0.9424
Precision: 0.9437
Recall: 0.9985

Threshold: 3
Accuracy: 0.8317
Precision: 0.8392
Recall: 0.9852

Threshold: 4.0
Accuracy: 0.6484
Precision: 0.6841
Recall: 0.6818



Unnamed: 0,threshold,accuracy,precision,recall
0,2.0,0.942362,0.943651,0.998476
1,3.0,0.831747,0.839239,0.985207
2,4.0,0.648391,0.684071,0.68175


In [15]:
def precision_recall_ap_multi_k(ratings_test, prediction_df, k_values=[5], rel_thresholds=[4]):
    # Unione dei dataframe test e predizioni
    valutazione_df = ratings_test.merge(prediction_df, on=["user_id", "movie_id"], how="inner")
    all_metrics = []

    for rel_threshold in rel_thresholds:
        for k in k_values:
            for user_id, group in valutazione_df.groupby('user_id'):
                # Ordinare per predicted_rating decrescente
                group_sorted = group.sort_values('predicted_rating', ascending=False)

                # Rilevanti reali (ground truth)
                relevant_items = set(group[group['rating'] >= rel_threshold]['movie_id'])

                # Top-k raccomandati
                recommended = group_sorted.head(k)
                recommended_ids = list(recommended['movie_id'])

                # Precision@k
                rel_k = [1 if movie in relevant_items else 0 for movie in recommended_ids]
                precision_at_k = sum(rel_k) / k

                # Recall@k
                recall_at_k = sum(rel_k) / len(relevant_items) if relevant_items else 0.0

                # Average Precision@k (AP@k)
                num_hits = 0
                sum_precisions = 0.0
                for i, hit in enumerate(rel_k):
                    if hit:
                        num_hits += 1
                        sum_precisions += num_hits / (i + 1)
                ap_at_k = sum_precisions / len(relevant_items) if relevant_items else 0.0

                all_metrics.append({
                    'user_id': user_id,
                    'k': k,
                    'rel_threshold': rel_threshold,
                    'precision@k': precision_at_k,
                    'recall@k': recall_at_k,
                    'ap@k': ap_at_k
                })

    # Tutti i risultati
    results_df = pd.DataFrame(all_metrics)

    # MAP@k per ogni combinazione di k e soglia
    mapk_summary = (
        results_df.groupby(['k', 'rel_threshold'])['ap@k']
        .mean()
        .reset_index()
        .rename(columns={'ap@k': 'MAP@k'})
    )

    return results_df, mapk_summary

In [16]:
k_list = [5, 10, 15, 20]
threshold_list = [3, 4]

risultati_utenti, mapk = precision_recall_ap_multi_k(ratings_test, prediction_df, k_list, threshold_list)
risultati_utenti.head()

print("MAP@k per ogni combinazione di k e soglia:")
print(mapk)

KeyboardInterrupt: 