In [141]:
# Cella 1
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from math import log2
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [142]:
# Caricamento dati
url = "../datasets/ml-100k/u.data"
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_csv(url, sep='\t', names=columns)
df.drop('timestamp', axis=1, inplace=True)

In [143]:
# Prepara matrici utente-film
df_ratings = df.copy()
ratings_matrix = df_ratings.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)


In [144]:
# Train-test split
train_data, test_data = train_test_split(df_ratings, test_size=0.2, random_state=42)
train_matrix = train_data.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

In [145]:
# Calcolo similarità tra utenti
user_similarity = cosine_similarity(train_matrix)
similarity_df = pd.DataFrame(user_similarity, index=train_matrix.index, columns=train_matrix.index)

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [146]:
def predict_rating(user_id, movie_id, k=5):
    if user_id not in train_matrix.index:
        return ratings_matrix.loc[user_id].mean()
    if movie_id not in train_matrix.columns:
        return train_matrix.loc[user_id].mean()
    sim_scores = similarity_df[user_id]
    movie_ratings = train_matrix[movie_id]
    valid_users = movie_ratings[movie_ratings > 0].index
    if len(valid_users) == 0:
        return train_matrix.loc[user_id].mean()
    top_users = sim_scores[valid_users].sort_values(ascending=False).head(k)
    weighted_sum = np.dot(top_users, movie_ratings[top_users.index])
    sim_sum = top_users.sum()
    return weighted_sum/sim_sum if sim_sum > 0 else train_matrix.loc[user_id].mean()

In [147]:
true_ratings = []
predicted_ratings = []
for _, row in test_data.iterrows():
    uid, mid, true_r = row['user_id'], row['movie_id'], row['rating']
    pred = predict_rating(uid, mid)
    true_ratings.append(true_r)
    predicted_ratings.append(round(pred))

In [None]:
# Metriche classificazione
acc = accuracy_score(true_ratings, predicted_ratings)
prec = precision_score(true_ratings, predicted_ratings, average='macro', zero_division=0)
rec = recall_score(true_ratings, predicted_ratings, average='macro', zero_division=0)

mae = mean_absolute_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))

print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

Rating Prediction -> Accuracy: 0.361, Precision: 0.295, Recall: 0.223, F1: 0.219
MAE: 0.816, RMSE: 1.222


In [149]:
# Preparazione transazioni e regole associative
transactions = df_ratings[df_ratings['rating'] >= 4].groupby('user_id')['movie_id'].apply(list).tolist()
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
trans_df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(trans_df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)
top_rules = rules.sort_values('lift', ascending=False).head(20)

In [150]:
# Funzione raccomandazioni ibride (CF + AR)
def hybrid_recommendations(user_id, top_n=10, cf_weight=0.7):
    # CF scores
    cf_scores = {m: predict_rating(user_id, m) for m in train_matrix.columns}
    # Normalizza CF
    min_cf, max_cf = min(cf_scores.values()), max(cf_scores.values())
    cf_scores = {m: (s-min_cf)/(max_cf-min_cf) if max_cf>min_cf else 0 for m, s in cf_scores.items()}
    # AR scores
    user_high = set(df_ratings[(df_ratings['user_id']==user_id) & (df_ratings['rating']>=4)]['movie_id'])
    ar_scores = {m:0 for m in train_matrix.columns}
    for _, rule in top_rules.iterrows():
        A, C = set(rule['antecedents']), set(rule['consequents'])
        if A.issubset(user_high):
            for m in C:
                if m in ar_scores:
                    ar_scores[m] += rule['confidence'] * rule['lift']
    min_ar, max_ar = min(ar_scores.values()), max(ar_scores.values())
    ar_scores = {m: (s-min_ar)/(max_ar-min_ar) if max_ar>min_ar else 0 for m, s in ar_scores.items()}
    # Combina e filtra
    final = {m: cf_weight*cf_scores[m] + (1-cf_weight)*ar_scores[m] for m in train_matrix.columns}
    seen = set(df_ratings[df_ratings['user_id']==user_id]['movie_id'])
    recs = [m for m,_ in sorted(final.items(), key=lambda x: x[1], reverse=True) if m not in seen]
    return recs[:top_n]

In [151]:
# Metriche Top-N (Precision@K, Recall@K, NDCG@K)
def precision_at_k(recommended, actual, k):
    rec_k = set(recommended[:k])
    return len(rec_k & set(actual)) / k

def recall_at_k(recommended, actual, k):
    return len(set(recommended[:k]) & set(actual)) / len(actual) if actual else 0

def ndcg_at_k(recommended, actual, k):
    dcg = sum((1/log2(i+1)) for i, m in enumerate(recommended[:k], start=1) if m in actual)
    idcg = sum(1/log2(i+1) for i in range(1, min(len(actual), k)+1))
    return dcg/idcg if idcg>0 else 0

In [152]:
# Valutazione Top-N per utenti di test
ks = [5, 10]
results = {f'P@{k}': [] for k in ks}
results.update({f'R@{k}': [] for k in ks})
results.update({f'NDCG@{k}': [] for k in ks})

for u in test_data['user_id'].unique():
    actual = test_data[test_data['user_id']==u]['movie_id'].tolist()
    recs = hybrid_recommendations(u, top_n=10)
    for k in ks:
        results[f'P@{k}'].append(precision_at_k(recs, actual, k))
        results[f'R@{k}'].append(recall_at_k(recs, actual, k))
        results[f'NDCG@{k}'].append(ndcg_at_k(recs, actual, k))

for metric, vals in results.items():
    print(f"{metric}: {np.mean(vals):.3f}")

P@5: 0.000
P@10: 0.000
R@5: 0.000
R@10: 0.000
NDCG@5: 0.000
NDCG@10: 0.000


In [154]:
# Esempio utilizzo raccomandazioni ibride
user_test = 1
rec = hybrid_recommendations(user_test, top_n=10, cf_weight=0.7)
print(f"Raccomandazioni ibride utente {user_test}:", rec)

Raccomandazioni ibride utente 1: [850, 1189, 1201, 1293, 1306, 1467, 1500, 1612, 1629, 1653]


In [153]:
# Visualizzazione film raccomandati
rec = hybrid_recommendations(u, top_n=10)

movies = pd.read_csv("../datasets/ml-100k/u.item", sep="|", encoding="latin-1", 
                     names=["movie_id", "title", "release_date", "video_release_date", "IMDb_URL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])
film = []
for r in rec:
    film.append(movies.loc[r, "title"])
film

['Two or Three Things I Know About Her (1966)',
 'That Old Feeling (1997)',
 'Maybe, Maybe Not (Bewegte Mann, Der) (1994)',
 'Cure, The (1995)',
 'Prisoner of the Mountains (Kavkazsky Plennik) (1996)',
 'Silence of the Palace, The (Saimt el Qusur) (1994)',
 'Angel Baby (1995)',
 'Chairman of the Board (1998)',
 'Ayn Rand: A Sense of Life (1997)',
 'Tokyo Fist (1995)']