In [54]:
# Cella 1
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Per le regole associative
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [55]:
# Caricamento dati
url = "../datasets/ml-100k/u.data"
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_csv(url, sep='\t', names=columns)
df.drop('timestamp', axis=1, inplace=True)

In [56]:
# Creazione matrice utente-film
ratings_matrix = df.pivot_table(index='user_id', columns='movie_id', values='rating')
ratings_matrix.fillna(0, inplace=True)


In [57]:
# Divisione train-test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_matrix = train_data.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

In [58]:
# Calcolo similarità tra utenti
user_similarity = cosine_similarity(train_matrix)
similarity_df = pd.DataFrame(user_similarity, index=train_matrix.index, columns=train_matrix.index)

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [59]:
# Funzione per prevedere il rating
def predict_rating(user_id, movie_id, k=5):
    sim_scores = similarity_df[user_id]
    movie_ratings = train_matrix[movie_id]
    valid_users = movie_ratings[movie_ratings > 0].index
    top_users = sim_scores[valid_users].sort_values(ascending=False).head(k)
    weighted_sum = np.dot(top_users, movie_ratings[top_users.index])
    sim_sum = top_users.sum()
    if sim_sum == 0:
        return train_matrix.loc[user_id].mean()
    return weighted_sum / sim_sum

In [60]:
# Preparazione dei dati per le regole associative
transactions = (
    df[df['rating'] >= 4]
      .groupby('user_id')['movie_id']
      .apply(list)
      .tolist()
)

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
trans_df = pd.DataFrame(te_ary, columns=te.columns_)

In [61]:
frequent_itemsets = apriori(trans_df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

In [62]:
# Stampa delle prime regole ordinate per lift
top_rules = rules.sort_values('lift', ascending=False).head(10)
print(top_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

              antecedents         consequents   support  confidence      lift
15108      (50, 195, 181)      (96, 172, 174)  0.105096    0.727941  4.633247
15097      (96, 172, 174)      (50, 195, 181)  0.105096    0.668919  4.633247
15101      (96, 181, 174)      (50, 195, 172)  0.105096    0.744361  4.613079
15104      (50, 195, 172)      (96, 181, 174)  0.105096    0.651316  4.613079
15084  (96, 50, 172, 174)          (195, 181)  0.105096    0.692308  4.592633
15118          (195, 181)  (96, 50, 172, 174)  0.105096    0.697183  4.592633
14823          (195, 181)      (96, 172, 174)  0.107219    0.711268  4.527122
14810      (96, 172, 174)          (195, 181)  0.107219    0.682432  4.527122
14821          (195, 172)      (96, 181, 174)  0.107219    0.631250  4.470959
14812      (96, 181, 174)          (195, 172)  0.107219    0.759398  4.470959


In [None]:
# Funzione per raccomandazioni ibride (CF + Association Rules)
def hybrid_recommendations(user_id, top_n=10, cf_weight=0.7):
    cf_scores = {}
    # Calcola CF solo su film del training
    for movie in train_matrix.columns:
        cf_scores[movie] = predict_rating(user_id, movie, k=5) if user_id in train_matrix.index else 0
    # Normalizza CF
    max_cf, min_cf = max(cf_scores.values()), min(cf_scores.values())
    cf_scores = {m: (s - min_cf)/(max_cf - min_cf) for m, s in cf_scores.items()} if max_cf>min_cf else cf_scores

    # Punteggi AR   
    user_high = set(df[(df['user_id']==user_id) & (df['rating']>=4)]['movie_id'])
    ar_scores = {m:0 for m in train_matrix.columns}
    for _, rule in top_rules.iterrows():
        antecedents = set(rule['antecedents'])
        consequents = set(rule['consequents'])
        if antecedents.issubset(user_high):
            for m in consequents:
                if m in ar_scores:
                    ar_scores[m] += rule['confidence'] * rule['lift']
    # Normalizza AR
    max_ar, min_ar = max(ar_scores.values()), min(ar_scores.values())
    ar_scores = {m: (s - min_ar)/(max_ar - min_ar) for m, s in ar_scores.items()} if max_ar>min_ar else ar_scores

    # Combina CF e AR
    final_scores = {m: cf_weight*cf_scores.get(m,0) + (1-cf_weight)*ar_scores.get(m,0) for m in train_matrix.columns}
    seen = set(df[df['user_id']==user_id]['movie_id'])
    candidates = {m: s for m, s in final_scores.items() if m not in seen}
    recs = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return [m for m, _ in recs]


In [64]:
user_test = 1
if user_test in ratings_matrix.index:
    print(f"Raccomandazioni ibride per utente {user_test}:", hybrid_recommendations(user_test, top_n=10, cf_weight=0.7))
else:
    print(f"L'utente {user_test} non è presente nei dati di training.")

NameError: name 'user_high' is not defined

In [None]:
# Valutazione del modello
true_ratings = []
predicted_ratings = []
for _, row in test_data.iterrows():
    uid, mid, true_rating = row['user_id'], row['movie_id'], row['rating']
    pred = predict_rating(uid, mid, k=5)
    true_ratings.append(true_rating)
    predicted_ratings.append(round(pred))

accuracy = accuracy_score(true_ratings, predicted_ratings)
precision = precision_score(true_ratings, predicted_ratings, average='macro')
recall = recall_score(true_ratings, predicted_ratings, average='macro')
f1 = f1_score(true_ratings, predicted_ratings, average='macro')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


KeyError: np.int64(1640)