In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_absolute_error, root_mean_squared_error


In [None]:
# Path to the ml-100k ratings file and the column names assigned to its fields.
ratings_url = '../datasets/ml-100k/u.data'
columns = ['user_id', 'item_id', 'rating', 'timestamp']

# Read the tab-separated file and discard the timestamp column in one chain.
ratings = (
    pd.read_csv(ratings_url, sep='\t', names=columns)
    .drop(columns=['timestamp'])
)
ratings.head()

In [None]:
# User x item table of ratings; (user, item) pairs without a rating are set to 0.
rating_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='user_id',
    columns='item_id',
).fillna(0)
rating_matrix.shape

In [None]:
# Show the user x item rating matrix (entries without a rating were filled with 0).
rating_matrix

In [None]:
# Hold out 20% of the individual ratings for testing (fixed seed for reproducibility).
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Pivot each split into a user x item matrix, then align it to the axes of the
# full rating matrix so both splits share the same users and items (missing -> 0).
train_matrix = (
    train_data
    .pivot_table(index='user_id', columns='item_id', values='rating')
    .fillna(0)
    .reindex(index=rating_matrix.index, columns=rating_matrix.columns, fill_value=0)
)
test_matrix = (
    test_data
    .pivot_table(index='user_id', columns='item_id', values='rating')
    .fillna(0)
    .reindex(index=rating_matrix.index, columns=rating_matrix.columns, fill_value=0)
)

In [None]:
# Item-item cosine similarity computed from the training matrix (items as rows
# after the transpose), labelled with the item ids on both axes.
item_similarity = pd.DataFrame(
    cosine_similarity(train_matrix.T),
    index=rating_matrix.columns,
    columns=rating_matrix.columns,
)

item_similarity.iloc[0, :5]

In [None]:
def recommend_items(user_id, train_matrix, similarity_matrix, N=10):
    """Return the ids of the top-N items recommended to a user.

    Each item is scored as the similarity-weighted sum of the ratings the user
    has in ``train_matrix``, divided by the item's total absolute similarity.
    Items the user has already rated (non-zero entry) are excluded.

    Parameters
    ----------
    user_id : label
        Row label of the user in ``train_matrix``.
    train_matrix : pandas.DataFrame
        User x item rating matrix; an entry equal to 0 is treated as
        "not rated".
    similarity_matrix : pandas.DataFrame
        Item x item similarity matrix whose index/columns match the columns
        of ``train_matrix``.
    N : int, default 10
        Maximum number of items to return.

    Returns
    -------
    list
        Up to ``N`` item labels, ordered by decreasing score. Fewer than ``N``
        labels are returned when there are not enough scorable candidates.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``train_matrix``.

    Notes
    -----
    The denominator sums similarity over *all* items, not only the ones the
    user rated, so the score is a ranking signal rather than a value on the
    original rating scale.
    """
    # Ratings already given by the user.
    user_ratings = train_matrix.loc[user_id]

    # Predicted score per item: similarity-weighted sum of the user's ratings,
    # normalised by the item's total absolute similarity.
    weighted_sum = similarity_matrix.dot(user_ratings)
    total_similarity = np.abs(similarity_matrix).sum(axis=1)
    scores = weighted_sum / total_similarity

    # Keep only items the user has not rated yet. Items whose total similarity
    # is 0 get a 0/0 = NaN score; drop them so that an item with no usable
    # score can never be returned as a recommendation.
    candidates = scores[user_ratings == 0].dropna()

    # Top-N recommendations.
    top_n = candidates.sort_values(ascending=False).head(N)
    return top_n.index.tolist()


# Quick check: five recommendations for user 1.
recommend_items(user_id=1, train_matrix=train_matrix, similarity_matrix=item_similarity, N=5)

In [None]:
# Number of top-ranked recommendations per user; passed to the evaluate(...) call later in the notebook.
topN = 100

def evaluate(train_matrix, test_matrix, similarity_matrix, topN=10):
    """Evaluate the item-based recommender against held-out ratings.

    For every user in ``train_matrix`` the top-``topN`` recommendations from
    ``recommend_items`` are compared with the items that user rated >= 4 in
    ``test_matrix``. The per-user confusion counts are summed over all users
    and turned into accuracy, precision and recall. MAE and RMSE compare the
    similarity-weighted score with the held-out rating for every test entry
    greater than 0.

    Parameters
    ----------
    train_matrix : pandas.DataFrame
        User x item training ratings.
    test_matrix : pandas.DataFrame
        User x item held-out ratings; must contain every user of
        ``train_matrix``. Entries > 0 are treated as held-out ratings.
    similarity_matrix : pandas.DataFrame
        Item x item similarity matrix.
    topN : int, default 10
        Number of recommendations generated per user.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, mae, rmse)``. A ratio whose
        denominator is 0 is reported as 0; ``mae`` and ``rmse`` are NaN when
        no held-out rating has a non-NaN predicted score.

    Notes
    -----
    NOTE(review): the predicted score divides by the absolute similarity
    summed over every item, not only the items the user rated. If unrated
    entries of ``train_matrix`` are stored as 0 (presumably the case; confirm
    against the caller), the scores are shrunk toward 0 and are not on the
    rating scale, which inflates MAE/RMSE.
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0

    y_true = []
    y_pred = []

    # Loop invariants: the item universe and each item's total absolute similarity.
    all_items = set(train_matrix.columns)
    similarity_totals = np.abs(similarity_matrix).sum(axis=1)

    for user in train_matrix.index:
        test_row = test_matrix.loc[user]

        # Items the user actually liked in the test split (rating >= 4).
        true_items = set(test_row[test_row >= 4].index)
        # Recommendations generated from the training data.
        recs = set(recommend_items(user, train_matrix, similarity_matrix, N=topN))

        # Confusion counts for this user; a true negative is any item that is
        # neither recommended nor liked in the test split.
        true_positives += len(recs & true_items)
        false_positives += len(recs - true_items)
        false_negatives += len(true_items - recs)
        true_negatives += len(all_items - recs - true_items)

        # MAE / RMSE: compare predicted scores with the real ratings, only on
        # the items present in the test split. Selecting them in one step
        # replaces a per-item scalar lookup over every column.
        scores = similarity_matrix.dot(train_matrix.loc[user]) / similarity_totals
        held_out = test_row[test_row > 0]
        y_true.extend(held_out.to_numpy())
        y_pred.extend(scores.loc[held_out.index].to_numpy())

    # Global metrics (each ratio falls back to 0 when its denominator is empty).
    total = true_positives + true_negatives + false_positives + false_negatives
    accuracy = (true_positives + true_negatives) / total if total > 0 else 0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    # Drop pairs whose predicted score is NaN before computing the errors.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~np.isnan(y_pred)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    # The sklearn metrics reject empty input; report NaN instead of raising.
    if y_true.size == 0:
        return accuracy, precision, recall, float('nan'), float('nan')

    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    return accuracy, precision, recall, mae, rmse

In [None]:
# Run the evaluation with the configured list length and print a five-line report.
acc, prec, rec, mae, rmse = evaluate(train_matrix, test_matrix, item_similarity, topN)
print(
    f"Accuracy: {acc:.4f}\n"
    f"Precision: {prec:.4f}\n"
    f"Recall: {rec:.4f}\n"
    f"MAE: {mae:.4f}\n"
    f"RMSE: {rmse:.4f}"
)