## Métricas para el sistema de recomendación

### MAE (Mean Absolute Error)

In [1]:
import pandas as pd
import numpy as np
from data_reader import *

In [2]:
from sklearn.model_selection import train_test_split

md_genres, ratings, md = read_data()
# ratings.drop('timestamp', axis=1, inplace=True)
rate = ratings.loc[ratings['rating'] != 0]

train_data, test_data = train_test_split(rate, test_size=0.3, random_state=42)


In [3]:
def build_matrix(ratings, md_genres):
    """
    Crea la matriz híbrida utilizada para calcular las similitudes entre los usuarios y sus preferencias de películas.

    Returns
    -------
    hybrid_matrix : DataFrame
        Un DataFrame que representa las preferencias de los usuarios por diferentes géneros.
    user_movie_matrix : DataFrame
        Un DataFrame que representa las calificaciones de los usuarios para diferentes películas.
    md : DataFrame
        Un DataFrame con las películas y sus géneros, donde los géneros están en una sola columna.
    """
    merged = pd.merge(ratings, md_genres, on='movieId')
    merged['rating>3'] = (merged['rating'] > 3).astype(int)

    grouped = merged.groupby(['userId', 'genres'])['rating>3'].sum().reset_index()
    grouped = grouped[grouped['genres'] != '(no genres listed)']

    pivot_table = grouped.pivot(index='userId', columns='genres', values='rating>3').fillna(0)
    total_ratings = pivot_table.sum(axis=1)
    proportions = pivot_table.div(total_ratings, axis=0)

    likes_many_X_movies = pd.DataFrame(0, index=proportions.index, columns=proportions.columns)
    likes_some_X_movies = pd.DataFrame(0, index=proportions.index, columns=proportions.columns)

    for user in proportions.index:
        sorted_genres = proportions.loc[user].sort_values(ascending=False)
        n_genres = len(sorted_genres)
        if n_genres == 0:
            continue
        first_third = int(n_genres / 3)
        second_third = int(2 * n_genres / 3)
        likes_many_X_movies.loc[user, sorted_genres.index[:first_third]] = 1
        likes_some_X_movies.loc[user, sorted_genres.index[first_third:second_third]] = 1

    likes_many_X_movies_prefixed = likes_many_X_movies.add_prefix('likes_many_')
    likes_some_X_movies_prefixed = likes_some_X_movies.add_prefix('likes_some_')
    interleaved_columns = np.array(list(zip(likes_many_X_movies_prefixed.columns, likes_some_X_movies_prefixed.columns))).flatten()
    likes_many_X_movies_prefixed = likes_many_X_movies_prefixed.reindex(columns=interleaved_columns)
    likes_some_X_movies_prefixed = likes_some_X_movies_prefixed.reindex(columns=interleaved_columns)

    hybrid_matrix = pd.DataFrame()

    for col_many, col_some in zip(likes_many_X_movies_prefixed.columns, likes_some_X_movies_prefixed.columns):
        if not likes_many_X_movies_prefixed[col_many].isna().all():
            hybrid_matrix = pd.concat([hybrid_matrix, likes_many_X_movies_prefixed[col_many]], axis=1)
        if not likes_some_X_movies_prefixed[col_some].isna().all():
            hybrid_matrix = pd.concat([hybrid_matrix, likes_some_X_movies_prefixed[col_some]], axis=1)

    user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    return hybrid_matrix, user_movie_matrix, md


In [4]:
def find_neighbors(hybrid_matrix, user_id):
    """
    Encuentra los vecinos más cercanos (usuarios similares) para el usuario objetivo.

    Parameters
    ----------
    hybrid_matrix : DataFrame
        Un DataFrame que representa las preferencias de los usuarios por diferentes géneros.
    user_id : int
        El ID del usuario para el cual se quieren encontrar vecinos.

    Returns
    -------
    list of tuples
        Una lista de tuplas que representan los vecinos más cercanos y sus puntajes de similitud.
    """
    num_rows = hybrid_matrix.shape[0] - 1
    k = int(num_rows * 0.1)

    similitudes = {}
    user_vector = hybrid_matrix.loc[user_id]
    for other_user in hybrid_matrix.index:

        if other_user == user_id:
            continue
        other_user_vector = hybrid_matrix.loc[other_user]
        sim = np.dot(user_vector, other_user_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(other_user_vector))
        similitudes[other_user] = sim

    sorted_neighbors = sorted(similitudes.items(), key=lambda x: x[1], reverse=True)

    return sorted_neighbors[:k]

In [5]:

def predict_user_rating(user_id,ratings, item, neighbors):
      """
      Predice la calificación que un usuario podría dar a una película específica basada en las calificaciones de sus vecinos.

      Parameters
      ----------
      ratings : DataFrame
          Un DataFrame que representa las calificaciones de los usuarios para diferentes películas.
      item : int
          El ID de la película para la cual se quiere predecir la calificación.
      neighbors : list of tuples
          Una lista de tuplas que representan los vecinos más cercanos y sus puntajes de similitud.
      user_id : int
          El ID del usuario para el cual se quiere predecir la calificación.

      Returns
      -------
      float
          La calificación predicha para la película especificada.
      """
      numerator = 0
      denominator = 0
      for neighbor, similarity in neighbors:
          neighbor_rating = ratings.loc[neighbor, item]
          neighbor_avg = ratings.loc[neighbor].mean()
          if neighbor_rating > 0:
              numerator += similarity * (neighbor_rating - neighbor_avg)
              denominator += abs(similarity)
      if denominator == 0:
          return 0
      user_avg = ratings.loc[user_id].mean()
      result = user_avg + (numerator / denominator)
      return result

In [6]:
def recommend_movies(test_data,md_genres):
    """
    Genera una lista de películas recomendadas para el usuario objetivo.

    Returns
    -------
    DataFrame
        Un DataFrame que contiene las películas recomendadas y sus calificaciones predichas.
    """
    results = []
    hybrid_matrix, ratings, movies = build_matrix(test_data, md_genres)


    for user_id in test_data['userId'].unique():
        neighbors = find_neighbors(hybrid_matrix, user_id)
        predicted_rating = []
        user_rows = test_data.loc[(test_data['userId'] == user_id)]

        for movieId in user_rows['movieId']:
            rate = round(predict_user_rating(user_id, ratings, movieId, neighbors), 1)
            if(rate):
                predicted_rating.append((user_id,movieId, rate))


        results.append(predicted_rating)

    return results

# predictions = recommend_movies(test_data,md_genres)


In [3]:
from recommender import *

rec = Recommender(1)
predictions = rec.recommend_movies_for_test(test_data, md_genres, ratings, md )


NameError: name 'rates' is not defined

In [7]:
pred = [item for sublist in predictions for item in sublist]

In [8]:
df_predictions = pd.DataFrame(pred, columns=['userId', 'movieId', 'rating'])

df_recommendation = df_predictions.loc[df_predictions['rating'] > 3]

In [9]:
numerador = 0
denominador = 0

for row in df_predictions.itertuples():
    user_id, movie_id, rating = row.userId, row.movieId, row.rating
    actual_rating = test_data[(test_data['userId'] == user_id) & (test_data['movieId'] == movie_id)]['rating'].values[0]
    numerador += abs(rating - actual_rating)
    denominador += 1

mae = numerador / denominador if denominador != 0 else 0

r_max = test_data['rating'].max()
r_min = test_data['rating'].min()

nmae = mae / (r_max - r_min)

rmse = np.sqrt(np.mean((df_predictions['rating'] - test_data['rating']) ** 2))

print(f'MAE: {mae}')
print(f'NMAE: {nmae}')
print(f'RMSE: {rmse}')

MAE: 0.8289729877965243
NMAE: 0.18421621951033873
RMSE: 1.2788740258314577


In [16]:
test_data_recommended = pd.merge(df_recommendation[['userId', 'movieId']], test_data, on=['userId', 'movieId'], how='inner')
valid = test_data_recommended.loc[test_data_recommended['rating'] > 3]

# Inicializar listas para almacenar las métricas por usuario
precisions = []
recalls = []
f1_scores = []

# Inicializar listas para los cálculos micro
hits_per_user = []
recset_per_user = []
testset_per_user = []

for user in valid['userId'].unique():
    print(f'User: {user}')

    # Calcular hits, recset y testset por usuario
    hits = valid.loc[valid['userId'] == user].shape[0]
    recset = df_recommendation.loc[df_recommendation['userId'] == user].shape[0]
    testset = test_data.loc[test_data['userId'] == user].shape[0]

    # Calcular Precision, Recall y F1 por usuario
    Precision = hits / recset if recset > 0 else 0
    Recall = hits / testset if testset > 0 else 0
    F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0

    # Almacenar las métricas calculadas
    precisions.append(Precision)
    recalls.append(Recall)
    f1_scores.append(F1)

    # Almacenar los valores para el cálculo micro
    hits_per_user.append(hits)
    recset_per_user.append(recset)
    testset_per_user.append(testset)

    print(f'Precision: {Precision}')
    print(f'Recall: {Recall}')
    print(f'F1: {F1}')


User: 432
Precision: 0.6808510638297872
Recall: 0.4050632911392405
F1: 0.5079365079365079
User: 288
Precision: 0.5277777777777778
Recall: 0.2745664739884393
F1: 0.3612167300380228
User: 599
Precision: 0.29941860465116277
Recall: 0.14325452016689846
F1: 0.19379115710254
User: 42
Precision: 0.6842105263157895
Recall: 0.312
F1: 0.4285714285714286
User: 75
Precision: 0.5384615384615384
Recall: 0.3333333333333333
F1: 0.41176470588235287
User: 51
Precision: 0.7959183673469388
Recall: 0.38235294117647056
F1: 0.5165562913907285
User: 354
Precision: 0.8771929824561403
Recall: 0.684931506849315
F1: 0.7692307692307693
User: 416
Precision: 0.6
Recall: 0.2727272727272727
F1: 0.37499999999999994
User: 438
Precision: 0.7666666666666667
Recall: 0.36507936507936506
F1: 0.49462365591397855
User: 73
Precision: 0.75
Recall: 0.45454545454545453
F1: 0.5660377358490566
User: 567
Precision: 0.1346153846153846
Recall: 0.05511811023622047
F1: 0.0782122905027933
User: 391
Precision: 0.7536231884057971
Recall: 0.

In [17]:
# Calcular métricas macro
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1_scores) / len(f1_scores)

# Calcular métricas micro
total_hits = sum(hits_per_user)
total_recset = sum(recset_per_user)
total_testset = sum(testset_per_user)

micro_precision = total_hits / total_recset if total_recset > 0 else 0
micro_recall = total_hits / total_testset if total_testset > 0 else 0
micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

# Imprimir métricas globales
print(f'Average Precision (Macro): {avg_precision:.4f}')
print(f'Average Recall (Macro): {avg_recall:.4f}')
print(f'Average F1 (Macro): {avg_f1:.4f}')

print(f'Precision (Micro): {micro_precision:.4f}')
print(f'Recall (Micro): {micro_recall:.4f}')
print(f'F1 (Micro): {micro_f1:.4f}')


Average Precision (Macro): 0.7382
Average Recall (Macro): 0.4613
Average F1 (Macro): 0.5558
Precision (Micro): 0.7017
Recall (Micro): 0.4010
F1 (Micro): 0.5103


In [20]:
# Función para calcular Average Precision (AP) por usuario
def average_precision(predicted_ratings, isHitFunc, getPropertyFunc):
    rel = 0
    numerator = 0
    for index, rating in enumerate(predicted_ratings):
        if isHitFunc(getPropertyFunc(rating)):
            rel += 1
            numerator += (rel / (index + 1))
    if rel > 0:
        return numerator / rel
    else:
        return 0

# Función para calcular el MAP
def mean_average_precision(df_recommendation, test_data):
    # Inicializar la lista para almacenar los AP de cada usuario
    average_precisions = []

    # Iterar sobre cada usuario único en las recomendaciones
    for user in df_recommendation['userId'].unique():
        # Obtener las recomendaciones para el usuario actual
        user_recommendations = df_recommendation[df_recommendation['userId'] == user]

        # Obtener las calificaciones reales para el usuario en el conjunto de prueba
        user_valid = valid[valid['userId'] == user]

        # Calcular el Average Precision para el usuario actual
        ap = average_precision(
            user_recommendations.itertuples(),
            lambda movieId: movieId in user_valid['movieId'].values,
            lambda row: row.movieId
        )

        # Añadir el AP calculado a la lista
        average_precisions.append(ap)

    # Calcular el MAP como el promedio de todos los AP
    if len(average_precisions) > 0:
        return np.mean(average_precisions)
    else:
        return 0

map_score = mean_average_precision(df_recommendation, test_data)
print(f'MAP: {map_score:.4f}')


MAP: 0.7731
