## Métricas para el sistema de recomendación

### MAE (Mean Absolute Error)

In [2]:
import pandas as pd
import numpy as np
from data_reader import *

In [18]:
from sklearn.model_selection import train_test_split

md_genres, ratings, md = read_data()
ratings.drop('timestamp', axis=1, inplace=True)
rate = ratings.loc[ratings['rating'] != 0]

split_idx = int(0.8 * len(rate))
test_data = rate[split_idx:]


In [20]:
def build_matrix(ratings, md_genres):
    """
    Crea la matriz híbrida utilizada para calcular las similitudes entre los usuarios y sus preferencias de películas.

    Returns
    -------
    hybrid_matrix : DataFrame
        Un DataFrame que representa las preferencias de los usuarios por diferentes géneros.
    user_movie_matrix : DataFrame
        Un DataFrame que representa las calificaciones de los usuarios para diferentes películas.
    md : DataFrame
        Un DataFrame con las películas y sus géneros, donde los géneros están en una sola columna.
    """
    merged = pd.merge(ratings, md_genres, on='movieId')
    merged['rating>3'] = (merged['rating'] > 3).astype(int)

    grouped = merged.groupby(['userId', 'genres'])['rating>3'].sum().reset_index()
    grouped = grouped[grouped['genres'] != '(no genres listed)']

    pivot_table = grouped.pivot(index='userId', columns='genres', values='rating>3').fillna(0)
    total_ratings = pivot_table.sum(axis=1)
    proportions = pivot_table.div(total_ratings, axis=0)

    likes_many_X_movies = pd.DataFrame(0, index=proportions.index, columns=proportions.columns)
    likes_some_X_movies = pd.DataFrame(0, index=proportions.index, columns=proportions.columns)

    for user in proportions.index:
        sorted_genres = proportions.loc[user].sort_values(ascending=False)
        n_genres = len(sorted_genres)
        if n_genres == 0:
            continue
        first_third = int(n_genres / 3)
        second_third = int(2 * n_genres / 3)
        likes_many_X_movies.loc[user, sorted_genres.index[:first_third]] = 1
        likes_some_X_movies.loc[user, sorted_genres.index[first_third:second_third]] = 1

    likes_many_X_movies_prefixed = likes_many_X_movies.add_prefix('likes_many_')
    likes_some_X_movies_prefixed = likes_some_X_movies.add_prefix('likes_some_')
    interleaved_columns = np.array(list(zip(likes_many_X_movies_prefixed.columns, likes_some_X_movies_prefixed.columns))).flatten()
    likes_many_X_movies_prefixed = likes_many_X_movies_prefixed.reindex(columns=interleaved_columns)
    likes_some_X_movies_prefixed = likes_some_X_movies_prefixed.reindex(columns=interleaved_columns)

    hybrid_matrix = pd.DataFrame()

    for col_many, col_some in zip(likes_many_X_movies_prefixed.columns, likes_some_X_movies_prefixed.columns):
        if not likes_many_X_movies_prefixed[col_many].isna().all():
            hybrid_matrix = pd.concat([hybrid_matrix, likes_many_X_movies_prefixed[col_many]], axis=1)
        if not likes_some_X_movies_prefixed[col_some].isna().all():
            hybrid_matrix = pd.concat([hybrid_matrix, likes_some_X_movies_prefixed[col_some]], axis=1)

    user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    return hybrid_matrix, user_movie_matrix, md


In [21]:
def find_neighbors(hybrid_matrix, user_id):
    """
    Encuentra los vecinos más cercanos (usuarios similares) para el usuario objetivo.

    Parameters
    ----------
    hybrid_matrix : DataFrame
        Un DataFrame que representa las preferencias de los usuarios por diferentes géneros.
    user_id : int
        El ID del usuario para el cual se quieren encontrar vecinos.

    Returns
    -------
    list of tuples
        Una lista de tuplas que representan los vecinos más cercanos y sus puntajes de similitud.
    """
    num_rows = hybrid_matrix.shape[0] - 1
    k = int(num_rows * 0.15)

    similitudes = {}
    user_vector = hybrid_matrix.loc[user_id]
    for other_user in hybrid_matrix.index:

        if other_user == user_id:
            continue
        other_user_vector = hybrid_matrix.loc[other_user]
        sim = np.dot(user_vector, other_user_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(other_user_vector))
        similitudes[other_user] = sim

    sorted_neighbors = sorted(similitudes.items(), key=lambda x: x[1], reverse=True)

    return sorted_neighbors[:k]

In [22]:

def predict_user_rating(user_id,ratings, item, neighbors):
      """
      Predice la calificación que un usuario podría dar a una película específica basada en las calificaciones de sus vecinos.

      Parameters
      ----------
      ratings : DataFrame
          Un DataFrame que representa las calificaciones de los usuarios para diferentes películas.
      item : int
          El ID de la película para la cual se quiere predecir la calificación.
      neighbors : list of tuples
          Una lista de tuplas que representan los vecinos más cercanos y sus puntajes de similitud.
      user_id : int
          El ID del usuario para el cual se quiere predecir la calificación.

      Returns
      -------
      float
          La calificación predicha para la película especificada.
      """
      numerator = 0
      denominator = 0
      for neighbor, similarity in neighbors:
          neighbor_rating = ratings.loc[neighbor, item]
          neighbor_avg = ratings.loc[neighbor].mean()
          if neighbor_rating > 0:
              numerator += similarity * (neighbor_rating - neighbor_avg)
              denominator += abs(similarity)
      if denominator == 0:
          return 0
      user_avg = ratings.loc[user_id].mean()
      result = user_avg + (numerator / denominator)
      return result


In [23]:
def recommend_movies(test_data,md_genres):
    """
    Genera una lista de películas recomendadas para el usuario objetivo.

    Returns
    -------
    DataFrame
        Un DataFrame que contiene las películas recomendadas y sus calificaciones predichas.
    """
    results = []
    hybrid_matrix, ratings, movies = build_matrix(test_data, md_genres)


    for user_id in test_data['userId'].unique():
        neighbors = find_neighbors(hybrid_matrix, user_id)
        predicted_rating = []
        user_rows = test_data.loc[(test_data['userId'] == user_id)]

        for movieId in user_rows['movieId']:
            rate = round(predict_user_rating(user_id, ratings, movieId, neighbors), 1)
            if(rate):
                predicted_rating.append((user_id,movieId, rate))


        results.append(predicted_rating)

    return results

predictions = recommend_movies(test_data,md_genres)


In [24]:
pred = [item for sublist in predictions for item in sublist]

In [25]:
df_predictions = pd.DataFrame(pred, columns=['userId', 'movieId', 'rating'])

df_recommendation = df_predictions.loc[df_predictions['rating'] > 3]

In [26]:
numerador = 0
denominador = 0

for row in df_predictions.itertuples():
    user_id, movie_id, rating = row.userId, row.movieId, row.rating
    actual_rating = test_data[(test_data['userId'] == user_id) & (test_data['movieId'] == movie_id)]['rating'].values[0]
    numerador += abs(rating - actual_rating)
    denominador += 1

mae = numerador / denominador if denominador != 0 else 0

r_max = test_data['rating'].max()
r_min = test_data['rating'].min()

nmae = mae / (r_max - r_min)

rmse = np.sqrt(np.mean((df_predictions['rating'] - test_data['rating']) ** 2))

print(f'MAE: {mae}')
print(f'NMAE: {nmae}')
print(f'RMSE: {rmse}')

MAE: 0.9275069289835963
NMAE: 0.20611265088524364
RMSE: nan


In [27]:
test_data_recommendated = pd.merge(df_recommendation[['userId', 'movieId']], test_data, on=['userId', 'movieId'], how='inner')
valid = test_data_recommendated.loc[test_data_recommendated['rating'] > 3]

for user in valid['userId'].unique():
    print(f'User: {user}')
    hits = valid.loc[valid['userId'] == user].shape[0]
    recset = df_predictions.loc[df_predictions['userId'] == user].shape[0]
    testset = test_data.loc[test_data['userId'] == user].shape[0]

    Precision = hits / recset
    Recall = hits / testset
    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    print(f'Precision: {Precision}')
    print(f'Recall: {Recall}')
    print(f'F1: {F1}')

# hits = valid.shape[0]
# recset = df_predictions.shape[0]
# testset = train_data.shape[0]

# Precision = hits / recset
# Recall = hits / testset
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
# print(f'Precision: {Precision}')
# print(f'Recall: {Recall}')
# print(f'F1: {F1}')


User: 450
Precision: 0.4666666666666667
Recall: 0.30434782608695654
F1: 0.3684210526315789
User: 451
Precision: 0.5
Recall: 0.47058823529411764
F1: 0.48484848484848486
User: 452
Precision: 0.6597938144329897
Recall: 0.6336633663366337
F1: 0.6464646464646465
User: 453
Precision: 0.5357142857142857
Recall: 0.4340836012861736
F1: 0.4795737122557726
User: 454
Precision: 0.5641025641025641
Recall: 0.4888888888888889
F1: 0.5238095238095238
User: 455
Precision: 0.32727272727272727
Recall: 0.3157894736842105
F1: 0.3214285714285714
User: 456
Precision: 0.42857142857142855
Recall: 0.3488372093023256
F1: 0.38461538461538464
User: 457
Precision: 0.42
Recall: 0.42
F1: 0.41999999999999993
User: 458
Precision: 0.6415094339622641
Recall: 0.576271186440678
F1: 0.6071428571428571
User: 459
Precision: 0.7692307692307693
Recall: 0.7692307692307693
F1: 0.7692307692307693
User: 460
Precision: 0.8450704225352113
Recall: 0.7317073170731707
F1: 0.7843137254901961
User: 461
Precision: 0.625
Recall: 0.5555555555