#### Carga librerias

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from datetime import datetime
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
import random
from scipy.stats import pearsonr
from collections import Counter
import collections

#### Carga archivo

In [5]:
from google.colab import files
uploaded = files.upload()  # Te mostrará un botón para seleccionar el archivo Video_Games.csv

Saving Video_Games.csv to Video_Games.csv


In [6]:
df = pd.read_csv("Video_Games.csv", header=None)

df.columns = ['item_id', 'user_id', 'rating', 'timestamp']

df = df.drop('timestamp', axis=1)

print(f"Tamaño total del dataset: {df.shape}")

Tamaño total del dataset: (2565349, 3)


In [105]:
top_n_users = 100000 # Maximo de usuarios con los que queremos trabajar
min_ratings = 35   # Usuarios con al menos este número de ratings
user_counts = df['user_id'].value_counts()

dense_users = user_counts[user_counts >= min_ratings].index.tolist()

if len(dense_users) > top_n_users:
    dense_users = dense_users[:top_n_users]

print(f"Seleccionados {len(dense_users)} usuarios con al menos {min_ratings} ratings cada uno")

df_dense = df[df['user_id'].isin(dense_users)]

print(f"Tamaño del dataset filtrado: {df_dense.shape}")
print(f"Reducción del dataset: {100 * (1 - len(df_dense) / len(df)):.2f}%")

Seleccionados 1141 usuarios con al menos 35 ratings cada uno
Tamaño del dataset filtrado: (70580, 3)
Reducción del dataset: 97.25%


In [106]:
def random_split(data, test_ratio=0.2, random_state=42):
    data_shuffled = data.sample(frac=1, random_state=random_state)

    split = int(len(data) * (1 - test_ratio))

    train = data_shuffled.iloc[:split]
    test = data_shuffled.iloc[split:]

    train_users = set(train['user_id'].unique())
    train_items = set(train['item_id'].unique())
    test = test[test['user_id'].isin(train_users) & test['item_id'].isin(train_items)]

    return train, test

train_data, test_data = random_split(df_dense)
print(f"Tamaño del conjunto de entrenamiento: {train_data.shape}")
print(f"Tamaño del conjunto de prueba: {test_data.shape}")

train_users = set(train_data['user_id'].unique())
train_items = set(train_data['item_id'].unique())
common_users = len(train_users.intersection(set(test_data['user_id'].unique())))
common_items = len(train_items.intersection(set(test_data['item_id'].unique())))

print(f"Usuarios comunes en train y test: {common_users}/{len(train_users)}")
print(f"Items comunes en train y test: {common_items}/{len(train_items)}")

Tamaño del conjunto de entrenamiento: (56464, 3)
Tamaño del conjunto de prueba: (11762, 3)
Usuarios comunes en train y test: 1139/1141
Items comunes en train y test: 5962/18660


#### Creación de matrices

Mapeos para los datos de entrenamiento

In [107]:
train_users_list = train_data['user_id'].unique()
train_items_list = train_data['item_id'].unique()

user2idx = {user_id: idx for idx, user_id in enumerate(train_users_list)}
item2idx = {item_id: idx for idx, item_id in enumerate(train_items_list)}

idx2user = {idx: user for user, idx in user2idx.items()}
idx2item = {idx: item for item, idx in item2idx.items()}

NUM_USERS = len(user2idx)
NUM_ITEMS = len(item2idx)
MIN_RATING = df['rating'].min()
MAX_RATING = df['rating'].max()

print(f"Usuarios: {NUM_USERS}, Items: {NUM_ITEMS}")

Usuarios: 1141, Items: 18660


Matriz de train sparse

In [108]:
# Inicializar matriz vacía
train_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Llenar con valores sin acumular
for _, row in train_data.iterrows():
    if row['user_id'] in user2idx and row['item_id'] in item2idx:
        u_idx = user2idx[row['user_id']]
        i_idx = item2idx[row['item_id']]
        # Asignar directamente, sin acumular
        train_matrix[u_idx, i_idx] = row['rating']

# Convertir a CSR para operaciones eficientes
train_matrix = train_matrix.tocsr()

Matriz de test sparse

In [109]:
test_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Llenar con valores sin acumular
for _, row in test_data.iterrows():
    if row['user_id'] in user2idx and row['item_id'] in item2idx:
        u_idx = user2idx[row['user_id']]
        i_idx = item2idx[row['item_id']]
        # Asignar directamente, sin acumular
        test_matrix[u_idx, i_idx] = row['rating']

# Convertir a CSR para operaciones eficientes
test_matrix = test_matrix.tocsr()

Medidas

In [110]:
print(f"Forma de la matriz de train: {train_matrix.shape}")
print(f"Densidad de la matriz de train: {train_matrix.nnz / (NUM_USERS * NUM_ITEMS):.6f}")
print(f"Forma de la matriz de test: {test_matrix.shape}")
print(f"Densidad de la matriz de test: {test_matrix.nnz / (NUM_USERS * NUM_ITEMS):.6f}")

Forma de la matriz de train: (1141, 18660)
Densidad de la matriz de train: 0.002555
Forma de la matriz de test: (1141, 18660)
Densidad de la matriz de test: 0.000545


#### Cáclulo de similaridad

correlation similarity

In [111]:
def correlation_similarity(u, v):
    if u == v:
        return sp.identity(NUM_USERS, format='csr')

    u_items = set(train_matrix[u].indices)
    v_items = set(train_matrix[v].indices)
    common_items = u_items.intersection(v_items)

    if len(common_items) < 2:
        return sp.csr_matrix((1, NUM_USERS))

    u_ratings = [train_matrix[u, i] for i in common_items]
    v_ratings = [train_matrix[v, i] for i in common_items]

    avg_u = np.mean(u_ratings) if len(u_ratings) > 0 else 0
    avg_v = np.mean(v_ratings) if len(v_ratings) > 0 else 0

    num = 0
    den_u = 0
    den_v = 0

    for i in common_items:
        r_u = float(train_matrix[u, i])
        r_v = float(train_matrix[v, i])

        num += (r_u - avg_u) * (r_v - avg_v)
        den_u += (r_u - avg_u) ** 2
        den_v += (r_v - avg_v) ** 2

    if num == 0 or den_u == 0 or den_v == 0:
        return sp.csr_matrix((1, NUM_USERS))

    sim = num / (np.sqrt(den_u) * np.sqrt(den_v))

    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

jmsd similarity

In [112]:
def jmsd_similarity(u, v):
    if u == v:
        return sp.identity(NUM_USERS, format='csr')

    u_items = set(train_matrix[u].indices)
    v_items = set(train_matrix[v].indices)
    common_items = u_items.intersection(v_items)
    all_items = u_items.union(v_items)

    intersection = len(common_items)
    union = len(all_items)

    if intersection == 0:
        return sp.csr_matrix((1, NUM_USERS))

    rating_range = MAX_RATING - MIN_RATING

    diff_sum = 0
    for i in common_items:
        # Normalizamos las calificaciones
        r_u = (float(train_matrix[u, i]) - MIN_RATING) / rating_range
        r_v = (float(train_matrix[v, i]) - MIN_RATING) / rating_range

        diff_sum += (r_u - r_v) ** 2

    msd = diff_sum / intersection
    jaccard = intersection / union

    sim = jaccard * (1 - msd)

    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

Hacemos un pequeño test para comprobar que funciona.  
En la mayoria de casos da 0.0, aunque de vez en cuando encuentra usuarios con algo de similaridad.  
Esto ocurre porque nuestro dataset es bastante disperso

In [113]:
u, v = random.randint(0, NUM_USERS-1), random.randint(0, NUM_USERS-1)
corr_sim = correlation_similarity(u, v)[0, v] if u != v else 1.0
jmsd_sim = jmsd_similarity(u, v)[0, v] if u != v else 1.0
print(f"Similitud entre usuario {idx2user[u]} y {idx2user[v]}: Correlación = {corr_sim:.6f}, JMSD = {jmsd_sim:.6f}")

Similitud entre usuario A1RS06313BL6WN y AIUSVX34T9Z4O: Correlación = 0.000000, JMSD = 0.004858


#### Obtener vecinos

In [114]:
def get_neighbors(u, similarities):
    k = 25
    neighbors = []

    for v, sim in enumerate(similarities):
        if v != u and sim is not None:
            if sim.nnz > 0:
                sim_value = sim[0, v]
                neighbors.append((v, sim_value))

    neighbors.sort(key=lambda x: x[1], reverse=True)
    return neighbors[:k]

In [115]:
u = 112
similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
neighbors_corr = get_neighbors(u, similarities_corr)
neighbors_jmsd = get_neighbors(u, similarities_jmsd)
print(neighbors_corr)
print(neighbors_jmsd)

[(180, np.float64(1.0000000000000002)), (53, np.float64(1.0)), (468, np.float64(0.9999999999999998)), (923, np.float64(0.9999999999999998)), (140, np.float64(0.7559289460184546)), (710, np.float64(0.6123724356957944)), (31, np.float64(0.5000000000000001)), (535, np.float64(0.5000000000000001)), (594, np.float64(0.5000000000000001)), (674, np.float64(0.5000000000000001)), (340, np.float64(0.3614486980061246)), (301, np.float64(0.25000000000000006)), (394, np.float64(0.25)), (974, np.float64(0.06939315030888374)), (486, np.float64(-0.5000000000000001)), (179, np.float64(-0.5940885257860046)), (1051, np.float64(-0.9999999999999998))]
[(719, np.float64(0.0806686046511628)), (394, np.float64(0.06994047619047618)), (784, np.float64(0.06818181818181818)), (974, np.float64(0.053797468354430375)), (580, np.float64(0.049218750000000006)), (662, np.float64(0.04578488372093023)), (237, np.float64(0.045454545454545456)), (884, np.float64(0.043750000000000004)), (646, np.float64(0.042385057471264365

#### Estimación de las predicciones
En el caso de que no haya los suficientes datos, se devolverá la media global del item. Es un término conocido como 'fallback' que se usa en casos reales

Media del item

In [116]:
def global_average(i):
    users_who_rated = [u for u in range(NUM_USERS) if train_matrix[u, i] != 0]
    if users_who_rated:
        return sum(train_matrix[u, i] for u in users_who_rated) / len(users_who_rated)
    return 3.0

Media

In [117]:
def average_prediction(u, i, neighbors):
    count = 0
    sum_ratings = 0

    for v, _ in neighbors:
        if train_matrix[v, i] != 0:
            sum_ratings += train_matrix[v, i]
            count += 1

    if count > 0:
        return sum_ratings / count

    return global_average(i)

In [118]:
u = 67
i = 1865
predicted_corr = average_prediction(u, i, neighbors_corr)
predicted_jmsd = average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.444444444444445
4.444444444444445


Media ponderada

In [119]:
def weighted_average_prediction(u, i, neighbors):
    numerator = 0
    denominator = 0

    for v, sim in neighbors:
        if train_matrix[v, i] != 0:
            numerator += sim * train_matrix[v, i]
            denominator += abs(sim)

    if denominator > 0:
        return numerator / denominator

    return global_average(i)

In [120]:
u = 67
i = 571
predicted_corr = weighted_average_prediction(u, i, neighbors_corr)
predicted_jmsd = weighted_average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

2.9999999999999996
5.0


Media de agregación

In [121]:
def deviation_from_mean_prediction(u, i, neighbors):
    u_ratings = train_matrix[u].data
    avg_u = np.mean(u_ratings) if len(u_ratings) > 0 else global_average(i)

    sum_deviation = 0
    count = 0

    for v, _ in neighbors:
        if train_matrix[v, i] != 0:
            v_ratings = train_matrix[v].data
            avg_v = np.mean(v_ratings) if len(v_ratings) > 0 else 0

            sum_deviation += (train_matrix[v, i] - avg_v)
            count += 1

    if count > 0:
        prediction = avg_u + (sum_deviation / count)
        return prediction

    return global_average(i)

In [122]:
u = 3
i = 2716
predicted_corr = deviation_from_mean_prediction(u, i, neighbors_corr)
predicted_jmsd = deviation_from_mean_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

1.0
1.0


#### Calculo de recomendaciones

In [123]:
def get_recommendations(predictions):
    recommendations = [None for _ in range(len(predictions))]

    for n in range(len(predictions)):
        max_value = None
        item = None

        for i, value in enumerate(predictions[n]):
            if value is not None and (max_value is None or value > max_value):
                max_value = value
                item = i

        recommendations[n] = item

    return recommendations

In [124]:
N = 5 #Vamos a usar 10 usuarios para mostrar recomendaciones
M = NUM_ITEMS

test_users = random.sample(range(NUM_USERS), N)

neighbors_corr = {}
neighbors_jmsd = {}
for u in test_users:
    similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
    similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
    neighbors_corr[u] = get_neighbors(u, similarities_corr)
    neighbors_jmsd[u] = get_neighbors(u, similarities_jmsd)

In [125]:
avg_predictions_corr = [[None if train_matrix[u, i] != 0 else average_prediction(u, i, neighbors_corr[u])
                        for i in range(100)] for u in test_users]
avg_predictions_jmsd = [[None if train_matrix[u, i] != 0 else average_prediction(u, i, neighbors_jmsd[u])
                        for i in range(100)] for u in test_users]

wavg_predictions_corr = [[None if train_matrix[u, i] != 0 else weighted_average_prediction(u, i, neighbors_corr[u])
                        for i in range(100)] for u in test_users]
wavg_predictions_jmsd = [[None if train_matrix[u, i] != 0 else weighted_average_prediction(u, i, neighbors_jmsd[u])
                        for i in range(100)] for u in test_users]

dfm_predictions_corr = [[None if train_matrix[u, i] != 0 else deviation_from_mean_prediction(u, i, neighbors_corr[u])
                        for i in range(100)] for u in test_users]
dfm_predictions_jmsd = [[None if train_matrix[u, i] != 0 else deviation_from_mean_prediction(u, i, neighbors_jmsd[u])
                        for i in range(100)] for u in test_users]

In [126]:
recommendations_avg_corr = get_recommendations(avg_predictions_corr)
recommendations_avg_jmsd = get_recommendations(avg_predictions_jmsd)
recommendations_wavg_corr = get_recommendations(wavg_predictions_corr)
recommendations_wavg_jmsd = get_recommendations(wavg_predictions_jmsd)
recommendations_dfm_corr = get_recommendations(dfm_predictions_corr)
recommendations_dfm_jmsd = get_recommendations(dfm_predictions_jmsd)

print(recommendations_avg_corr)
print(recommendations_avg_jmsd)
print(recommendations_wavg_corr)
print(recommendations_wavg_jmsd)
print(recommendations_dfm_corr)
print(recommendations_dfm_jmsd)

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 7]
[1, 1, 1, 24, 6]
[4, 42, 1, 92, 96]


#### Calculo del MAE

In [127]:
def has_test_ratings(u):
    return test_matrix[u].nnz > 0

Para poder hacer las predicciones de múltiples algoritmos, vimos necesario hacer un prefiltrado de posibles vecinos en bloques de 50  
para reducir drásticamente el tiempo de ejecución (anteriormente estuvo 1 día y no terminó).  
Actualmente estamos comparando similitudes únicamente con vecinos potenciales. (90mins aprox)

In [134]:
avg_predictions_corr = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]
avg_predictions_jmsd = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]
wavg_predictions_corr = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]
wavg_predictions_jmsd = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]
dfm_predictions_corr = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]
dfm_predictions_jmsd = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]

# Rellenamos la matriz de predicciones
for u in range(NUM_USERS):
  print(f'{(u/ NUM_USERS * 100):.2f}%')
  if has_test_ratings(u):

    # Calcular similaridades
    similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
    similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]

    # Calcular vecinos
    neighbors_corr = get_neighbors(u, similarities_corr)
    neighbors_jmsd = get_neighbors(u, similarities_jmsd)

    # Calcular predicciones sobre los items de test votados por el usuario
    for i in range(NUM_ITEMS):
      if test_matrix[u, i] != 0:  # Para matrices dispersas CSR
        avg_predictions_corr[u][i] = average_prediction(u, i, neighbors_corr)
        avg_predictions_jmsd[u][i] = average_prediction(u, i, neighbors_jmsd)
        wavg_predictions_corr[u][i] = weighted_average_prediction(u, i, neighbors_corr)
        wavg_predictions_jmsd[u][i] = weighted_average_prediction(u, i, neighbors_jmsd)
        dfm_predictions_corr[u][i] = deviation_from_mean_prediction(u, i, neighbors_corr)
        dfm_predictions_jmsd[u][i] = deviation_from_mean_prediction(u, i, neighbors_jmsd)

0.00%
0.09%
0.18%
0.26%
0.35%
0.44%
0.53%
0.61%
0.70%
0.79%
0.88%
0.96%
1.05%
1.14%
1.23%
1.31%
1.40%
1.49%
1.58%
1.67%
1.75%
1.84%
1.93%
2.02%
2.10%
2.19%
2.28%
2.37%
2.45%
2.54%
2.63%
2.72%
2.80%
2.89%
2.98%
3.07%
3.16%
3.24%
3.33%
3.42%
3.51%
3.59%
3.68%
3.77%
3.86%
3.94%
4.03%
4.12%
4.21%
4.29%
4.38%
4.47%
4.56%
4.65%
4.73%
4.82%
4.91%
5.00%
5.08%
5.17%
5.26%
5.35%
5.43%
5.52%
5.61%
5.70%
5.78%
5.87%
5.96%
6.05%
6.13%
6.22%
6.31%
6.40%
6.49%
6.57%
6.66%
6.75%
6.84%
6.92%
7.01%
7.10%
7.19%
7.27%
7.36%
7.45%
7.54%
7.62%
7.71%
7.80%
7.89%
7.98%
8.06%
8.15%
8.24%
8.33%
8.41%
8.50%
8.59%
8.68%
8.76%
8.85%
8.94%
9.03%
9.11%
9.20%
9.29%
9.38%
9.47%
9.55%
9.64%
9.73%
9.82%
9.90%
9.99%
10.08%
10.17%
10.25%
10.34%
10.43%
10.52%
10.60%
10.69%
10.78%
10.87%
10.96%
11.04%
11.13%
11.22%
11.31%
11.39%
11.48%
11.57%
11.66%
11.74%
11.83%
11.92%
12.01%
12.09%
12.18%
12.27%
12.36%
12.45%
12.53%
12.62%
12.71%
12.80%
12.88%
12.97%
13.06%
13.15%
13.23%
13.32%
13.41%
13.50%
13.58%
13.67%
13.76%
13.85%
13

Ahora definimos el MAE y lo calculamos

In [135]:
def get_user_mae(u, predictions):
    mae = 0
    count = 0

    test_items = test_matrix[u].nonzero()[1]

    for i in test_items:
        if predictions[u][i] is not None:
            mae += abs(test_matrix[u, i] - predictions[u][i])
            count += 1

    if count > 0:
        return mae / count
    else:
        return None


In [136]:
def get_mae(predictions):
    mae = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_mae = get_user_mae(u, predictions)

            if user_mae is not None:
                mae += user_mae
                count += 1

    if count > 0:
        return mae / count
    else:
        return None

In [137]:
mae_avg_corr = get_mae(avg_predictions_corr)
mae_avg_jmsd = get_mae(avg_predictions_jmsd)
mae_wavg_corr = get_mae(wavg_predictions_corr)
mae_wavg_jmsd = get_mae(wavg_predictions_jmsd)
mae_dfm_corr = get_mae(dfm_predictions_corr)
mae_dfm_jmsd = get_mae(dfm_predictions_jmsd)

print(f"MAE Avg Prediction with Corr Similarity = " + str(mae_avg_corr))
print(f"MAE Avg Prediction with JMSD Similarity = " + str(mae_avg_jmsd))
print(f"MAE Weighted Avg Prediction with Corr Similarity = " + str(mae_wavg_corr))
print(f"MAE Weighted Avg Prediction with JMSD Similarity = " + str(mae_wavg_jmsd))
print(f"MAE Dev from Mean Prediction with Corr Similarity = " + str(mae_dfm_corr))
print(f"MAE Dev from Mean Prediction with JMSD Similarity = " + str(mae_dfm_jmsd))

MAE Avg Prediction with Corr Similarity = 0.9253989193716361
MAE Avg Prediction with JMSD Similarity = 0.8891381276586673
MAE Weighted Avg Prediction with Corr Similarity = 1.4951731697457744
MAE Weighted Avg Prediction with JMSD Similarity = 0.8896986542954874
MAE Dev from Mean Prediction with Corr Similarity = 0.906414319183112
MAE Dev from Mean Prediction with JMSD Similarity = 0.8714327656261254


#### Cálculo del resto de medidas
Las adaptaciones para las matrices sparse utilizadas se centran acceso eficiente a datos no nulos mediante `nonzero()`,  
reemplazando verificaciones basadas en `None` por comprobaciones de valores distintos de cero,  
y empleando pre-filtrado de vecinos para reducir drásticamente los pares de comparación

rmse

In [138]:
def get_user_rmse(u, predictions):
    mse = 0
    count = 0

    test_items = test_matrix[u].nonzero()[1]

    for i in test_items:
        if predictions[u][i] is not None:
            mse += (test_matrix[u, i] - predictions[u][i]) ** 2
            count += 1

    if count > 0:
        return math.sqrt(mse / count)
    else:
        return None

In [139]:
def get_rmse(predictions):
    rmse = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_rmse = get_user_rmse(u, predictions)

            if user_rmse is not None:
                rmse += user_rmse
                count += 1

    if count > 0:
        return rmse / count
    else:
        return None

In [140]:
rmse_avg_corr = get_rmse(avg_predictions_corr)
rmse_avg_jmsd = get_rmse(avg_predictions_jmsd)
rmse_wavg_corr = get_rmse(wavg_predictions_corr)
rmse_wavg_jmsd = get_rmse(wavg_predictions_jmsd)
rmse_dfm_corr = get_rmse(dfm_predictions_corr)
rmse_dfm_jmsd = get_rmse(dfm_predictions_jmsd)

print(f"RMSE Avg Prediction with Corr Similarity = " + str(rmse_avg_corr))
print(f"RMSE Avg Prediction with JMSD Similarity = " + str(rmse_avg_jmsd))
print(f"RMSE Weighted Avg Prediction with Corr Similarity = " + str(rmse_wavg_corr))
print(f"RMSE Weighted Avg Prediction with JMSD Similarity = " + str(rmse_wavg_jmsd))
print(f"RMSE Dev from Mean Prediction with Corr Similarity = " + str(rmse_dfm_corr))
print(f"RMSE Dev from Mean Prediction with JMSD Similarity = " + str(rmse_dfm_jmsd))

RMSE Avg Prediction with Corr Similarity = 1.2068751519805172
RMSE Avg Prediction with JMSD Similarity = 1.1863055385249233
RMSE Weighted Avg Prediction with Corr Similarity = 2.085948591788406
RMSE Weighted Avg Prediction with JMSD Similarity = 1.186703353540928
RMSE Dev from Mean Prediction with Corr Similarity = 1.1574648409835016
RMSE Dev from Mean Prediction with JMSD Similarity = 1.118176263338421


Precision

In [141]:
theta = 4

In [142]:
def get_user_precision(u, predictions):
    precision = 0
    count = 0

    user_predictions = predictions[u]
    recommendations = get_recommendations([user_predictions])[0]

    if recommendations is None:
        return None

    if not isinstance(recommendations, list):
        recommendations = [recommendations]

    for i in recommendations:
        if i is not None and test_matrix[u, i] != 0:
            precision += 1 if test_matrix[u, i] >= theta else 0
            count += 1

    if count > 0:
        return precision / count
    else:
        return None

In [143]:
def get_precision(predictions):
    precision = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_precision = get_user_precision(u, predictions)

            if user_precision is not None:
                precision += user_precision
                count += 1

    if count > 0:
        return precision / count
    else:
        return None

In [144]:
precision_avg_corr = get_precision(avg_predictions_corr)
precision_avg_jmsd = get_precision(avg_predictions_jmsd)
precision_wavg_corr = get_precision(wavg_predictions_corr)
precision_wavg_jmsd = get_precision(wavg_predictions_jmsd)
precision_dfm_corr = get_precision(dfm_predictions_corr)
precision_dfm_jmsd = get_precision(dfm_predictions_jmsd)

print(f"Precision Avg Prediction with Corr Similarity = " + str(precision_avg_corr))
print(f"Precision Avg Prediction with JMSD Similarity = " + str(precision_avg_jmsd))
print(f"Precision Weighted Avg Prediction with Corr Similarity = " + str(precision_wavg_corr))
print(f"Precision Weighted Avg Prediction with JMSD Similarity = " + str(precision_wavg_jmsd))
print(f"Precision Dev from Mean Prediction with Corr Similarity = " + str(precision_dfm_corr))
print(f"Precision Dev from Mean Prediction with JMSD Similarity = " + str(precision_dfm_jmsd))

Precision Avg Prediction with Corr Similarity = 0.8366988586479368
Precision Avg Prediction with JMSD Similarity = 0.8419666374012291
Precision Weighted Avg Prediction with Corr Similarity = 0.845478489903424
Precision Weighted Avg Prediction with JMSD Similarity = 0.839332748024583
Precision Dev from Mean Prediction with Corr Similarity = 0.839332748024583
Precision Dev from Mean Prediction with JMSD Similarity = 0.835820895522388


Recall

In [145]:
def get_user_recall(u, predictions):
    recall = 0
    count = 0

    recommendations = get_recommendations([predictions[u]])[0]

    if recommendations is not None and not isinstance(recommendations, list):
        recommendations = [recommendations]

    test_items = test_matrix[u].nonzero()[1]

    for i in test_items:
        if test_matrix[u, i] >= theta:
            recall += 1 if recommendations is not None and i in recommendations else 0
            count += 1

    if count > 0:
        return recall / count
    else:
        return None

def get_recall(predictions):
    recall = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_recall = get_user_recall(u, predictions)

            if user_recall is not None:
                recall += user_recall
                count += 1

    if count > 0:
        return recall / count
    else:
        return None

In [146]:
recall_avg_corr = get_recall(avg_predictions_corr)
recall_avg_jmsd = get_recall(avg_predictions_jmsd)
recall_wavg_corr = get_recall(wavg_predictions_corr)
recall_wavg_jmsd = get_recall(wavg_predictions_jmsd)
recall_dfm_corr = get_recall(dfm_predictions_corr)
recall_dfm_jmsd = get_recall(dfm_predictions_jmsd)

print(f"Recall Avg Prediction with Corr Similarity = " + str(recall_avg_corr))
print(f"Recall Avg Prediction with JMSD Similarity = " + str(recall_avg_jmsd))
print(f"Recall Weighted Avg Prediction with Corr Similarity = " + str(recall_wavg_corr))
print(f"Recall Weighted Avg Prediction with JMSD Similarity = " + str(recall_wavg_jmsd))
print(f"Recall Dev from Mean Prediction with Corr Similarity = " + str(recall_dfm_corr))
print(f"Recall Dev from Mean Prediction with JMSD Similarity = " + str(recall_dfm_jmsd))

Recall Avg Prediction with Corr Similarity = 0.15236202588080855
Recall Avg Prediction with JMSD Similarity = 0.15269808356298617
Recall Weighted Avg Prediction with Corr Similarity = 0.15308404722338936
Recall Weighted Avg Prediction with JMSD Similarity = 0.15239471908502739
Recall Dev from Mean Prediction with Corr Similarity = 0.15316387844142437
Recall Dev from Mean Prediction with JMSD Similarity = 0.15337196490913357


F1

In [147]:
def get_user_f1(u, predictions):
    precision = get_user_precision(u, predictions)
    recall = get_user_recall(u, predictions)

    if precision is None or recall is None:
        return None
    elif precision == 0 and recall == 0:
        return 0
    else:
        return 2 * precision * recall / (precision + recall)

In [148]:
def get_f1(predictions):
    f1 = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_f1 = get_user_f1(u, predictions)

            if user_f1 is not None:
                f1 += user_f1
                count += 1

    if count > 0:
        return f1 / count
    else:
        return None

In [149]:
f1_avg_corr = get_f1(avg_predictions_corr)
f1_avg_jmsd = get_f1(avg_predictions_jmsd)
f1_wavg_corr = get_f1(wavg_predictions_corr)
f1_wavg_jmsd = get_f1(wavg_predictions_jmsd)
f1_dfm_corr = get_f1(dfm_predictions_corr)
f1_dfm_jmsd = get_f1(dfm_predictions_jmsd)

print(f"F1 Avg Prediction with Corr Similarity = " + str(f1_avg_corr))
print(f"F1 Avg Prediction with JMSD Similarity = " + str(f1_avg_jmsd))
print(f"F1 Weighted Avg Prediction with Corr Similarity = " + str(f1_wavg_corr))
print(f"F1 Weighted Avg Prediction with JMSD Similarity = " + str(f1_wavg_jmsd))
print(f"F1 Dev from Mean Prediction with Corr Similarity = " + str(f1_dfm_corr))
print(f"F1 Dev from Mean Prediction with JMSD Similarity = " + str(f1_dfm_jmsd))

F1 Avg Prediction with Corr Similarity = 0.24416223386263353
F1 Avg Prediction with JMSD Similarity = 0.24520824339893638
F1 Weighted Avg Prediction with Corr Similarity = 0.2460613954864842
F1 Weighted Avg Prediction with JMSD Similarity = 0.24472866824270453
F1 Dev from Mean Prediction with Corr Similarity = 0.2450653731201427
F1 Dev from Mean Prediction with JMSD Similarity = 0.2453404028500156


nDCG

In [150]:
def get_sorted_test_items(u):
    test_items = test_matrix[u].nonzero()[1]
    items = []

    if len(test_items) == 0:
        return items

    for i in test_items:
        items.append(i)

    items.sort(key=lambda x: test_matrix[u, x], reverse=True)

    return items

In [151]:
def get_user_idcg(u):
    items = get_sorted_test_items(u)
    idcg = 0

    for pos, i in enumerate(items):
        if test_matrix[u, i] >= theta:
            idcg += (2 ** test_matrix[u, i] - 1) / math.log2(pos + 2)

    return idcg

In [152]:
def get_user_dcg(u, recommendations):
    dcg = 0

    if recommendations is None:
        return 0

    if not isinstance(recommendations, list):
        recommendations = [recommendations]

    for pos, i in enumerate(recommendations):
        if i is not None and test_matrix[u, i] != 0:
            dcg += (2 ** test_matrix[u, i] - 1) / math.log2(pos + 2)

    return dcg

In [153]:
def get_user_ndcg(u, predictions):
    recommendations = get_recommendations([predictions[u]])[0]

    dcg = get_user_dcg(u, recommendations)
    idcg = get_user_idcg(u)

    if idcg == 0:
        return None
    else:
        return dcg / idcg

In [154]:
def get_ndcg(predictions):
    ndcg = 0
    count = 0

    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_ndcg = get_user_ndcg(u, predictions)

            if user_ndcg is not None:
                ndcg += user_ndcg
                count += 1

    if count > 0:
        return ndcg / count
    else:
        return None

In [155]:
ndcg_avg_corr = get_ndcg(avg_predictions_corr)
ndcg_avg_jmsd = get_ndcg(avg_predictions_jmsd)
ndcg_wavg_corr = get_ndcg(wavg_predictions_corr)
ndcg_wavg_jmsd = get_ndcg(wavg_predictions_jmsd)
ndcg_dfm_corr = get_ndcg(dfm_predictions_corr)
ndcg_dfm_jmsd = get_ndcg(dfm_predictions_jmsd)

print(f"nDCG Avg Prediction with Corr Similarity = " + str(ndcg_avg_corr))
print(f"nDCG Avg Prediction with JMSD Similarity = " + str(ndcg_avg_jmsd))
print(f"nDCG Weighted Avg Prediction with Corr Similarity = " + str(ndcg_wavg_corr))
print(f"nDCG Weighted Avg Prediction with JMSD Similarity = " + str(ndcg_wavg_jmsd))
print(f"nDCG Dev from Mean Prediction with Corr Similarity = " + str(ndcg_dfm_corr))
print(f"nDCG Dev from Mean Prediction with JMSD Similarity = " + str(ndcg_dfm_jmsd))

nDCG Avg Prediction with Corr Similarity = 0.26908334391304933
nDCG Avg Prediction with JMSD Similarity = 0.2692377818839883
nDCG Weighted Avg Prediction with Corr Similarity = 0.2698495886034852
nDCG Weighted Avg Prediction with JMSD Similarity = 0.26909344337642344
nDCG Dev from Mean Prediction with Corr Similarity = 0.2675613909560734
nDCG Dev from Mean Prediction with JMSD Similarity = 0.26945522485491585
