#### Cargamos las librerías necesarias

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from datetime import datetime
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
import random
from scipy.stats import pearsonr
from collections import Counter
import collections

#### Cargamos el archivo

In [None]:
# Colab-only step: prompts for a local file and keeps the upload result in `uploaded`.
from google.colab import files
uploaded = files.upload()  # Shows a button to select the Video_Games.csv file

Saving Video_Games.csv to Video_Games.csv


In [None]:
# Read the header-less ratings CSV, label its four columns and discard the
# timestamp column before reporting the resulting shape.
raw_columns = ['item_id', 'user_id', 'rating', 'timestamp']
df = (
    pd.read_csv("Video_Games.csv", header=None)
    .set_axis(raw_columns, axis=1)
    .drop(columns='timestamp')
)

print(f"Tamaño total del dataset: {df.shape}")

Tamaño total del dataset: (2565349, 3)


El método kNN es el que más problemas nos ha dado en cuanto a tiempos de ejecución; por tanto, vamos a filtrar el dataset para quedarnos con los usuarios que tengan como mínimo 30 ratings, hasta un máximo de 10000 usuarios.

In [None]:
top_n_users = 10000  # Upper bound on the number of users we keep
min_ratings = 30     # A user needs at least this many ratings to be kept

# value_counts() orders users by descending number of ratings, so truncating
# the list keeps the most active users when the cap is exceeded.
user_counts = df['user_id'].value_counts()
dense_users = (
    user_counts
    .loc[lambda counts: counts >= min_ratings]
    .index
    .tolist()[:top_n_users]
)

print(f"Seleccionados {len(dense_users)} usuarios con al menos {min_ratings} ratings cada uno")

keep_row = df['user_id'].isin(dense_users)
df_dense = df[keep_row]

print(f"Tamaño del dataset filtrado: {df_dense.shape}")

Seleccionados 1579 usuarios con al menos 30 ratings cada uno
Tamaño del dataset filtrado: (84541, 3)


Hacemos una función que garantice la presencia tanto de usuarios como de items de test al menos una vez en train

In [None]:
def random_split(data, test_ratio=0.2, random_state=42):
    """Shuffle `data` and split it into a train and a test DataFrame.

    The first ``int(len(data) * (1 - test_ratio))`` shuffled rows form the
    training set. The remaining rows form the test set, from which every row
    whose user or item does not appear in train is dropped.

    Args:
        data: DataFrame with at least 'user_id' and 'item_id' columns.
        test_ratio: fraction of rows held out before filtering.
        random_state: seed passed to ``DataFrame.sample``.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    shuffled = data.sample(frac=1, random_state=random_state)
    cut = int(len(data) * (1 - test_ratio))
    train, held_out = shuffled.iloc[:cut], shuffled.iloc[cut:]

    # Only keep held-out rows whose user AND item are known from training.
    known_users = set(train['user_id'].unique())
    known_items = set(train['item_id'].unique())
    seen_in_train = held_out['user_id'].isin(known_users) & held_out['item_id'].isin(known_items)

    return train, held_out[seen_in_train]

train_data, test_data = random_split(df_dense)
print(f"Tamaño del conjunto de entrenamiento: {train_data.shape}")
print(f"Tamaño del conjunto de prueba: {test_data.shape}")

# Count how many test users/items also appear in train ...
train_users = set(train_data['user_id'].unique())
train_items = set(train_data['item_id'].unique())
common_users = len(train_users.intersection(set(test_data['user_id'].unique())))
common_items = len(train_items.intersection(set(test_data['item_id'].unique())))

# ... and actually verify the guarantee instead of discarding the counts:
# every user and every item of the test set must be present in train.
assert common_users == test_data['user_id'].nunique(), "test contains users unseen in train"
assert common_items == test_data['item_id'].nunique(), "test contains items unseen in train"

Tamaño del conjunto de entrenamiento: (67632, 3)
Tamaño del conjunto de prueba: (14478, 3)


#### Creación de matrices

Mapeos de IDs para no perder información

In [None]:
# Distinct ids in order of first appearance in the training data.
train_users_list = train_data['user_id'].unique()
train_items_list = train_data['item_id'].unique()

# Matrix position -> original id ...
idx2user = dict(enumerate(train_users_list))
idx2item = dict(enumerate(train_items_list))

# ... and the inverse lookups, original id -> matrix position.
user2idx = {user_id: idx for idx, user_id in idx2user.items()}
item2idx = {item_id: idx for idx, item_id in idx2item.items()}

NUM_USERS, NUM_ITEMS = len(user2idx), len(item2idx)

# Rating bounds are taken from the full, unfiltered dataset.
MIN_RATING, MAX_RATING = df['rating'].min(), df['rating'].max()

print(f"Usuarios: {NUM_USERS}, Items: {NUM_ITEMS}")

Usuarios: 1579, Items: 20340


Matriz de train sparse

In [None]:
# LIL format is used while filling because it supports cheap item assignment.
train_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Iterate the three columns directly instead of DataFrame.iterrows(), which
# builds a Series per row and is far slower. Rows are visited in the same
# order, so a repeated (user, item) pair still keeps its last rating.
for user_id, item_id, rating in zip(train_data['user_id'],
                                    train_data['item_id'],
                                    train_data['rating']):
    u_idx = user2idx.get(user_id)
    i_idx = item2idx.get(item_id)
    if u_idx is not None and i_idx is not None:
        train_matrix[u_idx, i_idx] = rating

# Convert to CSR, which is more efficient for the row access done afterwards
train_matrix = train_matrix.tocsr()

Matriz de test sparse

In [None]:
# Same layout as the training matrix: rows are user indices, columns item indices.
test_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Iterate the three columns directly instead of DataFrame.iterrows(), which
# builds a Series per row and is far slower. Rows whose user or item has no
# index in the training mappings are skipped, as before.
for user_id, item_id, rating in zip(test_data['user_id'],
                                    test_data['item_id'],
                                    test_data['rating']):
    u_idx = user2idx.get(user_id)
    i_idx = item2idx.get(item_id)
    if u_idx is not None and i_idx is not None:
        test_matrix[u_idx, i_idx] = rating

# Convert to CSR for efficient row access
test_matrix = test_matrix.tocsr()

#### Cálculo de similaridad

Adaptamos las funciones de similaridad implementadas en clase a nuestro caso

Correlation similarity

In [None]:
def correlation_similarity(u, v):
    """Pearson correlation between users `u` and `v` over their co-rated items.

    Means and deviations are computed only over the items both users have a
    stored rating for in ``train_matrix``.

    Args:
        u: row index of the first user in ``train_matrix``.
        v: row index of the second user in ``train_matrix``.

    Returns:
        A 1 x NUM_USERS CSR matrix. The similarity is stored at column `v`.
        The row is empty (no stored entry) when the users share fewer than
        two items, or when the numerator or either variance term is zero.
    """
    if u == v:
        # Fixed: this used to return a NUM_USERS x NUM_USERS identity matrix,
        # whose shape disagrees with every other return path and whose [0, v]
        # entry is 0 unless v == 0. Return the same row layout with sim = 1.
        return sp.csr_matrix(([1.0], ([0], [v])), shape=(1, NUM_USERS))

    row_u = train_matrix[u]
    row_v = train_matrix[v]
    common_items = set(row_u.indices).intersection(set(row_v.indices))

    if len(common_items) < 2:
        return sp.csr_matrix((1, NUM_USERS))

    # Read each row once instead of doing a sparse lookup per (user, item).
    ratings_u = dict(zip(row_u.indices, row_u.data))
    ratings_v = dict(zip(row_v.indices, row_v.data))
    u_ratings = [float(ratings_u[i]) for i in common_items]
    v_ratings = [float(ratings_v[i]) for i in common_items]

    avg_u = np.mean(u_ratings)
    avg_v = np.mean(v_ratings)

    num = 0
    den_u = 0
    den_v = 0
    for r_u, r_v in zip(u_ratings, v_ratings):
        num += (r_u - avg_u) * (r_v - avg_v)
        den_u += (r_u - avg_u) ** 2
        den_v += (r_v - avg_v) ** 2

    # A zero numerator is reported as "no similarity"; zero variance would
    # make the correlation undefined (division by zero).
    if num == 0 or den_u == 0 or den_v == 0:
        return sp.csr_matrix((1, NUM_USERS))

    sim = num / (np.sqrt(den_u) * np.sqrt(den_v))

    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

jmsd similarity

In [None]:
def jmsd_similarity(u, v):
    """JMSD similarity between users `u` and `v`: Jaccard * (1 - MSD).

    Jaccard is the ratio of co-rated items to items rated by either user.
    MSD is the mean squared difference of the co-rated ratings after scaling
    them to [0, 1] with MIN_RATING and MAX_RATING.

    Args:
        u: row index of the first user in ``train_matrix``.
        v: row index of the second user in ``train_matrix``.

    Returns:
        A 1 x NUM_USERS CSR matrix with the similarity stored at column `v`,
        or an empty row when the users have no item in common.
    """
    if u == v:
        # Fixed: this used to return a NUM_USERS x NUM_USERS identity matrix,
        # whose shape disagrees with every other return path and whose [0, v]
        # entry is 0 unless v == 0. Return the same row layout with sim = 1.
        return sp.csr_matrix(([1.0], ([0], [v])), shape=(1, NUM_USERS))

    row_u = train_matrix[u]
    row_v = train_matrix[v]
    u_items = set(row_u.indices)
    v_items = set(row_v.indices)
    common_items = u_items.intersection(v_items)

    intersection = len(common_items)
    union = len(u_items.union(v_items))

    if intersection == 0:
        return sp.csr_matrix((1, NUM_USERS))

    # Read each row once instead of doing a sparse lookup per (user, item).
    ratings_u = dict(zip(row_u.indices, row_u.data))
    ratings_v = dict(zip(row_v.indices, row_v.data))

    rating_range = MAX_RATING - MIN_RATING

    diff_sum = 0
    for i in common_items:
        # Scale both ratings to [0, 1] before comparing them
        r_u = (float(ratings_u[i]) - MIN_RATING) / rating_range
        r_v = (float(ratings_v[i]) - MIN_RATING) / rating_range

        diff_sum += (r_u - r_v) ** 2

    msd = diff_sum / intersection
    jaccard = intersection / union

    sim = jaccard * (1 - msd)

    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

Hacemos un pequeño test para comprobar que funciona.  
En la mayoría de los casos da 0.0, aunque de vez en cuando encuentra usuarios con algo de similaridad.  
Esto ocurre porque nuestro dataset es bastante disperso.

In [None]:
# Pick two user indices at random (no seed is set, so the pair changes on every run).
u, v = random.randint(0, NUM_USERS-1), random.randint(0, NUM_USERS-1)
# The u == v case is short-circuited to 1.0 here rather than read from the returned matrix.
corr_sim = correlation_similarity(u, v)[0, v] if u != v else 1.0
jmsd_sim = jmsd_similarity(u, v)[0, v] if u != v else 1.0
print(f"Similitud entre usuario {idx2user[u]} y {idx2user[v]}: Correlación = {corr_sim:.6f}, JMSD = {jmsd_sim:.6f}")

Similitud entre usuario A3HLAESZGUFBAV y AREYISJYIH9L3: Correlación = 0.000000, JMSD = 0.037162


#### Obtener vecinos

In [None]:
def get_neighbors(u, similarities, k=25):
    """Return the `k` users most similar to user `u`.

    Args:
        u: index of the target user; it is never returned as its own neighbor.
        similarities: sequence indexed by user, where entry `v` is either
            None or a 1 x N sparse row holding the similarity at column `v`.
            Entries that are None or have no stored value are skipped.
        k: maximum number of neighbors to return. Defaults to 25, the value
            that was previously hard-coded.

    Returns:
        List of ``(user_index, similarity)`` tuples sorted by decreasing
        similarity, with at most `k` elements. Negative similarities are not
        filtered out.
    """
    candidates = []

    for v, sim in enumerate(similarities):
        if v == u or sim is None or sim.nnz == 0:
            continue
        candidates.append((v, sim[0, v]))

    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:k]

In [None]:
# Demo: compute both similarity rows for one fixed user and list its neighbors.
u = 12
# One entry per candidate user v; the user's own slot is None so get_neighbors skips it.
similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
neighbors_corr = get_neighbors(u, similarities_corr)
neighbors_jmsd = get_neighbors(u, similarities_jmsd)
print(neighbors_corr)
print(neighbors_jmsd)

[(302, np.float64(1.0)), (523, np.float64(1.0)), (1438, np.float64(1.0)), (1299, np.float64(0.9999999999999998)), (802, np.float64(-0.30151134457776363)), (536, np.float64(-0.9999999999999998))]
[(1366, np.float64(0.14893617021276595)), (1085, np.float64(0.07407407407407407)), (1564, np.float64(0.07352941176470588)), (728, np.float64(0.06976744186046512)), (295, np.float64(0.057692307692307696)), (824, np.float64(0.05263157894736842)), (1438, np.float64(0.04662698412698412)), (345, np.float64(0.045454545454545456)), (981, np.float64(0.044444444444444446)), (1556, np.float64(0.043478260869565216)), (778, np.float64(0.04196428571428571)), (304, np.float64(0.041666666666666664)), (798, np.float64(0.041666666666666664)), (57, np.float64(0.04)), (802, np.float64(0.036989795918367346)), (560, np.float64(0.03529411764705882)), (14, np.float64(0.03508771929824561)), (1027, np.float64(0.034482758620689655)), (326, np.float64(0.03409090909090909)), (458, np.float64(0.03076923076923077)), (52, np

#### Estimación de las predicciones
En el caso de que no haya suficientes datos, se devolverá la media global del ítem. Este mecanismo se conoce como 'fallback' y se usa en sistemas reales.

Media del item

In [None]:
def global_average(i):
    """Mean training rating of item `i`, used as the fallback prediction.

    Args:
        i: column index of the item in ``train_matrix``.

    Returns:
        The mean of the non-zero ratings in column `i`, or 3.0 when the item
        has no rating in the training matrix.
    """
    # One column slice replaces NUM_USERS single-element sparse lookups.
    item_ratings = train_matrix[:, i].data
    item_ratings = item_ratings[item_ratings != 0]
    if item_ratings.size:
        return item_ratings.mean()
    return 3.0

Media

In [None]:
def average_prediction(u, i, neighbors):
    """Predict the rating of item `i` as the plain mean of the neighbors' ratings.

    Args:
        u: target user index (not used by this aggregation).
        i: item column index in ``train_matrix``.
        neighbors: iterable of ``(user_index, similarity)`` tuples.

    Returns:
        Mean rating of `i` among the neighbors that rated it, or
        ``global_average(i)`` when none of them did.
    """
    neighbor_ratings = [train_matrix[v, i] for v, _ in neighbors]
    rated = [rating for rating in neighbor_ratings if rating != 0]

    if not rated:
        return global_average(i)

    return sum(rated) / len(rated)

In [None]:
# neighbors_corr / neighbors_jmsd were computed for user 12 in the neighbor
# demo cell, so the prediction has to target that same user. The cell used to
# set u = 1 while still passing user 12's neighbors.
u = 12
i = 14
predicted_corr = average_prediction(u, i, neighbors_corr)
predicted_jmsd = average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

3.6666666666666665
5.0


Media ponderada

In [None]:
def weighted_average_prediction(u, i, neighbors):
    """Predict the rating of item `i` as a similarity-weighted mean.

    Only neighbors with a strictly positive similarity contribute. A negative
    weight in a plain weighted mean pushes the result outside the rating
    scale (one neighbor with sim = -1 and rating 5 used to yield -5), and
    correlation similarity does produce negative values.

    Args:
        u: target user index (not used by this aggregation).
        i: item column index in ``train_matrix``.
        neighbors: iterable of ``(user_index, similarity)`` tuples.

    Returns:
        Weighted mean of the ratings given to `i` by positively similar
        neighbors, or ``global_average(i)`` when no such neighbor rated it.
    """
    numerator = 0
    denominator = 0

    for v, sim in neighbors:
        if sim <= 0:
            continue
        rating = train_matrix[v, i]
        if rating != 0:
            numerator += sim * rating
            denominator += sim

    if denominator > 0:
        return numerator / denominator

    return global_average(i)

In [None]:
# neighbors_corr / neighbors_jmsd were computed for user 12 in the neighbor
# demo cell, so the prediction has to target that same user. The cell used to
# set u = 1 while still passing user 12's neighbors.
u = 12
i = 26
predicted_corr = weighted_average_prediction(u, i, neighbors_corr)
predicted_jmsd = weighted_average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.2
5.0


Media de agregación

In [None]:
def deviation_from_mean_prediction(u, i, neighbors):
    """Predict the rating of item `i` as the user's mean plus the neighbors' mean deviation.

    Each neighbor that rated `i` contributes the difference between that
    rating and the neighbor's own mean training rating.

    Args:
        u: target user index; its mean training rating is the baseline.
        i: item column index in ``train_matrix``.
        neighbors: iterable of ``(user_index, similarity)`` tuples.

    Returns:
        Baseline plus average deviation, or ``global_average(i)`` when no
        neighbor rated the item. The result is not clipped to the rating scale.
    """
    own_ratings = train_matrix[u].data
    # A user without training ratings falls back to the item average as baseline.
    avg_u = np.mean(own_ratings) if len(own_ratings) > 0 else global_average(i)

    deviations = []
    for v, _ in neighbors:
        rating = train_matrix[v, i]
        if rating == 0:
            continue
        neighbor_ratings = train_matrix[v].data
        avg_v = np.mean(neighbor_ratings) if len(neighbor_ratings) > 0 else 0
        deviations.append(rating - avg_v)

    if not deviations:
        return global_average(i)

    return avg_u + (sum(deviations) / len(deviations))

In [None]:
# neighbors_corr / neighbors_jmsd were computed for user 12 in the neighbor
# demo cell, so the prediction has to target that same user. With u = 1 the
# baseline mean came from user 1 while the deviations came from user 12's
# neighbors.
u = 12
i = 32
predicted_corr = deviation_from_mean_prediction(u, i, neighbors_corr)
predicted_jmsd = deviation_from_mean_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.44
4.854804448331876


#### Cálculo de recomendaciones

In [None]:
def get_recommendations(predictions):
    """Pick, for every row of predictions, the position of its highest value.

    Args:
        predictions: list of rows; each row is a list of predicted ratings in
            which None marks "no prediction".

    Returns:
        A list with one entry per row: the index of the largest non-None
        value (the first one on ties), or None when the row has no
        prediction at all.
    """
    def best_position(row):
        scored = [(pos, value) for pos, value in enumerate(row) if value is not None]
        if not scored:
            return None
        # max() keeps the first maximal element, matching a strict ">" scan.
        return max(scored, key=lambda pair: pair[1])[0]

    return [best_position(row) for row in predictions]

In [None]:
N = 5 # We use 5 users to showcase recommendations
M = NUM_ITEMS

# Random sample of user indices (no seed is set, so it differs between runs).
test_users = random.sample(range(NUM_USERS), N)

# From here on neighbors_corr / neighbors_jmsd are dicts keyed by user index.
neighbors_corr = {}
neighbors_jmsd = {}
for u in test_users:
    similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
    similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
    neighbors_corr[u] = get_neighbors(u, similarities_corr)
    neighbors_jmsd[u] = get_neighbors(u, similarities_jmsd)

Decidimos hacer la recomendación sobre los 100 primeros items, ya que lo considerábamos una muestra aceptable; con todos los items (unos 20.000) el cálculo nos resultaba inviable.

In [None]:
# Number of item columns scored for the demo. It was a literal 100 repeated in
# six comprehensions; it is now named and bounded by the real number of items.
N_SAMPLE_ITEMS = min(100, NUM_ITEMS)


def _predict_sample(predict, neighbors_by_user):
    """Score the first N_SAMPLE_ITEMS items for every user in `test_users`.

    Args:
        predict: prediction function called as ``predict(u, i, neighbors)``.
        neighbors_by_user: dict mapping a user index to its neighbor list.

    Returns:
        One row per user of `test_users`, in order. An entry is None for
        items the user already rated in ``train_matrix``.
    """
    return [[None if train_matrix[u, i] != 0 else predict(u, i, neighbors_by_user[u])
             for i in range(N_SAMPLE_ITEMS)] for u in test_users]


avg_predictions_corr = _predict_sample(average_prediction, neighbors_corr)
avg_predictions_jmsd = _predict_sample(average_prediction, neighbors_jmsd)

wavg_predictions_corr = _predict_sample(weighted_average_prediction, neighbors_corr)
wavg_predictions_jmsd = _predict_sample(weighted_average_prediction, neighbors_jmsd)

dfm_predictions_corr = _predict_sample(deviation_from_mean_prediction, neighbors_corr)
dfm_predictions_jmsd = _predict_sample(deviation_from_mean_prediction, neighbors_jmsd)

In [None]:
# One recommended item position per sampled user, for each prediction/similarity pair.
recommendations_avg_corr = get_recommendations(avg_predictions_corr)
recommendations_avg_jmsd = get_recommendations(avg_predictions_jmsd)
recommendations_wavg_corr = get_recommendations(wavg_predictions_corr)
recommendations_wavg_jmsd = get_recommendations(wavg_predictions_jmsd)
recommendations_dfm_corr = get_recommendations(dfm_predictions_corr)
recommendations_dfm_jmsd = get_recommendations(dfm_predictions_jmsd)

# Each printed list is ordered like the rows of the prediction tables passed in.
print(recommendations_avg_corr)
print(recommendations_avg_jmsd)
print(recommendations_wavg_corr)
print(recommendations_wavg_jmsd)
print(recommendations_dfm_corr)
print(recommendations_dfm_jmsd)

[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 28, 0, 55]
[0, 54, 25, 86, 90]
[0, 41, 0, 10, 90]


#### Cálculo del MAE

In [None]:
def has_test_ratings(u):
    """Tell whether row `u` of ``test_matrix`` holds at least one stored rating."""
    user_row = test_matrix[u]
    return user_row.nnz != 0

In [None]:
def _empty_predictions():
    """Return a NUM_USERS x NUM_ITEMS list-of-lists filled with None."""
    return [[None] * NUM_ITEMS for _ in range(NUM_USERS)]


avg_predictions_corr = _empty_predictions()
avg_predictions_jmsd = _empty_predictions()
wavg_predictions_corr = _empty_predictions()
wavg_predictions_jmsd = _empty_predictions()
dfm_predictions_corr = _empty_predictions()
dfm_predictions_jmsd = _empty_predictions()

# Report progress periodically instead of printing one line per user.
PROGRESS_EVERY = 100

for u in range(NUM_USERS):
    if u % PROGRESS_EVERY == 0:
        print(f'{(u/ NUM_USERS * 100):.2f}%')
    if not has_test_ratings(u):
        continue

    similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
    similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]

    # Note: from here on neighbors_corr / neighbors_jmsd are plain lists for
    # the current user, no longer dicts keyed by user.
    neighbors_corr = get_neighbors(u, similarities_corr)
    neighbors_jmsd = get_neighbors(u, similarities_jmsd)

    # Visit only the items this user rated in test, instead of probing every
    # one of the NUM_ITEMS columns with a sparse lookup.
    for i in test_matrix[u].nonzero()[1]:
        avg_predictions_corr[u][i] = average_prediction(u, i, neighbors_corr)
        avg_predictions_jmsd[u][i] = average_prediction(u, i, neighbors_jmsd)
        wavg_predictions_corr[u][i] = weighted_average_prediction(u, i, neighbors_corr)
        wavg_predictions_jmsd[u][i] = weighted_average_prediction(u, i, neighbors_jmsd)
        dfm_predictions_corr[u][i] = deviation_from_mean_prediction(u, i, neighbors_corr)
        dfm_predictions_jmsd[u][i] = deviation_from_mean_prediction(u, i, neighbors_jmsd)

print('100.00%')

0.00%
0.06%
0.13%
0.19%
0.25%
0.32%
0.38%
0.44%
0.51%
0.57%
0.63%
0.70%
0.76%
0.82%
0.89%
0.95%
1.01%
1.08%
1.14%
1.20%
1.27%
1.33%
1.39%
1.46%
1.52%
1.58%
1.65%
1.71%
1.77%
1.84%
1.90%
1.96%
2.03%
2.09%
2.15%
2.22%
2.28%
2.34%
2.41%
2.47%
2.53%
2.60%
2.66%
2.72%
2.79%
2.85%
2.91%
2.98%
3.04%
3.10%
3.17%
3.23%
3.29%
3.36%
3.42%
3.48%
3.55%
3.61%
3.67%
3.74%
3.80%
3.86%
3.93%
3.99%
4.05%
4.12%
4.18%
4.24%
4.31%
4.37%
4.43%
4.50%
4.56%
4.62%
4.69%
4.75%
4.81%
4.88%
4.94%
5.00%
5.07%
5.13%
5.19%
5.26%
5.32%
5.38%
5.45%
5.51%
5.57%
5.64%
5.70%
5.76%
5.83%
5.89%
5.95%
6.02%
6.08%
6.14%
6.21%
6.27%
6.33%
6.40%
6.46%
6.52%
6.59%
6.65%
6.71%
6.78%
6.84%
6.90%
6.97%
7.03%
7.09%
7.16%
7.22%
7.28%
7.35%
7.41%
7.47%
7.54%
7.60%
7.66%
7.73%
7.79%
7.85%
7.92%
7.98%
8.04%
8.11%
8.17%
8.23%
8.30%
8.36%
8.42%
8.49%
8.55%
8.61%
8.68%
8.74%
8.80%
8.87%
8.93%
8.99%
9.06%
9.12%
9.18%
9.25%
9.31%
9.37%
9.44%
9.50%
9.56%
9.63%
9.69%
9.75%
9.82%
9.88%
9.94%
10.01%
10.07%
10.13%
10.20%
10.26%
10.32%
10.39%
10.

Ahora definimos el MAE y lo calculamos

In [None]:
def get_user_mae(u, predictions):
    """Mean absolute error of user `u` over the test items that have a prediction.

    Args:
        u: user row index in ``test_matrix``.
        predictions: table indexed as ``predictions[u][i]``; None means the
            item was not predicted.

    Returns:
        The mean absolute error, or None when no test item has a prediction.
    """
    abs_errors = [
        abs(test_matrix[u, i] - predictions[u][i])
        for i in test_matrix[u].nonzero()[1]
        if predictions[u][i] is not None
    ]

    if not abs_errors:
        return None

    return sum(abs_errors) / len(abs_errors)


In [None]:
def get_mae(predictions):
    """Average the per-user MAE over every user with test ratings.

    Users whose MAE is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_mae(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# MAE for each (prediction method, similarity) combination.
mae_avg_corr = get_mae(avg_predictions_corr)
mae_avg_jmsd = get_mae(avg_predictions_jmsd)
mae_wavg_corr = get_mae(wavg_predictions_corr)
mae_wavg_jmsd = get_mae(wavg_predictions_jmsd)
mae_dfm_corr = get_mae(dfm_predictions_corr)
mae_dfm_jmsd = get_mae(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"MAE Avg Prediction with Corr Similarity = {mae_avg_corr!s}")
print(f"MAE Avg Prediction with JMSD Similarity = {mae_avg_jmsd!s}")
print(f"MAE Weighted Avg Prediction with Corr Similarity = {mae_wavg_corr!s}")
print(f"MAE Weighted Avg Prediction with JMSD Similarity = {mae_wavg_jmsd!s}")
print(f"MAE Dev from Mean Prediction with Corr Similarity = {mae_dfm_corr!s}")
print(f"MAE Dev from Mean Prediction with JMSD Similarity = {mae_dfm_jmsd!s}")

MAE Avg Prediction with Corr Similarity = 0.9253989193716361
MAE Avg Prediction with JMSD Similarity = 0.8891381276586673
MAE Weighted Avg Prediction with Corr Similarity = 1.4951731697457744
MAE Weighted Avg Prediction with JMSD Similarity = 0.8896986542954874
MAE Dev from Mean Prediction with Corr Similarity = 0.906414319183112
MAE Dev from Mean Prediction with JMSD Similarity = 0.8714327656261254


Podemos ver que las predicciones que se hicieron con JMSD se han comportado mejor. Además, solo un método (media ponderada con similaridad de correlación) obtuvo un MAE superior a 1 punto.

#### Cálculo del resto de medidas
Las adaptaciones para las matrices sparse utilizadas se centran en el acceso eficiente a los datos no nulos mediante `nonzero()`,  
reemplazando las verificaciones basadas en `None` por comprobaciones de valores distintos de cero,  
y empleando un pre-filtrado de vecinos para reducir drásticamente los pares de comparación.

rmse

In [None]:
def get_user_rmse(u, predictions):
    """Root mean squared error of user `u` over the test items that have a prediction.

    Args:
        u: user row index in ``test_matrix``.
        predictions: table indexed as ``predictions[u][i]``; None means the
            item was not predicted.

    Returns:
        The RMSE, or None when no test item has a prediction.
    """
    squared_errors = [
        (test_matrix[u, i] - predictions[u][i]) ** 2
        for i in test_matrix[u].nonzero()[1]
        if predictions[u][i] is not None
    ]

    if not squared_errors:
        return None

    return math.sqrt(sum(squared_errors) / len(squared_errors))

In [None]:
def get_rmse(predictions):
    """Average the per-user RMSE over every user with test ratings.

    Users whose RMSE is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_rmse(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# RMSE for each (prediction method, similarity) combination.
rmse_avg_corr = get_rmse(avg_predictions_corr)
rmse_avg_jmsd = get_rmse(avg_predictions_jmsd)
rmse_wavg_corr = get_rmse(wavg_predictions_corr)
rmse_wavg_jmsd = get_rmse(wavg_predictions_jmsd)
rmse_dfm_corr = get_rmse(dfm_predictions_corr)
rmse_dfm_jmsd = get_rmse(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"RMSE Avg Prediction with Corr Similarity = {rmse_avg_corr!s}")
print(f"RMSE Avg Prediction with JMSD Similarity = {rmse_avg_jmsd!s}")
print(f"RMSE Weighted Avg Prediction with Corr Similarity = {rmse_wavg_corr!s}")
print(f"RMSE Weighted Avg Prediction with JMSD Similarity = {rmse_wavg_jmsd!s}")
print(f"RMSE Dev from Mean Prediction with Corr Similarity = {rmse_dfm_corr!s}")
print(f"RMSE Dev from Mean Prediction with JMSD Similarity = {rmse_dfm_jmsd!s}")

RMSE Avg Prediction with Corr Similarity = 1.2068751519805172
RMSE Avg Prediction with JMSD Similarity = 1.1863055385249233
RMSE Weighted Avg Prediction with Corr Similarity = 2.085948591788406
RMSE Weighted Avg Prediction with JMSD Similarity = 1.186703353540928
RMSE Dev from Mean Prediction with Corr Similarity = 1.1574648409835016
RMSE Dev from Mean Prediction with JMSD Similarity = 1.118176263338421


Precision

In [None]:
# Relevance threshold: a test rating >= theta counts as relevant in the
# precision, recall and IDCG functions defined below.
theta = 4

In [None]:
def get_user_precision(u, predictions):
    """Fraction of user `u`'s recommended test items rated at least `theta`.

    The recommendation comes from ``get_recommendations`` applied to the
    user's prediction row. Only recommended items that have a rating in
    ``test_matrix`` are judged.

    Returns:
        hits / judged items, or None when nothing is recommended or no
        recommended item has a test rating.
    """
    recommended = get_recommendations([predictions[u]])[0]
    if recommended is None:
        return None

    # get_recommendations yields a single item; normalise to a list.
    candidates = recommended if isinstance(recommended, list) else [recommended]

    hits = 0
    judged = 0
    for i in candidates:
        if i is None:
            continue
        actual = test_matrix[u, i]
        if actual != 0:
            judged += 1
            if actual >= theta:
                hits += 1

    return hits / judged if judged > 0 else None

In [None]:
def get_precision(predictions):
    """Average the per-user precision over every user with test ratings.

    Users whose precision is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_precision(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# Precision for each (prediction method, similarity) combination.
precision_avg_corr = get_precision(avg_predictions_corr)
precision_avg_jmsd = get_precision(avg_predictions_jmsd)
precision_wavg_corr = get_precision(wavg_predictions_corr)
precision_wavg_jmsd = get_precision(wavg_predictions_jmsd)
precision_dfm_corr = get_precision(dfm_predictions_corr)
precision_dfm_jmsd = get_precision(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"Precision Avg Prediction with Corr Similarity = {precision_avg_corr!s}")
print(f"Precision Avg Prediction with JMSD Similarity = {precision_avg_jmsd!s}")
print(f"Precision Weighted Avg Prediction with Corr Similarity = {precision_wavg_corr!s}")
print(f"Precision Weighted Avg Prediction with JMSD Similarity = {precision_wavg_jmsd!s}")
print(f"Precision Dev from Mean Prediction with Corr Similarity = {precision_dfm_corr!s}")
print(f"Precision Dev from Mean Prediction with JMSD Similarity = {precision_dfm_jmsd!s}")

Precision Avg Prediction with Corr Similarity = 0.8366988586479368
Precision Avg Prediction with JMSD Similarity = 0.8419666374012291
Precision Weighted Avg Prediction with Corr Similarity = 0.845478489903424
Precision Weighted Avg Prediction with JMSD Similarity = 0.839332748024583
Precision Dev from Mean Prediction with Corr Similarity = 0.839332748024583
Precision Dev from Mean Prediction with JMSD Similarity = 0.835820895522388


Recall

In [None]:
def get_user_recall(u, predictions):
    """Fraction of user `u`'s relevant test items that were recommended.

    A test item is relevant when its rating is at least `theta`. The
    recommendation comes from ``get_recommendations`` applied to the user's
    prediction row.

    Returns:
        recommended relevant items / relevant items, or None when the user
        has no relevant test item.
    """
    recommended = get_recommendations([predictions[u]])[0]
    # get_recommendations yields a single item; normalise to a list.
    if recommended is not None and not isinstance(recommended, list):
        recommended = [recommended]

    relevant = [i for i in test_matrix[u].nonzero()[1] if test_matrix[u, i] >= theta]
    if not relevant:
        return None

    if recommended is None:
        found = 0
    else:
        found = sum(1 for i in relevant if i in recommended)

    return found / len(relevant)

def get_recall(predictions):
    """Average the per-user recall over every user with test ratings.

    Users whose recall is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_recall(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# Recall for each (prediction method, similarity) combination.
recall_avg_corr = get_recall(avg_predictions_corr)
recall_avg_jmsd = get_recall(avg_predictions_jmsd)
recall_wavg_corr = get_recall(wavg_predictions_corr)
recall_wavg_jmsd = get_recall(wavg_predictions_jmsd)
recall_dfm_corr = get_recall(dfm_predictions_corr)
recall_dfm_jmsd = get_recall(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"Recall Avg Prediction with Corr Similarity = {recall_avg_corr!s}")
print(f"Recall Avg Prediction with JMSD Similarity = {recall_avg_jmsd!s}")
print(f"Recall Weighted Avg Prediction with Corr Similarity = {recall_wavg_corr!s}")
print(f"Recall Weighted Avg Prediction with JMSD Similarity = {recall_wavg_jmsd!s}")
print(f"Recall Dev from Mean Prediction with Corr Similarity = {recall_dfm_corr!s}")
print(f"Recall Dev from Mean Prediction with JMSD Similarity = {recall_dfm_jmsd!s}")

Recall Avg Prediction with Corr Similarity = 0.15236202588080855
Recall Avg Prediction with JMSD Similarity = 0.15269808356298617
Recall Weighted Avg Prediction with Corr Similarity = 0.15308404722338936
Recall Weighted Avg Prediction with JMSD Similarity = 0.15239471908502739
Recall Dev from Mean Prediction with Corr Similarity = 0.15316387844142437
Recall Dev from Mean Prediction with JMSD Similarity = 0.15337196490913357


F1

In [None]:
def get_user_f1(u, predictions):
    """Harmonic mean of user `u`'s precision and recall.

    Returns:
        None when either precision or recall is None, 0 when both are zero,
        and ``2 * P * R / (P + R)`` otherwise.
    """
    user_precision = get_user_precision(u, predictions)
    user_recall = get_user_recall(u, predictions)

    if user_precision is None or user_recall is None:
        return None

    # Both zero would make the harmonic mean divide by zero.
    if user_precision == 0 and user_recall == 0:
        return 0

    return 2 * user_precision * user_recall / (user_precision + user_recall)

In [None]:
def get_f1(predictions):
    """Average the per-user F1 over every user with test ratings.

    Users whose F1 is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_f1(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# F1 for each (prediction method, similarity) combination.
f1_avg_corr = get_f1(avg_predictions_corr)
f1_avg_jmsd = get_f1(avg_predictions_jmsd)
f1_wavg_corr = get_f1(wavg_predictions_corr)
f1_wavg_jmsd = get_f1(wavg_predictions_jmsd)
f1_dfm_corr = get_f1(dfm_predictions_corr)
f1_dfm_jmsd = get_f1(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"F1 Avg Prediction with Corr Similarity = {f1_avg_corr!s}")
print(f"F1 Avg Prediction with JMSD Similarity = {f1_avg_jmsd!s}")
print(f"F1 Weighted Avg Prediction with Corr Similarity = {f1_wavg_corr!s}")
print(f"F1 Weighted Avg Prediction with JMSD Similarity = {f1_wavg_jmsd!s}")
print(f"F1 Dev from Mean Prediction with Corr Similarity = {f1_dfm_corr!s}")
print(f"F1 Dev from Mean Prediction with JMSD Similarity = {f1_dfm_jmsd!s}")

F1 Avg Prediction with Corr Similarity = 0.24416223386263353
F1 Avg Prediction with JMSD Similarity = 0.24520824339893638
F1 Weighted Avg Prediction with Corr Similarity = 0.2460613954864842
F1 Weighted Avg Prediction with JMSD Similarity = 0.24472866824270453
F1 Dev from Mean Prediction with Corr Similarity = 0.2450653731201427
F1 Dev from Mean Prediction with JMSD Similarity = 0.2453404028500156


nDCG

In [None]:
def get_sorted_test_items(u):
    """Return user `u`'s test item indices ordered by decreasing test rating.

    Returns:
        A list of item column indices; empty when the user has no test
        ratings. Items with equal ratings keep their original relative order.
    """
    rated_items = list(test_matrix[u].nonzero()[1])
    return sorted(rated_items, key=lambda item: test_matrix[u, item], reverse=True)

In [None]:
def get_user_idcg(u):
    """Ideal DCG of user `u`, built from the test items rated at least `theta`.

    Items are taken in decreasing rating order. Each one contributes
    ``(2 ** rating - 1) / log2(position + 2)``, with `position` being its
    0-based rank among all of the user's test items.

    Returns:
        The accumulated gain; 0 when no test item reaches `theta`.
    """
    return sum(
        (2 ** test_matrix[u, item] - 1) / math.log2(rank + 2)
        for rank, item in enumerate(get_sorted_test_items(u))
        if test_matrix[u, item] >= theta
    )

In [None]:
def get_user_dcg(u, recommendations):
    """DCG of the recommendations made to user `u`.

    Args:
        u: user row index in ``test_matrix``.
        recommendations: a single item index, a list of item indices, or None.

    Returns:
        Sum of ``(2 ** rating - 1) / log2(position + 2)`` over the recommended
        items that have a test rating; 0 when `recommendations` is None.
        Unlike the ideal DCG, no `theta` filter is applied here.
    """
    if recommendations is None:
        return 0

    ranked = recommendations if isinstance(recommendations, list) else [recommendations]

    total = 0
    for rank, item in enumerate(ranked):
        if item is None:
            continue
        rating = test_matrix[u, item]
        if rating != 0:
            total += (2 ** rating - 1) / math.log2(rank + 2)

    return total

In [None]:
def get_user_ndcg(u, predictions):
    """Normalised DCG of user `u`: DCG of the recommendation over the ideal DCG.

    Returns:
        ``dcg / idcg``, or None when the ideal DCG is 0.

    NOTE(review): the DCG is computed over the single item returned by
    get_recommendations, while the ideal DCG sums over every relevant test
    item. Confirm this is the intended normalisation.
    """
    recommended = get_recommendations([predictions[u]])[0]

    achieved = get_user_dcg(u, recommended)
    ideal = get_user_idcg(u)

    return None if ideal == 0 else achieved / ideal

In [None]:
def get_ndcg(predictions):
    """Average the per-user nDCG over every user with test ratings.

    Users whose nDCG is None are left out of the average.

    Returns:
        The mean of the per-user values, or None when no user contributes.
    """
    per_user = []
    for u in range(NUM_USERS):
        if not has_test_ratings(u):
            continue
        value = get_user_ndcg(u, predictions)
        if value is not None:
            per_user.append(value)

    return sum(per_user) / len(per_user) if per_user else None

In [None]:
# nDCG for each (prediction method, similarity) combination.
ndcg_avg_corr = get_ndcg(avg_predictions_corr)
ndcg_avg_jmsd = get_ndcg(avg_predictions_jmsd)
ndcg_wavg_corr = get_ndcg(wavg_predictions_corr)
ndcg_wavg_jmsd = get_ndcg(wavg_predictions_jmsd)
ndcg_dfm_corr = get_ndcg(dfm_predictions_corr)
ndcg_dfm_jmsd = get_ndcg(dfm_predictions_jmsd)

# "!s" applies str() to the value, exactly like the explicit concatenation did.
print(f"nDCG Avg Prediction with Corr Similarity = {ndcg_avg_corr!s}")
print(f"nDCG Avg Prediction with JMSD Similarity = {ndcg_avg_jmsd!s}")
print(f"nDCG Weighted Avg Prediction with Corr Similarity = {ndcg_wavg_corr!s}")
print(f"nDCG Weighted Avg Prediction with JMSD Similarity = {ndcg_wavg_jmsd!s}")
print(f"nDCG Dev from Mean Prediction with Corr Similarity = {ndcg_dfm_corr!s}")
print(f"nDCG Dev from Mean Prediction with JMSD Similarity = {ndcg_dfm_jmsd!s}")

nDCG Avg Prediction with Corr Similarity = 0.26908334391304933
nDCG Avg Prediction with JMSD Similarity = 0.2692377818839883
nDCG Weighted Avg Prediction with Corr Similarity = 0.2698495886034852
nDCG Weighted Avg Prediction with JMSD Similarity = 0.26909344337642344
nDCG Dev from Mean Prediction with Corr Similarity = 0.2675613909560734
nDCG Dev from Mean Prediction with JMSD Similarity = 0.26945522485491585
