#### Carga librerias

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from datetime import datetime
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
import random
from scipy.stats import pearsonr
from collections import Counter
import collections

#### Carga archivo

In [3]:
df = pd.read_csv("./Video_Games.csv", header=None)

df.columns = ['item_id', 'user_id', 'rating', 'timestamp']

df = df.drop('timestamp', axis=1)

print(f"Tamaño total del dataset: {df.shape}")

Tamaño total del dataset: (2565349, 3)


In [4]:
top_n_users = 100000 # Maximo de usuarios con los que queremos trabajar
min_ratings = 10   # Usuarios con al menos este número de ratings
user_counts = df['user_id'].value_counts()

dense_users = user_counts[user_counts >= min_ratings].index.tolist()

if len(dense_users) > top_n_users:
    dense_users = dense_users[:top_n_users]

print(f"Seleccionados {len(dense_users)} usuarios con al menos {min_ratings} ratings cada uno")

df_dense = df[df['user_id'].isin(dense_users)]

print(f"Tamaño del dataset filtrado: {df_dense.shape}")
print(f"Reducción del dataset: {100 * (1 - len(df_dense) / len(df)):.2f}%")

Seleccionados 15517 usuarios con al menos 10 ratings cada uno
Tamaño del dataset filtrado: (284867, 3)
Reducción del dataset: 88.90%


In [5]:
def random_split(data, test_ratio=0.2, random_state=42):
    data_shuffled = data.sample(frac=1, random_state=random_state)
    
    split = int(len(data) * (1 - test_ratio))
    
    train = data_shuffled.iloc[:split]
    test = data_shuffled.iloc[split:]
    
    train_users = set(train['user_id'].unique())
    train_items = set(train['item_id'].unique())
    test = test[test['user_id'].isin(train_users) & test['item_id'].isin(train_items)]
    
    return train, test

train_data, test_data = random_split(df_dense)
print(f"Tamaño del conjunto de entrenamiento: {train_data.shape}")
print(f"Tamaño del conjunto de prueba: {test_data.shape}")

train_users = set(train_data['user_id'].unique())
train_items = set(train_data['item_id'].unique())
common_users = len(train_users.intersection(set(test_data['user_id'].unique())))
common_items = len(train_items.intersection(set(test_data['item_id'].unique())))

print(f"Usuarios comunes en train y test: {common_users}/{len(train_users)}")
print(f"Items comunes en train y test: {common_items}/{len(train_items)}")

Tamaño del conjunto de entrenamiento: (227893, 3)
Tamaño del conjunto de prueba: (53735, 3)
Usuarios comunes en train y test: 14541/15517
Items comunes en train y test: 15474/34132


#### Creación de matrices

Mapeos para los datos de entrenamiento

In [6]:
train_users_list = train_data['user_id'].unique()
train_items_list = train_data['item_id'].unique()

user2idx = {user_id: idx for idx, user_id in enumerate(train_users_list)}
item2idx = {item_id: idx for idx, item_id in enumerate(train_items_list)}

idx2user = {idx: user for user, idx in user2idx.items()}
idx2item = {idx: item for item, idx in item2idx.items()}

NUM_USERS = len(user2idx)
NUM_ITEMS = len(item2idx)
MIN_RATING = df['rating'].min()
MAX_RATING = df['rating'].max()

print(f"Usuarios: {NUM_USERS}, Items: {NUM_ITEMS}")

Usuarios: 15517, Items: 34132


Matriz de train sparse

In [7]:
# Inicializar matriz vacía
train_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Llenar con valores sin acumular
for _, row in train_data.iterrows():
    if row['user_id'] in user2idx and row['item_id'] in item2idx:
        u_idx = user2idx[row['user_id']]
        i_idx = item2idx[row['item_id']]
        # Asignar directamente, sin acumular
        train_matrix[u_idx, i_idx] = row['rating']

# Convertir a CSR para operaciones eficientes
train_matrix = train_matrix.tocsr()

Matriz de test sparse

In [8]:
test_matrix = sp.lil_matrix((NUM_USERS, NUM_ITEMS))

# Llenar con valores sin acumular
for _, row in test_data.iterrows():
    if row['user_id'] in user2idx and row['item_id'] in item2idx:
        u_idx = user2idx[row['user_id']]
        i_idx = item2idx[row['item_id']]
        # Asignar directamente, sin acumular
        test_matrix[u_idx, i_idx] = row['rating']

# Convertir a CSR para operaciones eficientes
test_matrix = test_matrix.tocsr()

Medidas

In [9]:
print(f"Forma de la matriz de train: {train_matrix.shape}")
print(f"Densidad de la matriz de train: {train_matrix.nnz / (NUM_USERS * NUM_ITEMS):.6f}")
print(f"Forma de la matriz de test: {test_matrix.shape}")
print(f"Densidad de la matriz de test: {test_matrix.nnz / (NUM_USERS * NUM_ITEMS):.6f}")

Forma de la matriz de train: (15517, 34132)
Densidad de la matriz de train: 0.000416
Forma de la matriz de test: (15517, 34132)
Densidad de la matriz de test: 0.000101


#### Cáclulo de similaridad

correlation similarity

In [10]:
def correlation_similarity(u, v):
    if u == v:
        return sp.identity(NUM_USERS, format='csr')
        
    u_items = set(train_matrix[u].indices)
    v_items = set(train_matrix[v].indices)
    common_items = u_items.intersection(v_items)
    
    if len(common_items) < 2:
        return sp.csr_matrix((1, NUM_USERS))
    
    u_ratings = [train_matrix[u, i] for i in common_items]
    v_ratings = [train_matrix[v, i] for i in common_items]
    
    avg_u = np.mean(u_ratings) if len(u_ratings) > 0 else 0
    avg_v = np.mean(v_ratings) if len(v_ratings) > 0 else 0
    
    num = 0
    den_u = 0
    den_v = 0

    for i in common_items:
        r_u = float(train_matrix[u, i])
        r_v = float(train_matrix[v, i])
        
        num += (r_u - avg_u) * (r_v - avg_v)
        den_u += (r_u - avg_u) ** 2
        den_v += (r_v - avg_v) ** 2
    
    if num == 0 or den_u == 0 or den_v == 0:
        return sp.csr_matrix((1, NUM_USERS))
        
    sim = num / (np.sqrt(den_u) * np.sqrt(den_v))
    
    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

jmsd similarity

In [11]:
def jmsd_similarity(u, v):
    if u == v:
        return sp.identity(NUM_USERS, format='csr')
    
    u_items = set(train_matrix[u].indices)
    v_items = set(train_matrix[v].indices)
    common_items = u_items.intersection(v_items)
    all_items = u_items.union(v_items)
    
    intersection = len(common_items)
    union = len(all_items)
    
    if intersection == 0:
        return sp.csr_matrix((1, NUM_USERS))
    
    rating_range = MAX_RATING - MIN_RATING
    
    diff_sum = 0
    for i in common_items:
        # Normalizamos las calificaciones
        r_u = (float(train_matrix[u, i]) - MIN_RATING) / rating_range
        r_v = (float(train_matrix[v, i]) - MIN_RATING) / rating_range
        
        diff_sum += (r_u - r_v) ** 2
    
    msd = diff_sum / intersection
    jaccard = intersection / union
    
    sim = jaccard * (1 - msd)
    
    return sp.csr_matrix(([sim], ([0], [v])), shape=(1, NUM_USERS))

Hacemos un pequeño test para comprobar que funciona.  
En la mayoria de casos da 0.0, aunque de vez en cuando encuentra usuarios con algo de similaridad.  
Esto ocurre porque nuestro dataset es bastante disperso

In [12]:
u, v = random.randint(0, NUM_USERS-1), random.randint(0, NUM_USERS-1)
corr_sim = correlation_similarity(u, v)[0, v] if u != v else 1.0
jmsd_sim = jmsd_similarity(u, v)[0, v] if u != v else 1.0
print(f"Similitud entre usuario {idx2user[u]} y {idx2user[v]}: Correlación = {corr_sim:.6f}, JMSD = {jmsd_sim:.6f}")

Similitud entre usuario A34FNHVP8TJNWD y A1Y043ABRDVQC0: Correlación = 0.000000, JMSD = 0.000000


#### Obtener vecinos

In [13]:
def get_neighbors(u, similarities):
    k = 25
    neighbors = []
    
    for v, sim in enumerate(similarities):
        if v != u and sim is not None:
            if sim.nnz > 0:
                sim_value = sim[0, v]
                neighbors.append((v, sim_value))
    
    neighbors.sort(key=lambda x: x[1], reverse=True)
    return neighbors[:k]

In [14]:
u = 112
similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
neighbors_corr = get_neighbors(u, similarities_corr)
neighbors_jmsd = get_neighbors(u, similarities_jmsd)
print(neighbors_corr)
print(neighbors_jmsd)

[(1766, np.float64(0.9999999999999998)), (7517, np.float64(0.9999999999999998)), (2017, np.float64(-0.944911182523068)), (3493, np.float64(-0.9999999999999998)), (4004, np.float64(-0.9999999999999998))]
[(15196, np.float64(0.0798611111111111)), (10963, np.float64(0.06481481481481481)), (11574, np.float64(0.057692307692307696)), (15188, np.float64(0.05555555555555555)), (15301, np.float64(0.05555555555555555)), (1766, np.float64(0.052734375)), (2048, np.float64(0.05263157894736842)), (737, np.float64(0.05208333333333333)), (2281, np.float64(0.05208333333333333)), (6378, np.float64(0.05208333333333333)), (9301, np.float64(0.05208333333333333)), (11008, np.float64(0.05208333333333333)), (7498, np.float64(0.05)), (12163, np.float64(0.05)), (146, np.float64(0.049342105263157895)), (12177, np.float64(0.049342105263157895)), (8201, np.float64(0.047619047619047616)), (9121, np.float64(0.046875)), (10224, np.float64(0.046875)), (13670, np.float64(0.046875)), (14290, np.float64(0.045454545454545

#### Estimación de las predicciones
En el caso de que no haya los suficientes datos, se devolverá la media global del item. Es un término conocido como 'fallback' que se usa en casos reales

Media del item

In [15]:
def global_average(i):
    users_who_rated = [u for u in range(NUM_USERS) if train_matrix[u, i] != 0]
    if users_who_rated:
        return sum(train_matrix[u, i] for u in users_who_rated) / len(users_who_rated)
    return 3.0

Media

In [16]:
def average_prediction(u, i, neighbors):
    count = 0
    sum_ratings = 0
    
    for v, _ in neighbors:
        if train_matrix[v, i] != 0:
            sum_ratings += train_matrix[v, i]
            count += 1
    
    if count > 0:
        return sum_ratings / count
    
    return global_average(i)

In [17]:
u = 67
i = 1865
predicted_corr = average_prediction(u, i, neighbors_corr)
predicted_jmsd = average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.631578947368421
4.631578947368421


Media ponderada

In [18]:
def weighted_average_prediction(u, i, neighbors):
    numerator = 0
    denominator = 0
    
    for v, sim in neighbors:
        if train_matrix[v, i] != 0:
            numerator += sim * train_matrix[v, i]
            denominator += abs(sim)
    
    if denominator > 0:
        return numerator / denominator
    
    return global_average(i)

In [19]:
u = 67
i = 571
predicted_corr = weighted_average_prediction(u, i, neighbors_corr)
predicted_jmsd = weighted_average_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.326732673267327
4.326732673267327


Media de agregación

In [20]:
def deviation_from_mean_prediction(u, i, neighbors):
    u_ratings = train_matrix[u].data
    avg_u = np.mean(u_ratings) if len(u_ratings) > 0 else global_average(i)
    
    sum_deviation = 0
    count = 0
    
    for v, _ in neighbors:
        if train_matrix[v, i] != 0:
            v_ratings = train_matrix[v].data
            avg_v = np.mean(v_ratings) if len(v_ratings) > 0 else 0
            
            sum_deviation += (train_matrix[v, i] - avg_v)
            count += 1
    
    if count > 0:
        prediction = avg_u + (sum_deviation / count)
        return prediction
    
    return global_average(i)

In [21]:
u = 3
i = 2716
predicted_corr = deviation_from_mean_prediction(u, i, neighbors_corr)
predicted_jmsd = deviation_from_mean_prediction(u, i, neighbors_jmsd)
print(predicted_corr)
print(predicted_jmsd)

4.208791208791209
4.208791208791209


#### Calculo de recomendaciones

In [22]:
def get_recommendations(predictions):
    recommendations = [None for _ in range(len(predictions))]
    
    for n in range(len(predictions)):
        max_value = None
        item = None
        
        for i, value in enumerate(predictions[n]):
            if value is not None and (max_value is None or value > max_value):
                max_value = value
                item = i
        
        recommendations[n] = item
    
    return recommendations

In [23]:
N = 10 #Vamos a usar 10 usuarios para mostrar recomendaciones
M = NUM_ITEMS

test_users = random.sample(range(NUM_USERS), N)

neighbors_corr = {}
neighbors_jmsd = {}
for u in test_users:
    similarities_corr = [None if u == v else correlation_similarity(u, v) for v in range(NUM_USERS)]
    similarities_jmsd = [None if u == v else jmsd_similarity(u, v) for v in range(NUM_USERS)]
    neighbors_corr[u] = get_neighbors(u, similarities_corr)
    neighbors_jmsd[u] = get_neighbors(u, similarities_jmsd)

In [24]:
avg_predictions_corr = [[None if train_matrix[u, i] != 0 else average_prediction(u, i, neighbors_corr[u]) 
                        for i in range(100)] for u in test_users]
avg_predictions_jmsd = [[None if train_matrix[u, i] != 0 else average_prediction(u, i, neighbors_jmsd[u]) 
                        for i in range(100)] for u in test_users]

wavg_predictions_corr = [[None if train_matrix[u, i] != 0 else weighted_average_prediction(u, i, neighbors_corr[u]) 
                        for i in range(100)] for u in test_users]
wavg_predictions_jmsd = [[None if train_matrix[u, i] != 0 else weighted_average_prediction(u, i, neighbors_jmsd[u]) 
                        for i in range(100)] for u in test_users]

dfm_predictions_corr = [[None if train_matrix[u, i] != 0 else deviation_from_mean_prediction(u, i, neighbors_corr[u]) 
                        for i in range(100)] for u in test_users]
dfm_predictions_jmsd = [[None if train_matrix[u, i] != 0 else deviation_from_mean_prediction(u, i, neighbors_jmsd[u]) 
                        for i in range(100)] for u in test_users]

KeyboardInterrupt: 

In [None]:
recommendations_avg_corr = get_recommendations(avg_predictions_corr)
recommendations_avg_jmsd = get_recommendations(avg_predictions_jmsd)
recommendations_wavg_corr = get_recommendations(wavg_predictions_corr)
recommendations_wavg_jmsd = get_recommendations(wavg_predictions_jmsd)
recommendations_dfm_corr = get_recommendations(dfm_predictions_corr)
recommendations_dfm_jmsd = get_recommendations(dfm_predictions_jmsd)

print(recommendations_avg_corr)
print(recommendations_avg_jmsd)
print(recommendations_wavg_corr)
print(recommendations_wavg_jmsd)
print(recommendations_dfm_corr)
print(recommendations_dfm_jmsd)

[66, 79, 7, 53, 54, 79, 19, 79, 26, 79]
[51, 79, 79, 21, 38, 79, 59, 79, 8, 66]
[79, 79, 7, 53, 54, 79, 19, 79, 79, 79]
[51, 79, 79, 21, 89, 79, 59, 92, 26, 66]
[69, 79, 35, 53, 79, 79, 19, 79, 26, 79]
[51, 79, 79, 79, 79, 79, 59, 79, 83, 66]


#### Calculo del MAE

In [None]:
def has_test_ratings(u):
    return test_matrix[u].nnz > 0

Para poder hacer las predicciones de múltiples algoritmos, vimos necesario hacer un prefiltrado de posibles vecinos en bloques de 50  
para reducir drásticamente el tiempo de ejecución (anteriormente estuvo 1 día y no terminó).  
Actualmente estamos comparando similitudes únicamente con vecinos potenciales. (90mins aprox)

In [None]:
import time
print("Iniciando ejecución...")
start_time = time.time()

MAX_USERS = 100
MAX_POTENTIAL_NEIGHBORS = 30

print(f"Seleccionando hasta {MAX_USERS} usuarios con ratings en test...")
test_users = []
for u in range(min(1000, NUM_USERS)):
    if u % 100 == 0:
        print(f"Verificando usuario {u}...")
    if test_matrix[u].nnz > 0:
        test_users.append(u)
        if len(test_users) >= MAX_USERS:
            break

print(f"Encontrados {len(test_users)} usuarios con ratings en test en {time.time()-start_time:.2f}s")

predictions = {
    'avg_corr': {},
    'avg_jmsd': {},
    'wavg_corr': {},
    'wavg_jmsd': {},
    'dfm_corr': {},
    'dfm_jmsd': {}
}

# Procesar usuarios individualmente (sin lotes para simplificar)
for u_idx, u in enumerate(test_users):
    print(f"Procesando usuario {u_idx+1}/{len(test_users)}: {u}")
    user_start = time.time()
    
    # Obtener ítems calificados en test (pre-calcular para evitar llamadas repetidas)
    test_items = test_matrix[u].nonzero()[1]
    print(f"  Usuario tiene {len(test_items)} ítems en test")
    
    # Simplificación radical: usar solo 5 vecinos aleatorios para prueba inicial
    if u_idx == 0:  # Solo para el primer usuario para prueba
        print("  PRUEBA RÁPIDA: Usando solo 5 vecinos aleatorios")
        potential_neighbors = random.sample(range(NUM_USERS), min(5, NUM_USERS))
        if u in potential_neighbors:
            potential_neighbors.remove(u)
    else:
        # Obtener vecinos potenciales limitados
        potential_neighbors = set()
        for i in train_matrix[u].indices:
            rated_i = train_matrix[:, i].nonzero()[0]
            potential_neighbors.update(rated_i[:10])  # Limitar usuarios por ítem
            if len(potential_neighbors) >= MAX_POTENTIAL_NEIGHBORS:
                break
        
        if u in potential_neighbors:
            potential_neighbors.remove(u)
            
        potential_neighbors = list(potential_neighbors)[:MAX_POTENTIAL_NEIGHBORS]
    
    print(f"  Seleccionados {len(potential_neighbors)} vecinos potenciales en {time.time()-user_start:.2f}s")
    
    # Calcular similitudes y vecinos (versión simplificada)
    neighbors_corr = []
    neighbors_jmsd = []
    
    sim_start = time.time()
    for v in potential_neighbors:
        try:
            # Correlación
            corr_sim = correlation_similarity(u, v)
            if corr_sim.nnz > 0:
                sim_val = corr_sim[0, v]
                neighbors_corr.append((v, sim_val))
            
            # JMSD
            jmsd_sim = jmsd_similarity(u, v)
            if jmsd_sim.nnz > 0:
                sim_val = jmsd_sim[0, v]
                neighbors_jmsd.append((v, sim_val))
        except Exception as e:
            print(f"    Error calculando similitud entre {u} y {v}: {e}")
    
    print(f"  Similitudes calculadas en {time.time()-sim_start:.2f}s. Vecinos encontrados: {len(neighbors_corr)}/{len(neighbors_jmsd)}")
    
    # Ordenar vecinos por similitud
    neighbors_corr.sort(key=lambda x: x[1], reverse=True)
    neighbors_jmsd.sort(key=lambda x: x[1], reverse=True)
    
    # Solo mantener los 10 mejores vecinos para cada método
    neighbors_corr = neighbors_corr[:10]
    neighbors_jmsd = neighbors_jmsd[:10]
    
    # Calcular solo 1 ítem como prueba
    if len(test_items) > 0:
        i = test_items[0]
        print(f"  PRUEBA: Prediciendo para ítem {i}")
        
        # Calcular predicciones con diferentes métodos
        try:
            predictions['avg_corr'][u] = {i: average_prediction(u, i, neighbors_corr)}
            predictions['avg_jmsd'][u] = {i: average_prediction(u, i, neighbors_jmsd)}
            predictions['wavg_corr'][u] = {i: weighted_average_prediction(u, i, neighbors_corr, 'corr')}
            predictions['wavg_jmsd'][u] = {i: weighted_average_prediction(u, i, neighbors_jmsd, 'jmsd')}
            predictions['dfm_corr'][u] = {i: deviation_from_mean_prediction(u, i, neighbors_corr)}
            predictions['dfm_jmsd'][u] = {i: deviation_from_mean_prediction(u, i, neighbors_jmsd)}
            
            print(f"  Predicciones para ítem {i}:")
            for method, preds in predictions.items():
                if u in preds and i in preds[u]:
                    print(f"    {method}: {preds[u][i]}")
        except Exception as e:
            print(f"    Error calculando predicciones: {e}")
    
    print(f"  Usuario procesado en {time.time()-user_start:.2f}s")
    
    # Solo procesar algunos usuarios para prueba inicial
    if u_idx >= 2:  # Después de 3 usuarios
        user_decision = input("¿Continuar con el siguiente usuario? (s/n): ")
        if user_decision.lower() != 's':
            print("Deteniendo procesamiento para análisis")
            break

print(f"Tiempo total: {time.time()-start_time:.2f}s")

Iniciando ejecución...
Seleccionando hasta 100 usuarios con ratings en test...


NameError: name 'NUM_USERS' is not defined

Ahora definimos el MAE y lo calculamos

In [None]:
def get_user_mae(u, predictions):
    mae = 0
    count = 0
    
    test_items = test_matrix[u].nonzero()[1]
    
    for i in test_items:
        if predictions[u][i] is not None:
            mae += abs(test_matrix[u, i] - predictions[u][i])
            count += 1
    
    if count > 0:
        return mae / count
    else:
        return None
    

In [None]:
def get_mae(predictions):
    mae = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_mae = get_user_mae(u, predictions)
            
            if user_mae is not None:
                mae += user_mae
                count += 1
    
    if count > 0:
        return mae / count
    else:
        return None

In [None]:
mae_avg_corr = get_mae(avg_predictions_corr)
mae_avg_jmsd = get_mae(avg_predictions_jmsd)
mae_wavg_corr = get_mae(wavg_predictions_corr)
mae_wavg_jmsd = get_mae(wavg_predictions_jmsd)
mae_dfm_corr = get_mae(dfm_predictions_corr)
mae_dfm_jmsd = get_mae(dfm_predictions_jmsd)

print(f"MAE Avg Prediction with Corr Similarity = " + str(mae_avg_corr))
print(f"MAE Avg Prediction with JMSD Similarity = " + str(mae_avg_jmsd))
print(f"MAE Weighted Avg Prediction with Corr Similarity = " + str(mae_wavg_corr))
print(f"MAE Weighted Avg Prediction with JMSD Similarity = " + str(mae_wavg_jmsd))
print(f"MAE Dev from Mean Prediction with Corr Similarity = " + str(mae_dfm_corr))
print(f"MAE Dev from Mean Prediction with JMSD Similarity = " + str(mae_dfm_jmsd))

MAE Avg Prediction with Corr Similarity = 1.1606368045457007
MAE Avg Prediction with JMSD Similarity = 1.0568724053169671
MAE Weighted Avg Prediction with Corr Similarity = 1.6286280183648738
MAE Weighted Avg Prediction with JMSD Similarity = 1.1158895577077275
MAE Dev from Mean Prediction with Corr Similarity = 1.1511359965649157
MAE Dev from Mean Prediction with JMSD Similarity = 1.0690044704402293


#### Cálculo del resto de medidas
Las adaptaciones para las matrices sparse utilizadas se centran acceso eficiente a datos no nulos mediante `nonzero()`,  
reemplazando verificaciones basadas en `None` por comprobaciones de valores distintos de cero,  
y empleando pre-filtrado de vecinos para reducir drásticamente los pares de comparación

rmse

In [None]:
def get_user_rmse(u, predictions):
    mse = 0
    count = 0
    
    test_items = test_matrix[u].nonzero()[1]
    
    for i in test_items:
        if predictions[u][i] is not None:
            mse += (test_matrix[u, i] - predictions[u][i]) ** 2
            count += 1
    
    if count > 0:
        return math.sqrt(mse / count)
    else:
        return None

In [None]:
def get_rmse(predictions):
    rmse = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_rmse = get_user_rmse(u, predictions)
            
            if user_rmse is not None:
                rmse += user_rmse
                count += 1
    
    if count > 0:
        return rmse / count
    else:
        return None

In [None]:
rmse_avg_corr = get_rmse(avg_predictions_corr)
rmse_avg_jmsd = get_rmse(avg_predictions_jmsd)
rmse_wavg_corr = get_rmse(wavg_predictions_corr)
rmse_wavg_jmsd = get_rmse(wavg_predictions_jmsd)
rmse_dfm_corr = get_rmse(dfm_predictions_corr)
rmse_dfm_jmsd = get_rmse(dfm_predictions_jmsd)

print(f"RMSE Avg Prediction with Corr Similarity = " + str(rmse_avg_corr))
print(f"RMSE Avg Prediction with JMSD Similarity = " + str(rmse_avg_jmsd))
print(f"RMSE Weighted Avg Prediction with Corr Similarity = " + str(rmse_wavg_corr))
print(f"RMSE Weighted Avg Prediction with JMSD Similarity = " + str(rmse_wavg_jmsd))
print(f"RMSE Dev from Mean Prediction with Corr Similarity = " + str(rmse_dfm_corr))
print(f"RMSE Dev from Mean Prediction with JMSD Similarity = " + str(rmse_dfm_jmsd))

RMSE Avg Prediction with Corr Similarity = 1.4411773904659608
RMSE Avg Prediction with JMSD Similarity = 1.3207952092747248
RMSE Weighted Avg Prediction with Corr Similarity = 2.1292938788818896
RMSE Weighted Avg Prediction with JMSD Similarity = 1.3801907019328008
RMSE Dev from Mean Prediction with Corr Similarity = 1.4055793052285834
RMSE Dev from Mean Prediction with JMSD Similarity = 1.3014693348194508


Precision

In [None]:
theta = 4

In [None]:
def get_user_precision(u, predictions):
    precision = 0
    count = 0
    
    user_predictions = predictions[u]
    recommendations = get_recommendations([user_predictions])[0]
    
    if recommendations is None:
        return None
    
    if not isinstance(recommendations, list):
        recommendations = [recommendations]
    
    for i in recommendations:
        if i is not None and test_matrix[u, i] != 0:
            precision += 1 if test_matrix[u, i] >= theta else 0
            count += 1
    
    if count > 0:
        return precision / count
    else:
        return None

In [None]:
def get_precision(predictions):
    precision = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_precision = get_user_precision(u, predictions)
            
            if user_precision is not None:
                precision += user_precision
                count += 1
    
    if count > 0:
        return precision / count
    else:
        return None

In [None]:
precision_avg_corr = get_precision(avg_predictions_corr)
precision_avg_jmsd = get_precision(avg_predictions_jmsd)
precision_wavg_corr = get_precision(wavg_predictions_corr)
precision_wavg_jmsd = get_precision(wavg_predictions_jmsd)
precision_dfm_corr = get_precision(dfm_predictions_corr)
precision_dfm_jmsd = get_precision(dfm_predictions_jmsd)

print(f"Precision Avg Prediction with Corr Similarity = " + str(precision_avg_corr))
print(f"Precision Avg Prediction with JMSD Similarity = " + str(precision_avg_jmsd))
print(f"Precision Weighted Avg Prediction with Corr Similarity = " + str(precision_wavg_corr))
print(f"Precision Weighted Avg Prediction with JMSD Similarity = " + str(precision_wavg_jmsd))
print(f"Precision Dev from Mean Prediction with Corr Similarity = " + str(precision_dfm_corr))
print(f"Precision Dev from Mean Prediction with JMSD Similarity = " + str(precision_dfm_jmsd))

Precision Avg Prediction with Corr Similarity = 0.8413140311804009
Precision Avg Prediction with JMSD Similarity = 0.8485523385300668
Precision Weighted Avg Prediction with Corr Similarity = 0.8449331848552338
Precision Weighted Avg Prediction with JMSD Similarity = 0.8463251670378619
Precision Dev from Mean Prediction with Corr Similarity = 0.8346325167037862
Precision Dev from Mean Prediction with JMSD Similarity = 0.8382516703786191


Recall

In [None]:
def get_user_recall(u, predictions):
    recall = 0
    count = 0
    
    recommendations = get_recommendations([predictions[u]])[0]
    
    if recommendations is not None and not isinstance(recommendations, list):
        recommendations = [recommendations]
    
    test_items = test_matrix[u].nonzero()[1]
    
    for i in test_items:
        if test_matrix[u, i] >= theta:
            recall += 1 if recommendations is not None and i in recommendations else 0
            count += 1
    
    if count > 0:
        return recall / count
    else:
        return None

def get_recall(predictions):
    recall = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_recall = get_user_recall(u, predictions)
            
            if user_recall is not None:
                recall += user_recall
                count += 1
    
    if count > 0:
        return recall / count
    else:
        return None

In [None]:
recall_avg_corr = get_recall(avg_predictions_corr)
recall_avg_jmsd = get_recall(avg_predictions_jmsd)
recall_wavg_corr = get_recall(wavg_predictions_corr)
recall_wavg_jmsd = get_recall(wavg_predictions_jmsd)
recall_dfm_corr = get_recall(dfm_predictions_corr)
recall_dfm_jmsd = get_recall(dfm_predictions_jmsd)

print(f"Recall Avg Prediction with Corr Similarity = " + str(recall_avg_corr))
print(f"Recall Avg Prediction with JMSD Similarity = " + str(recall_avg_jmsd))
print(f"Recall Weighted Avg Prediction with Corr Similarity = " + str(recall_wavg_corr))
print(f"Recall Weighted Avg Prediction with JMSD Similarity = " + str(recall_wavg_jmsd))
print(f"Recall Dev from Mean Prediction with Corr Similarity = " + str(recall_dfm_corr))
print(f"Recall Dev from Mean Prediction with JMSD Similarity = " + str(recall_dfm_jmsd))

Recall Avg Prediction with Corr Similarity = 0.2516962920865665
Recall Avg Prediction with JMSD Similarity = 0.25189842325149303
Recall Weighted Avg Prediction with Corr Similarity = 0.2524028900153866
Recall Weighted Avg Prediction with JMSD Similarity = 0.25097690649664295
Recall Dev from Mean Prediction with Corr Similarity = 0.25006582972749164
Recall Dev from Mean Prediction with JMSD Similarity = 0.24775415081536542


F1

In [None]:
def get_user_f1(u, predictions):
    precision = get_user_precision(u, predictions)
    recall = get_user_recall(u, predictions)
    
    if precision is None or recall is None:
        return None
    elif precision == 0 and recall == 0:
        return 0
    else:
        return 2 * precision * recall / (precision + recall)

In [None]:
def get_f1(predictions):
    f1 = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_f1 = get_user_f1(u, predictions)
            
            if user_f1 is not None:
                f1 += user_f1
                count += 1
    
    if count > 0:
        return f1 / count
    else:
        return None

In [None]:
f1_avg_corr = get_f1(avg_predictions_corr)
f1_avg_jmsd = get_f1(avg_predictions_jmsd)
f1_wavg_corr = get_f1(wavg_predictions_corr)
f1_wavg_jmsd = get_f1(wavg_predictions_jmsd)
f1_dfm_corr = get_f1(dfm_predictions_corr)
f1_dfm_jmsd = get_f1(dfm_predictions_jmsd)

print(f"F1 Avg Prediction with Corr Similarity = " + str(f1_avg_corr))
print(f"F1 Avg Prediction with JMSD Similarity = " + str(f1_avg_jmsd))
print(f"F1 Weighted Avg Prediction with Corr Similarity = " + str(f1_wavg_corr))
print(f"F1 Weighted Avg Prediction with JMSD Similarity = " + str(f1_wavg_jmsd))
print(f"F1 Dev from Mean Prediction with Corr Similarity = " + str(f1_dfm_corr))
print(f"F1 Dev from Mean Prediction with JMSD Similarity = " + str(f1_dfm_jmsd))

F1 Avg Prediction with Corr Similarity = 0.3581890340662306
F1 Avg Prediction with JMSD Similarity = 0.36006940984671765
F1 Weighted Avg Prediction with Corr Similarity = 0.3596215829987796
F1 Weighted Avg Prediction with JMSD Similarity = 0.35884672306847526
F1 Dev from Mean Prediction with Corr Similarity = 0.3556556407422389
F1 Dev from Mean Prediction with JMSD Similarity = 0.35440112118697564


nDCG

In [None]:
def get_sorted_test_items(u):
    test_items = test_matrix[u].nonzero()[1]
    items = []
    
    if len(test_items) == 0:
        return items
    
    for i in test_items:
        items.append(i)
    
    items.sort(key=lambda x: test_matrix[u, x], reverse=True)
    
    return items

In [None]:
def get_user_idcg(u):
    items = get_sorted_test_items(u)
    idcg = 0
    
    for pos, i in enumerate(items):
        if test_matrix[u, i] >= theta:
            idcg += (2 ** test_matrix[u, i] - 1) / math.log2(pos + 2)
    
    return idcg

In [None]:
def get_user_dcg(u, recommendations):
    dcg = 0
    
    if recommendations is None:
        return 0
    
    if not isinstance(recommendations, list):
        recommendations = [recommendations]
    
    for pos, i in enumerate(recommendations):
        if i is not None and test_matrix[u, i] != 0:
            dcg += (2 ** test_matrix[u, i] - 1) / math.log2(pos + 2)
    
    return dcg

In [None]:
def get_user_ndcg(u, predictions):
    recommendations = get_recommendations([predictions[u]])[0]
    
    dcg = get_user_dcg(u, recommendations)
    idcg = get_user_idcg(u)
    
    if idcg == 0:
        return None
    else:
        return dcg / idcg

In [None]:
def get_ndcg(predictions):
    ndcg = 0
    count = 0
    
    for u in range(NUM_USERS):
        if has_test_ratings(u):
            user_ndcg = get_user_ndcg(u, predictions)
            
            if user_ndcg is not None:
                ndcg += user_ndcg
                count += 1
    
    if count > 0:
        return ndcg / count
    else:
        return None

In [None]:
ndcg_avg_corr = get_ndcg(avg_predictions_corr)
ndcg_avg_jmsd = get_ndcg(avg_predictions_jmsd)
ndcg_wavg_corr = get_ndcg(wavg_predictions_corr)
ndcg_wavg_jmsd = get_ndcg(wavg_predictions_jmsd)
ndcg_dfm_corr = get_ndcg(dfm_predictions_corr)
ndcg_dfm_jmsd = get_ndcg(dfm_predictions_jmsd)

print(f"nDCG Avg Prediction with Corr Similarity = " + str(ndcg_avg_corr))
print(f"nDCG Avg Prediction with JMSD Similarity = " + str(ndcg_avg_jmsd))
print(f"nDCG Weighted Avg Prediction with Corr Similarity = " + str(ndcg_wavg_corr))
print(f"nDCG Weighted Avg Prediction with JMSD Similarity = " + str(ndcg_wavg_jmsd))
print(f"nDCG Dev from Mean Prediction with Corr Similarity = " + str(ndcg_dfm_corr))
print(f"nDCG Dev from Mean Prediction with JMSD Similarity = " + str(ndcg_dfm_jmsd))

nDCG Avg Prediction with Corr Similarity = 0.37926206511411514
nDCG Avg Prediction with JMSD Similarity = 0.3843460771291534
nDCG Weighted Avg Prediction with Corr Similarity = 0.381210272924732
nDCG Weighted Avg Prediction with JMSD Similarity = 0.38245378689367787
nDCG Dev from Mean Prediction with Corr Similarity = 0.377133062693343
nDCG Dev from Mean Prediction with JMSD Similarity = 0.3782215877148551
