# 1. Cargar datos

#### Cargamos las librerías que usaremos

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import math

#### Cargamos el dataset

In [73]:
# Labels applied to the CSV columns. Because `names` is passed without
# `header`, pandas treats every line of the file (including the first) as data.
column_names = ['item_id', 'user_id', 'rating', 'timestamp']

file_path = "./Video_Games.csv"

# Read at most the first 500000 rows.
df = pd.read_csv(file_path, names=column_names, nrows=500000)
# Interpret the raw values as seconds since the Unix epoch, then expose the
# parsed column under the name `date`.
df = df.assign(timestamp=pd.to_datetime(df['timestamp'], unit='s'))
df = df.rename(columns={'timestamp': 'date'})

df.head()

Unnamed: 0,item_id,user_id,rating,date
0,439381673,A21ROB4YDOZA5P,1.0,2014-06-09
1,439381673,A3TNZ2Q5E7HTHD,3.0,2014-05-10
2,439381673,A1OKRM3QFEATQO,4.0,2014-02-07
3,439381673,A2XO1JFCNEYV3T,1.0,2014-02-07
4,439381673,A19WLPIRHD15TH,4.0,2014-01-16


In [74]:
# Inspect the (rows, columns) dimensions of the loaded frame.
df.shape

(500000, 4)

#### Dividimos en train y test por fecha

In [75]:
def temporal_split(data, test_ratio=0.2):
    """Split *data* chronologically into train and test partitions.

    Rows are ordered by the ``date`` column; the oldest rows form the training
    set and the newest ``test_ratio`` fraction forms the test set.

    Args:
        data: DataFrame with a ``date`` column.
        test_ratio: Fraction of rows, between 0 and 1, assigned to the test set.

    Returns:
        A ``(train, test)`` tuple of DataFrames, both sorted by ``date``.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A stable sort keeps rows that share a date in their input order, so the
    # position of the cut among tied rows does not depend on the sort algorithm.
    data = data.sort_values('date', kind='mergesort')

    # Round before truncating: 1 - test_ratio is not always exactly
    # representable (1 - 0.9 == 0.09999999999999998), and truncating the raw
    # product would then silently drop one training row.
    split = int(round(len(data) * (1 - test_ratio), 9))
    return data.iloc[:split], data.iloc[split:]


In [100]:
# Work on a random subsample of at most 100000 ratings. The seed is fixed so
# the sample, and every index and result derived from it, is the same on each
# run; the size is capped so a smaller input frame does not make sample() fail.
xd = df.sample(min(100000, len(df)), random_state=42)
x_train, x_test = temporal_split(xd)

#### NUM_USERS, NUM_ITEMS, MIN_RATING, MAX_RATING

# 2. KNN

#### Media

In [101]:
# 1. Build index mappings for users and items
# Each distinct id found in the sample is paired with its position in the
# array returned by unique().
user_ids = xd['user_id'].unique()
item_ids = xd['item_id'].unique()
user2idx = dict(zip(user_ids, range(len(user_ids))))
item2idx = dict(zip(item_ids, range(len(item_ids))))

# Matrix dimensions and the rating bounds observed in the sample.
NUM_USERS, NUM_ITEMS = len(user_ids), len(item_ids)
MIN_RATING, MAX_RATING = xd['rating'].min(), xd['rating'].max()



In [102]:
# 2. Build the ratings matrix (training set)
# Dense NUM_USERS x NUM_ITEMS table; None marks a (user, item) pair with no
# training rating.
ratings = [[None] * NUM_ITEMS for _ in range(NUM_USERS)]
# Walk the three needed columns in lockstep rather than using iterrows(),
# which constructs a Series object for every row.
for user_id, item_id, rating in zip(x_train['user_id'], x_train['item_id'], x_train['rating']):
    ratings[user2idx[user_id]][item2idx[item_id]] = rating



In [103]:
# 3. Build the test ratings matrix
# Same layout as the training matrix; None marks a pair with no test rating.
test_ratings = [[None] * NUM_ITEMS for _ in range(NUM_USERS)]
# Walk the three needed columns in lockstep rather than using iterrows(),
# which constructs a Series object for every row.
for user_id, item_id, rating in zip(x_test['user_id'], x_test['item_id'], x_test['rating']):
    u = user2idx.get(user_id)
    i = item2idx.get(item_id)
    # Skip any rating whose user or item has no index.
    if u is not None and i is not None:
        test_ratings[u][i] = rating

In [104]:
def rating_average(u):
    """Return the mean of the ratings stored for user *u*.

    Reads the module-level ``ratings`` matrix over item indices
    ``0 .. NUM_ITEMS - 1`` and ignores ``None`` entries. Returns ``None`` when
    the user has no rating stored.
    """
    given = [ratings[u][i] for i in range(NUM_ITEMS) if ratings[u][i] is not None]
    if not given:
        return None
    return sum(given) / len(given)

### Cálculo de la similitud

In [109]:
def correlation_similarity(u, v):
    """Correlation-based similarity between users *u* and *v*.

    Only items rated by both users contribute to the sums, while each user's
    mean comes from ``rating_average`` (all of that user's stored ratings).

    Returns:
        The correlation value, or ``None`` when the users share no rated item
        or either user's squared deviations over the shared items sum to zero.
    """
    mean_u = rating_average(u)
    mean_v = rating_average(v)

    cross = 0
    sq_u = 0
    sq_v = 0
    shared = 0

    for i in range(NUM_ITEMS):
        # Items missing for either user carry no information about the pair.
        if ratings[u][i] is None or ratings[v][i] is None:
            continue

        dev_u = ratings[u][i] - mean_u
        dev_v = ratings[v][i] - mean_v

        cross += dev_u * dev_v
        sq_u += dev_u * dev_u
        sq_v += dev_v * dev_v
        shared += 1

    # A zero denominator would make the ratio undefined.
    if shared == 0 or sq_u == 0 or sq_v == 0:
        return None
    return cross / math.sqrt(sq_u * sq_v)

In [83]:
# Smoke test: correlation similarity for two fixed user pairs
for pair in ((113, 534), (92, 355)):
    print(correlation_similarity(*pair))

None
None


In [111]:
def jmsd_similarity(u, v):
    """JMSD similarity, ``Jaccard * (1 - MSD)``, between users *u* and *v*.

    Jaccard is the number of items rated by both users divided by the number
    rated by at least one of them. MSD is the mean squared difference of their
    ratings on the shared items, after scaling each rating to ``[0, 1]`` with
    the module-level ``MIN_RATING`` and ``MAX_RATING``.

    Returns:
        The similarity value, or ``None`` when the users share no rated item.
    """
    # Width of the rating scale. When it is zero every rating is identical, so
    # all differences are zero and the normalisation (a division by zero) is
    # skipped.
    span = MAX_RATING - MIN_RATING

    union = 0
    intersection = 0
    diff = 0

    for i in range(NUM_ITEMS):
        raw_u = ratings[u][i]
        raw_v = ratings[v][i]

        if raw_u is not None and raw_v is not None:
            if span:
                r_u = (raw_u - MIN_RATING) / span
                r_v = (raw_v - MIN_RATING) / span
                # Accumulate over every shared item: MSD is a mean, so the sum
                # must cover all of them, not just the last one visited.
                diff += (r_u - r_v) * (r_u - r_v)

            intersection += 1
            union += 1

        elif raw_u is not None or raw_v is not None:
            union += 1

    if intersection == 0:
        return None

    jaccard = intersection / union
    msd = diff / intersection
    return jaccard * (1 - msd)

In [85]:
# Smoke test: JMSD similarity for three fixed user pairs
for pair in ((5, 940), (113, 534), (92, 355)):
    print(jmsd_similarity(*pair))

None
None
None


### Búsqueda de los k vecinos

In [86]:
def get_neighbors(u, similarities, k=25):
    """Return the indices of the *k* users with the highest similarity.

    Args:
        u: Index of the active user. It is not read here; exclusion of the
            active user relies on ``similarities[u]`` being ``None``.
        similarities: Sequence whose position ``v`` holds the similarity with
            user ``v``, or ``None`` when it is undefined.
        k: Number of neighbor slots to fill.

    Returns:
        A list of length *k* (empty when *k* is not positive). It holds user
        indices in descending similarity order; equal similarities keep
        ascending index order. Only strictly positive similarities qualify,
        and unfilled slots are ``None``.
    """
    k = max(k, 0)

    candidates = [
        v for v, sim in enumerate(similarities)
        if sim is not None and sim > 0
    ]
    # list.sort() is stable even with reverse=True, so users with equal
    # similarity stay in ascending index order.
    candidates.sort(key=lambda v: similarities[v], reverse=True)

    top = candidates[:k]
    return top + [None] * (k - len(top))

In [112]:
# Smoke test: similarities and neighbors of one fixed user.
# The user's own slot is set to None so it is never picked as its own neighbor.
u = 112
similarities_corr = [correlation_similarity(u, v) if v != u else None for v in range(NUM_USERS)]
similarities_jmsd = [jmsd_similarity(u, v) if v != u else None for v in range(NUM_USERS)]
neighbors_corr = get_neighbors(u, similarities_corr)
neighbors_jmsd = get_neighbors(u, similarities_jmsd)
print(neighbors_corr, neighbors_jmsd, sep="\n")

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


### Estimación de las predicciones

In [88]:
def average_prediction(u, i, neighbors):
    """Predict the rating of item *i* as the plain mean of neighbor ratings.

    Args:
        u: Index of the active user (not read by this function).
        i: Index of the item to predict.
        neighbors: Neighbor indices; iteration stops at the first ``None``.

    Returns:
        The mean of the neighbors' stored ratings for item *i*, or ``None``
        when no neighbor has rated it.
    """
    votes = []
    for n in neighbors:
        # The first None ends the scan; later entries are not examined.
        if n is None:
            break
        vote = ratings[n][i]
        if vote is not None:
            votes.append(vote)

    if not votes:
        return None
    return sum(votes) / len(votes)


In [113]:
# Smoke test: plain-average prediction for one fixed item
i = 324
avgpred_corr = average_prediction(u, i, neighbors_corr)
avgpred_jmsd = average_prediction(u, i, neighbors_jmsd)
print(avgpred_corr, avgpred_jmsd, sep="\n")

None
None


In [114]:
def weighted_average_prediction(u, i, neighbors, similarities):
    """Predict the rating of item *i* as a similarity-weighted mean.

    Args:
        u: Index of the active user (not read by this function).
        i: Index of the item to predict.
        neighbors: Neighbor indices; iteration stops at the first ``None``.
        similarities: Sequence indexed by user, giving each neighbor's weight.

    Returns:
        The weighted mean of the neighbors' stored ratings for item *i*, or
        ``None`` when the accumulated weight is not positive.
    """
    weighted_sum = 0
    weight_total = 0

    for n in neighbors:
        # The first None ends the scan; later entries are not examined.
        if n is None:
            break
        vote = ratings[n][i]
        if vote is None:
            continue
        weight = similarities[n]
        weighted_sum += weight * vote
        weight_total += weight

    return weighted_sum / weight_total if weight_total > 0 else None

In [115]:
# Smoke test: similarity-weighted prediction with both similarity measures
wavgpred_corr = weighted_average_prediction(u, i, neighbors_corr, similarities_corr)
wavgpred_jmsd = weighted_average_prediction(u, i, neighbors_jmsd, similarities_jmsd)
print(wavgpred_corr, wavgpred_jmsd, sep="\n")

None
None


In [116]:
def deviation_from_mean_prediction(u, i, neighbors):
    """Predict the rating of item *i* as user *u*'s mean plus a neighbor offset.

    The offset is the mean, over neighbors that rated item *i*, of each
    neighbor's rating minus that neighbor's own average rating.

    Args:
        u: Index of the active user.
        i: Index of the item to predict.
        neighbors: Neighbor indices; iteration stops at the first ``None``.

    Returns:
        The predicted rating, or ``None`` when no neighbor has rated item *i*
        or user *u* has no stored rating to average.
    """
    deviations = []
    for n in neighbors:
        # The first None ends the scan; later entries are not examined.
        if n is None:
            break
        vote = ratings[n][i]
        if vote is not None:
            # n has at least this one rating, so its average is defined.
            deviations.append(vote - rating_average(n))

    if not deviations:
        return None

    avg_u = rating_average(u)
    if avg_u is None:
        # Without any rating from u there is no baseline to offset; adding
        # None to a number would raise TypeError.
        return None

    return avg_u + sum(deviations) / len(deviations)

In [117]:
# Smoke test: deviation-from-mean prediction with both neighbor lists
dfmpred_corr = deviation_from_mean_prediction(u, i, neighbors_corr)
dfmpred_jmsd = deviation_from_mean_prediction(u, i, neighbors_jmsd)
print(dfmpred_corr, dfmpred_jmsd, sep="\n")

None
None


### Cálculo de las recomendaciones

### Cálculo del resto de medidas