### Modelo de recomendación

In [81]:
# Environment setup: upgrade setuptools using the %pip magic.
# NOTE(review): no version is pinned, so the installed version depends on when this cell is run.
%pip install --upgrade setuptools




In [82]:
import pandas as pd
import numpy as np

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

import pyarrow as pa
import pyarrow.parquet as pq

In [83]:
# Load the ratings table used to build the recommender
# (columns shown in the rendered output: user_id, item_name, rating, item_id).
ruta_modelo = 'data_clean/10-df4_model.parquet'
modelo = pd.read_parquet(ruta_modelo)

In [84]:
# Display the loaded ratings table (the rendered output is truncated by pandas).
modelo

Unnamed: 0,user_id,item_name,rating,item_id
0,76561197970982479,Killing Floor,3,1250
1,evcentric,Risk of Rain,5,248820
2,doctr,The Wolf Among Us,5,250320
3,maplemage,Dark Souls: Prepare to Die Edition,3,211420
4,Wackky,LEGO® MARVEL Super Heroes,1,249130
...,...,...,...,...
44087,76561198107177722,BattleBlock Theater,5,238460
44088,kushikushigani,LEGO® Worlds,3,332310
44089,76561198111410893,Unturned,3,304930
44090,zaza147,Fistful of Frags,5,265630


* Se crea una tabla dinámica (pivot table), que sirve para reorganizar los datos.
* En este caso se reorganizan los datos para que los `item_id` sean las columnas y `user_id` sea el índice; cada celda contiene el rating que el usuario dio al juego.

In [85]:
# User x item rating matrix: one row per user_id, one column per item_id.
# Repeated (user_id, item_id) pairs are averaged (pivot_table's default aggregation);
# combinations without a rating are left as NaN.
piv = pd.pivot_table(
    modelo,
    values='rating',
    index='user_id',
    columns='item_id',
    aggfunc='mean',
)
piv

item_id,10,20,30,40,50,60,70,80,130,220,...,510050,512300,512470,514520,516040,521340,521430,521570,521990,527340
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,,,,,,,,,,,...,,,,,,,,,,
--ace--,,,,,,,,,,,...,,,,,,,,,,
--ionex--,,,,,,,,,,,...,,,,,,,,,,
-2SV-vuLB-Kg,,,,,,,,,,,...,,,,,,,,,,
-Azsael-,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzuga2003,,,,,,,,,,,...,,,,,,,,,,
zv_odd,,,,,,,,,,,...,,,,,,,,,,
zvanik,,,,,,,,,,,...,,,,,,,,,,
zwanzigdrei,,,,,,,,,,,...,,,,,,,,,,


In [86]:
# Per-user normalization of 'piv': centre each user's ratings on that user's mean
# and scale by that user's rating range (max - min). The arithmetic is vectorized
# over the whole frame instead of applying a Python lambda to every row.
media_usuario = piv.mean(axis=1)
rango_usuario = piv.max(axis=1) - piv.min(axis=1)
piv_centrado = piv.sub(media_usuario, axis=0).div(rango_usuario, axis=0)

# Missing cells become 0. This covers unrated games and also users whose ratings
# are all equal (zero range, so 0/0 -> NaN). The matrix is then transposed so that
# item_id is the index and user_id the columns.
piv_norm = piv_centrado.fillna(0).T

# Drop the users (columns) whose normalized ratings are all zero.
# NOTE(review): only user columns are filtered here; item rows that are entirely
# zero are kept in the matrix.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm

user_id,--000--,-Beave-,-I_AM_EPIC-,-SEVEN-,-Thyme-,-kainey9777,00000000000000000001227,00690069006900,03092002,04061993,...,zomgCoBfAce,zoom-the-flash,zoozles,zourock,zrustz16,zsharoarkbr,zuzuga2003,zvanik,zwanzigdrei,zzoptimuszz
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Convierto la matriz normalizada a una matriz dispersa: esto reduce el uso de memoria y mejora la eficiencia de los cálculos.

In [87]:
# Store the normalized item x user matrix in Compressed Sparse Row format.
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())
piv_sparse

<2814x6832 sparse matrix of type '<class 'numpy.float64'>'
	with 24043 stored elements in Compressed Sparse Row format>

In [88]:
# Check how much storage the sparse matrix uses, in bytes.
# Note: this measures only the `data` array (the stored values); the `indices`
# and `indptr` arrays of the CSR matrix are not included in this figure.
piv_sparse.data.nbytes

192344

Creo dos matrices de similitud utilizando la similitud del coseno para medir la similitud entre los juegos (item_similarity) y entre los usuarios (user_similarity).

La similitud del coseno es una medida comúnmente utilizada para evaluar la similitud entre dos vectores en un espacio multidimensional. En el contexto de sistemas de recomendación y análisis de datos, la similitud del coseno se utiliza para determinar cuán similares son dos conjuntos de datos o elementos, y se calcula utilizando el coseno del ángulo entre los vectores que representan esos datos o elementos.

In [89]:
# cosine_similarity compares the rows of the matrix it receives:
# rows of piv_sparse are games (item_id), rows of its transpose are users (user_id).
matriz_juegos = piv_sparse
matriz_usuarios = piv_sparse.transpose()

item_similarity = cosine_similarity(matriz_juegos)
user_similarity = cosine_similarity(matriz_usuarios)

In [90]:
# Inspect the raw item-item cosine similarity array.
item_similarity

array([[ 1.        , -0.07192989, -0.1206293 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.07192989,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1206293 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [91]:
# Inspect the raw user-user cosine similarity array.
user_similarity

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        , -0.20412415, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.20412415,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        , -0.35355339],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.35355339,  1.        ]])

Convierto las matrices de similitud creadas a DataFrames, etiquetando filas y columnas con los `item_id` y `user_id` correspondientes.

In [92]:
# Label the similarity arrays with the ids they refer to.
ids_juegos = piv_norm.index       # item_id labels (rows of piv_norm)
ids_usuarios = piv_norm.columns   # user_id labels (columns of piv_norm)

# Item-item similarity DataFrame
similar_item = pd.DataFrame(data=item_similarity, index=ids_juegos, columns=ids_juegos)
# User-user similarity DataFrame
similar_user = pd.DataFrame(data=user_similarity, index=ids_usuarios, columns=ids_usuarios)

In [93]:
# Display the item-item similarity DataFrame (index and columns are item_id).
similar_item

item_id,10,20,30,40,50,60,70,80,130,220,...,510050,512300,512470,514520,516040,521340,521430,521570,521990,527340
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.000000,-0.07193,-0.120629,0.0,0.0,0.000000,-0.034126,0.086515,0.000000,-0.041449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,-0.071930,1.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.070829,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,-0.120629,0.00000,1.000000,0.0,0.0,-0.295008,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.000000,0.00000,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.000000,0.00000,0.000000,0.0,1.0,0.000000,-0.050072,0.000000,0.000000,-0.028620,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521340,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
521430,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521570,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
521990,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Display the user-user similarity DataFrame (index and columns are user_id).
similar_user


user_id,--000--,-Beave-,-I_AM_EPIC-,-SEVEN-,-Thyme-,-kainey9777,00000000000000000001227,00690069006900,03092002,04061993,...,zomgCoBfAce,zoom-the-flash,zoozles,zourock,zrustz16,zsharoarkbr,zuzuga2003,zvanik,zwanzigdrei,zzoptimuszz
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
-Beave-,0.0,1.000000,-0.204124,0.105409,0.000000,0.000000,0.0,0.0,-0.111359,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
-I_AM_EPIC-,0.0,-0.204124,1.000000,0.000000,0.000000,0.000000,0.0,0.0,0.272772,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
-SEVEN-,0.0,0.105409,0.000000,1.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
-Thyme-,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.0,-0.272772,0.0,...,0.288675,0.500000,0.0,0.500000,0.0,0.0,0.288675,0.0,0.500000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zsharoarkbr,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.000000,0.0,0.000000,0.000000
zuzuga2003,0.0,0.000000,0.000000,0.000000,0.288675,0.000000,0.0,0.0,-0.157485,0.0,...,0.000000,0.288675,0.0,0.288675,0.0,0.0,1.000000,0.0,0.288675,0.000000
zvanik,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,1.0,0.000000,0.000000
zwanzigdrei,0.0,0.000000,0.000000,0.000000,0.500000,-0.500000,0.0,0.0,-0.272772,0.0,...,0.288675,0.500000,0.0,0.500000,0.0,0.0,0.288675,0.0,1.000000,-0.353553


Guardo las matrices en formato parquet para que sean consumidas por la API.

In [95]:
# Persist the normalized matrix and both similarity DataFrames as parquet files,
# written in the same order as listed here.
salidas = (
    (piv_norm, 'data_clean/11-piv_norm.parquet'),
    (similar_user, 'data_clean/12-similar_user.parquet'),
    (similar_item, 'data_clean/13-similar_item.parquet'),
)
for tabla, ruta in salidas:
    pq.write_table(pa.Table.from_pandas(tabla), ruta)


In [96]:
# Display the item-item similarity DataFrame again, just before defining the recommender that reads it.
similar_item

item_id,10,20,30,40,50,60,70,80,130,220,...,510050,512300,512470,514520,516040,521340,521430,521570,521990,527340
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.000000,-0.07193,-0.120629,0.0,0.0,0.000000,-0.034126,0.086515,0.000000,-0.041449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,-0.071930,1.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.070829,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,-0.120629,0.00000,1.000000,0.0,0.0,-0.295008,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.000000,0.00000,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.000000,0.00000,0.000000,0.0,1.0,0.000000,-0.050072,0.000000,0.000000,-0.028620,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521340,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
521430,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521570,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
521990,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
def recomendacionJuego(item_id, n=5, similitudes=None):
  '''
  Return the games most similar to a given item_id.

  Parameters:
  ----------
  item_id: Identifier of the game for which similar games are wanted. It must
      be a label of the similarity matrix (index and columns).
  n: Maximum number of similar games to return (default 5).
  similitudes: Optional square item-item similarity DataFrame whose index and
      columns are item_ids. When omitted, the notebook-level `similar_item`
      DataFrame is used.

  Returns:
  ----------
  id_similares: List with up to `n` item_ids ordered from most to least
      similar, never containing `item_id` itself. If `item_id` is not in the
      similarity matrix, a message is printed and None is returned.

  Steps:
  ----------
  1. Check that the game is present in the similarity DataFrame.
  2. Take the similarity column of the game and remove the game itself.
  3. Return the ids with the `n` highest similarity scores.
  '''
  # Only fall back to the global matrix when the caller did not supply one.
  matriz = similar_item if similitudes is None else similitudes

  if item_id not in matriz.index or item_id not in matriz.columns:
      print(f'No se encontraron juegos similares para {item_id}.')
      return

  # Remove the queried game by label instead of skipping the first sorted row:
  # the game is not guaranteed to sort first (another game can tie at 1.0, and a
  # game with an all-zero vector has self-similarity 0), in which case slicing
  # [1:6] would return the game itself and discard a real neighbour.
  puntajes = matriz[item_id].drop(labels=item_id)

  # nlargest keeps the first occurrence on ties, so the result is deterministic.
  id_similares = puntajes.nlargest(max(n, 0)).index.tolist()

  return id_similares

In [98]:
# Example: ask for the games most similar to the game whose item_id is 10.
item_id = 10 
recomendacionJuego(item_id)

[371570, 80, 41700, 238320, 393410]