In [619]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform


In [620]:
# Load the long-format (title, tag) table from parquet.
# index=False is an extra keyword that pandas forwards to the fastparquet engine.
# NOTE(review): presumably this is what materialises the 'index' column dropped in the
# next cell — confirm against the fastparquet documentation.
df_recomendacion = pd.read_parquet('df_recomendacion.parquet', engine='fastparquet', index=False)

In [621]:
# Remove the leftover 'index' column.
# Reassign instead of mutating in place so the cell is idempotent: with inplace=True a
# second run raised KeyError because the column was already gone. errors='ignore' makes
# the re-run a no-op.
df_recomendacion = df_recomendacion.drop(columns=['index'], errors='ignore')

In [622]:
# The model uses the title and the tags because the tags are considered to describe a
# game's characteristics better than the genre does.
df_recomendacion

Unnamed: 0,title,tags
0,Half-Life,FPS
1,Half-Life,Classic
2,Half-Life,Action
3,Half-Life,Sci-fi
4,Half-Life,Singleplayer
...,...,...
14625,Counter-Strike: Condition Zero,Survival
14626,Counter-Strike: Condition Zero,Atmospheric
14627,Counter-Strike: Condition Zero,Dark
14628,Counter-Strike: Condition Zero,Simulation


In [623]:
# Pivot the long (title, tag) table into a title x tag matrix with pandas' crosstab.
# crosstab counts occurrences, so a repeated (title, tag) row would produce a value > 1.
# The Jaccard coefficient is defined on sets, and the binary jaccard_score used below
# needs 0/1 vectors, so clip every cell to a strict presence flag.
tabla_cross = pd.crosstab(df_recomendacion['title'], df_recomendacion['tags']).clip(upper=1)
tabla_cross

tags,1980s,1990's,2.5D,2D,2D Fighter,3D Platformer,3D Vision,4 Player Local,4X,Abstract,...,Wargame,Warhammer 40K,Web Publishing,Werewolves,Western,World War I,World War II,Wrestling,Zombies,e-sports
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
140,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1931: Scheherazade at the Library of Pergamum,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1943 Megami Strike,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
openCanvas 6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
rFactor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sZone-Online,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
theHunter Classic,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [624]:
# Sanity check: print the Jaccard coefficient between the tag vectors of two games
juego1, juego2 = (tabla_cross.loc[nombre] for nombre in ('140', 'rFactor'))
print(jaccard_score(y_true=juego1, y_pred=juego2))

0.0


In [625]:
# Jaccard distance between every pair of rows (games) of the title x tag matrix.
# pdist returns the distances as a flat 1-D array (see the output below).
dist_jaccard = pdist(tabla_cross.values, metric='jaccard')
dist_jaccard

array([0.81481481, 0.8       , 0.86363636, ..., 0.57142857, 0.57142857,
       0.51851852])

In [626]:
# Expand the flat pdist result into a square, symmetric distance matrix
# (zeros on the diagonal, as the output below shows)
matriz_cuadrada_dist = squareform(dist_jaccard)
matriz_cuadrada_dist

array([[0.        , 0.81481481, 0.8       , ..., 0.97368421, 0.88571429,
        0.88571429],
       [0.81481481, 0.        , 0.90909091, ..., 0.96875   , 0.93548387,
        0.93548387],
       [0.8       , 0.90909091, 0.        , ..., 0.96666667, 0.89285714,
        0.93103448],
       ...,
       [0.97368421, 0.96875   , 0.96666667, ..., 0.        , 0.57142857,
        0.57142857],
       [0.88571429, 0.93548387, 0.89285714, ..., 0.57142857, 0.        ,
        0.51851852],
       [0.88571429, 0.93548387, 0.93103448, ..., 0.57142857, 0.51851852,
        0.        ]])

In [627]:
# Subtract the distances from 1 to obtain the similarity coefficient
# (1.0 on the diagonal: every game is identical to itself)
coef_similitud_jaccard = 1- matriz_cuadrada_dist
coef_similitud_jaccard

array([[1.        , 0.18518519, 0.2       , ..., 0.02631579, 0.11428571,
        0.11428571],
       [0.18518519, 1.        , 0.09090909, ..., 0.03125   , 0.06451613,
        0.06451613],
       [0.2       , 0.09090909, 1.        , ..., 0.03333333, 0.10714286,
        0.06896552],
       ...,
       [0.02631579, 0.03125   , 0.03333333, ..., 1.        , 0.42857143,
        0.42857143],
       [0.11428571, 0.06451613, 0.10714286, ..., 0.42857143, 1.        ,
        0.48148148],
       [0.11428571, 0.06451613, 0.06896552, ..., 0.42857143, 0.48148148,
        1.        ]])

In [628]:
# Wrap the similarity matrix in a DataFrame labelled by game title on both axes,
# so similarities can be looked up by name
df_jaccard = pd.DataFrame(coef_similitud_jaccard, index=tabla_cross.index, columns=tabla_cross.index)
df_jaccard

title,100% Orange Juice,140,1931: Scheherazade at the Library of Pergamum,1943 Megami Strike,199X,3 Stars of Destiny,3SwitcheD,7 Wonders: Ancient Alien Makeover,"7,62 High Calibre",8-Bit Commando,...,Xenonauts,Ys Origin,Zombie Driver HD,Zuma's Revenge!,eden*,openCanvas 6,rFactor,sZone-Online,theHunter Classic,theHunter: Primal
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.000000,0.185185,0.200000,0.136364,0.090909,0.192308,0.142857,0.095238,0.086957,0.153846,...,0.161290,0.166667,0.100000,0.090909,0.230769,0.045455,0.043478,0.026316,0.114286,0.114286
140,0.185185,1.000000,0.090909,0.117647,0.125000,0.136364,0.200000,0.062500,0.055556,0.333333,...,0.111111,0.160000,0.125000,0.200000,0.130435,0.000000,0.000000,0.031250,0.064516,0.064516
1931: Scheherazade at the Library of Pergamum,0.200000,0.090909,1.000000,0.214286,0.142857,0.277778,0.230769,0.071429,0.214286,0.100000,...,0.166667,0.173913,0.086957,0.142857,0.263158,0.071429,0.066667,0.033333,0.107143,0.068966
1943 Megami Strike,0.136364,0.117647,0.214286,1.000000,0.100000,0.125000,0.222222,0.111111,0.000000,0.214286,...,0.045455,0.100000,0.111111,0.222222,0.055556,0.111111,0.000000,0.040000,0.083333,0.040000
199X,0.090909,0.125000,0.142857,0.100000,1.000000,0.307692,0.111111,0.000000,0.000000,0.066667,...,0.047619,0.105263,0.117647,0.000000,0.125000,0.000000,0.000000,0.041667,0.041667,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
openCanvas 6,0.045455,0.000000,0.071429,0.111111,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.052632,0.000000,0.000000,0.062500,1.000000,0.000000,0.000000,0.000000,0.000000
rFactor,0.043478,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,...,0.047619,0.000000,0.117647,0.000000,0.000000,0.000000,1.000000,0.041667,0.136364,0.086957
sZone-Online,0.026316,0.031250,0.033333,0.040000,0.041667,0.066667,0.000000,0.000000,0.040000,0.068966,...,0.027778,0.090909,0.307692,0.041667,0.031250,0.000000,0.041667,1.000000,0.428571,0.428571
theHunter Classic,0.114286,0.064516,0.107143,0.083333,0.041667,0.066667,0.086957,0.090909,0.130435,0.107143,...,0.088235,0.090909,0.259259,0.136364,0.064516,0.000000,0.136364,0.428571,1.000000,0.481481


In [629]:
# Persist the title x title similarity matrix as a snappy-compressed parquet file
df_jaccard.to_parquet('df_jaccard.parquet', engine='auto', compression='snappy')

In [630]:
# Load the (title, tag) table from CSV; recomendacion_juego searches this frame by title.
# NOTE(review): no visible cell writes 'df_recomendacion.csv' — confirm where it is
# produced, otherwise a fresh "Restart & Run All" fails here.
df_recomendacion_csv = pd.read_csv('df_recomendacion.csv')
df_recomendacion_csv

Unnamed: 0,title,tags
0,Half-Life,FPS
1,Half-Life,Classic
2,Half-Life,Action
3,Half-Life,Sci-fi
4,Half-Life,Singleplayer
...,...,...
14625,Counter-Strike: Condition Zero,Survival
14626,Counter-Strike: Condition Zero,Atmospheric
14627,Counter-Strike: Condition Zero,Dark
14628,Counter-Strike: Condition Zero,Simulation


In [631]:
# Load the similarity matrix from CSV, using the 'title' column as the row index;
# recomendacion_juego reads its recommendations from this frame.
# NOTE(review): no visible cell writes 'df_jaccard.csv' (only the parquet file is
# saved above) — confirm where it is produced.
jaccard_csv=pd.read_csv('df_jaccard.csv', index_col='title')
jaccard_csv

Unnamed: 0_level_0,100% Orange Juice,140,1931: Scheherazade at the Library of Pergamum,1943 Megami Strike,199X,3 Stars of Destiny,3SwitcheD,7 Wonders: Ancient Alien Makeover,"7,62 High Calibre",8-Bit Commando,...,Xenonauts,Ys Origin,Zombie Driver HD,Zuma's Revenge!,eden*,openCanvas 6,rFactor,sZone-Online,theHunter Classic,theHunter: Primal
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.000000,0.185185,0.200000,0.136364,0.090909,0.192308,0.142857,0.095238,0.086957,0.153846,...,0.161290,0.166667,0.100000,0.090909,0.230769,0.045455,0.043478,0.026316,0.114286,0.114286
140,0.185185,1.000000,0.090909,0.117647,0.125000,0.136364,0.200000,0.062500,0.055556,0.333333,...,0.111111,0.160000,0.125000,0.200000,0.130435,0.000000,0.000000,0.031250,0.064516,0.064516
1931: Scheherazade at the Library of Pergamum,0.200000,0.090909,1.000000,0.214286,0.142857,0.277778,0.230769,0.071429,0.214286,0.100000,...,0.166667,0.173913,0.086957,0.142857,0.263158,0.071429,0.066667,0.033333,0.107143,0.068966
1943 Megami Strike,0.136364,0.117647,0.214286,1.000000,0.100000,0.125000,0.222222,0.111111,0.000000,0.214286,...,0.045455,0.100000,0.111111,0.222222,0.055556,0.111111,0.000000,0.040000,0.083333,0.040000
199X,0.090909,0.125000,0.142857,0.100000,1.000000,0.307692,0.111111,0.000000,0.000000,0.066667,...,0.047619,0.105263,0.117647,0.000000,0.125000,0.000000,0.000000,0.041667,0.041667,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
openCanvas 6,0.045455,0.000000,0.071429,0.111111,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.052632,0.000000,0.000000,0.062500,1.000000,0.000000,0.000000,0.000000,0.000000
rFactor,0.043478,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,...,0.047619,0.000000,0.117647,0.000000,0.000000,0.000000,1.000000,0.041667,0.136364,0.086957
sZone-Online,0.026316,0.031250,0.033333,0.040000,0.041667,0.066667,0.000000,0.000000,0.040000,0.068966,...,0.027778,0.090909,0.307692,0.041667,0.031250,0.000000,0.041667,1.000000,0.428571,0.428571
theHunter Classic,0.114286,0.064516,0.107143,0.083333,0.041667,0.066667,0.086957,0.090909,0.130435,0.107143,...,0.088235,0.090909,0.259259,0.136364,0.064516,0.000000,0.136364,0.428571,1.000000,0.481481


In [632]:
# Try the model: similarity of every game to one reference title, highest first,
# shown as a one-column DataFrame indexed by title
recomendaciones = (
    df_jaccard['Counter-Strike: Condition Zero']
    .sort_values(ascending=False)
    .to_frame()
)
recomendaciones


Unnamed: 0_level_0,Counter-Strike: Condition Zero
title,Unnamed: 1_level_1
Counter-Strike: Condition Zero,1.000000
Battlefield: Bad Company™ 2,0.600000
Insurgency,0.600000
Counter-Strike: Source,0.538462
Left 4 Dead,0.538462
...,...
openCanvas 6,0.000000
EasyAntiCheat eSports,0.000000
Elastrix,0.000000
Putt-Putt® and Fatty Bear's Activity Pack,0.000000


In [633]:
# Disabled alternative: reload both frames from parquet instead of reusing the
# in-memory ones. Kept commented out; only the in-memory df_jaccard is displayed.
#df_recomendacion = pd.read_parquet('df_recomendacion.parquet', engine='fastparquet', index=False)
#df_recomendacion.drop(['index'], axis=1, inplace=True)
#df_jaccard = pd.read_parquet('df_jaccard.parquet', engine='fastparquet', index_col='title')
#df_jaccard.set_index('title', drop=True, inplace=True)
df_jaccard


title,100% Orange Juice,140,1931: Scheherazade at the Library of Pergamum,1943 Megami Strike,199X,3 Stars of Destiny,3SwitcheD,7 Wonders: Ancient Alien Makeover,"7,62 High Calibre",8-Bit Commando,...,Xenonauts,Ys Origin,Zombie Driver HD,Zuma's Revenge!,eden*,openCanvas 6,rFactor,sZone-Online,theHunter Classic,theHunter: Primal
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.000000,0.185185,0.200000,0.136364,0.090909,0.192308,0.142857,0.095238,0.086957,0.153846,...,0.161290,0.166667,0.100000,0.090909,0.230769,0.045455,0.043478,0.026316,0.114286,0.114286
140,0.185185,1.000000,0.090909,0.117647,0.125000,0.136364,0.200000,0.062500,0.055556,0.333333,...,0.111111,0.160000,0.125000,0.200000,0.130435,0.000000,0.000000,0.031250,0.064516,0.064516
1931: Scheherazade at the Library of Pergamum,0.200000,0.090909,1.000000,0.214286,0.142857,0.277778,0.230769,0.071429,0.214286,0.100000,...,0.166667,0.173913,0.086957,0.142857,0.263158,0.071429,0.066667,0.033333,0.107143,0.068966
1943 Megami Strike,0.136364,0.117647,0.214286,1.000000,0.100000,0.125000,0.222222,0.111111,0.000000,0.214286,...,0.045455,0.100000,0.111111,0.222222,0.055556,0.111111,0.000000,0.040000,0.083333,0.040000
199X,0.090909,0.125000,0.142857,0.100000,1.000000,0.307692,0.111111,0.000000,0.000000,0.066667,...,0.047619,0.105263,0.117647,0.000000,0.125000,0.000000,0.000000,0.041667,0.041667,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
openCanvas 6,0.045455,0.000000,0.071429,0.111111,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.052632,0.000000,0.000000,0.062500,1.000000,0.000000,0.000000,0.000000,0.000000
rFactor,0.043478,0.000000,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,...,0.047619,0.000000,0.117647,0.000000,0.000000,0.000000,1.000000,0.041667,0.136364,0.086957
sZone-Online,0.026316,0.031250,0.033333,0.040000,0.041667,0.066667,0.000000,0.000000,0.040000,0.068966,...,0.027778,0.090909,0.307692,0.041667,0.031250,0.000000,0.041667,1.000000,0.428571,0.428571
theHunter Classic,0.114286,0.064516,0.107143,0.083333,0.041667,0.066667,0.086957,0.090909,0.130435,0.107143,...,0.088235,0.090909,0.259259,0.136364,0.064516,0.000000,0.136364,0.428571,1.000000,0.481481


In [634]:
# Inspect the (title, tag) table once more before defining the recommender
df_recomendacion

Unnamed: 0,title,tags
0,Half-Life,FPS
1,Half-Life,Classic
2,Half-Life,Action
3,Half-Life,Sci-fi
4,Half-Life,Singleplayer
...,...,...
14625,Counter-Strike: Condition Zero,Survival
14626,Counter-Strike: Condition Zero,Atmospheric
14627,Counter-Strike: Condition Zero,Dark
14628,Counter-Strike: Condition Zero,Simulation


In [642]:
# Recommendation function. Known limitation noted by the original author: games are
# looked up by title instead of by item_id; there was no time left to change it.
def recomendacion_juego(titulo: str) -> list:
    """Return up to five titles most similar to the first game matching ``titulo``.

    The query is matched against the ``title`` column of ``df_recomendacion_csv`` as a
    literal, case-insensitive substring. The first matching row selects the game, and
    its column in ``jaccard_csv`` supplies the similarity scores.

    Args:
        titulo: Full or partial game title. It is treated as plain text, so characters
            such as ``(`` or ``*`` are matched literally rather than as a regex.

    Returns:
        A list with at most five titles ordered from most to least similar, never
        including the matched game itself. An empty list is returned when ``titulo``
        is blank or not a string, when no title matches, or when the matched title
        has no column in ``jaccard_csv``.
    """
    if not isinstance(titulo, str) or not titulo.strip():
        return []

    titulos = df_recomendacion_csv['title']
    # regex=False: titles contain regex metacharacters; na=False: missing titles never match.
    coincide = titulos.str.contains(titulo, case=False, regex=False, na=False)
    if not coincide.any():
        return []

    juego = titulos[coincide].iloc[0]
    if juego not in jaccard_csv.columns:
        return []

    # Remove the game itself by label rather than assuming it sorts first: another game
    # with an identical tag set also scores 1.0 and could take the first position.
    similitudes = jaccard_csv[juego].drop(labels=juego, errors='ignore')
    # mergesort is stable, so tied scores keep a deterministic order.
    ordenadas = similitudes.sort_values(ascending=False, kind='mergesort')
    return ordenadas.index[:5].tolist()



In [643]:
# Smoke test: a mixed-case, partial title
recomendacion_juego('REsident')

['Resident Evil™ 5/ Biohazard 5®',
 'Resident Evil Revelations / Biohazard Revelations',
 'Dead Rising 3 Apocalypse Edition',
 'Contagion',
 'Resident Evil 0 / biohazard 0 HD REMASTER']

In [637]:
#try:
   # predict(...)
#except Exception as e:
 #   print(e)