In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import StandardScaler
import pyarrow.parquet as pq
import pyarrow as pa
import operator

In [2]:
df_model = pd.read_parquet('data/recommendation_model.parquet')

In [3]:
df_model

Unnamed: 0,user_id,item_name,combined_metric
0,76561197970982479,Killing Floor,1.3
1,js41637,Barbie™ Dreamhouse Party™,1.3
2,evcentric,Risk of Rain,1.3
3,doctr,The Wolf Among Us,1.3
4,maplemage,Dark Souls: Prepare to Die Edition,1.0
...,...,...,...
51034,76561198107177722,BattleBlock Theater,1.3
51035,kushikushigani,LEGO® Worlds,0.7
51036,76561198111410893,Unturned,1.0
51037,zaza147,Fistful of Frags,1.3


In [4]:
# Primero se comienza creando la tabla pivot
piv_table = df_model.pivot_table(index=['user_id'], columns=['item_name'], values='combined_metric')
piv_table

item_name,! That Bastard Is Trying To Steal Our Gold !,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,12 is Better Than 6,123 Slaughter Me Street,140,...,klocki,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theBlu,theHunter,theHunter: Primal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,,,,,,,,,,,...,,,,,,,,,,
--ace--,,,,,,,,,,,...,,,,,,,,,,
--ionex--,,,,,,,,,,,...,,,,,,,,,,
-2SV-vuLB-Kg,,,,,,,,,,,...,,,,,,,,,,
-Azsael-,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv_odd,,,,,,,,,,,...,,,,,,,,,,
zvanik,,,,,,,,,,,...,,,,,,,,,,
zwanzigdrei,,,,,,,,,,,...,,,,,,,,,,
zyr0n1c,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Se normaliza la matriz pivot con la formula de normalizacion
piv_table_norm = piv_table.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

In [6]:
# Se transpone la matriz, para tener juegos en filas
piv_table_norm.fillna(0, inplace=True)
piv_table_norm = piv_table_norm.T
# Se manteniene solo las columnas que tienen al menos un valor distinto de cero
piv_table_norm = piv_table_norm.loc[:, (piv_table_norm != 0).any(axis=0)]

In [7]:
piv_table_norm

user_id,--000--,-Beave-,-GM-Dragon,-I_AM_EPIC-,-SEVEN-,-Thyme-,-kainey9777,00000000000000000001227,00690069006900,01189958889189157253,...,zimran,zjaerya135,zombi_anon,zomgCoBfAce,zoom-the-flash,zoozles,zourock,zsharoarkbr,zvanik,zzoptimuszz
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! That Bastard Is Trying To Steal Our Gold !,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theBlu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Reemplazar los valores infinitos con un valor específico
piv_table_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
piv_table_norm.fillna(0, inplace=True)

In [9]:
# Se crea una tabla sparse que solo guarda valores distintos de cero y permite optimizar 
piv_sparse = sp.sparse.csr_matrix(piv_table_norm.values)
piv_sparse

<3294x8420 sparse matrix of type '<class 'numpy.float64'>'
	with 32688 stored elements in Compressed Sparse Row format>

In [10]:
# Se deja en filas para cada uno de los endpoints el respectivo items o user segun corresponda, para hacer operaciones
item_simil = cosine_similarity(piv_sparse)
user_simil = cosine_similarity(piv_sparse.T)

In [11]:
# Se ordenan en df para poder exportarlos luego y para manejarlos más sencillo
df_item_simil = pd.DataFrame(item_simil, index = piv_table_norm.index, columns = piv_table_norm.index)
df_user_simil = pd.DataFrame(user_simil, index = piv_table_norm.columns, columns = piv_table_norm.columns)

In [12]:
df_item_simil.head(2)

item_name,! That Bastard Is Trying To Steal Our Gold !,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,12 is Better Than 6,123 Slaughter Me Street,140,...,klocki,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theBlu,theHunter,theHunter: Primal
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! That Bastard Is Trying To Steal Our Gold !,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_user_simil.head(2)

user_id,--000--,-Beave-,-GM-Dragon,-I_AM_EPIC-,-SEVEN-,-Thyme-,-kainey9777,00000000000000000001227,00690069006900,01189958889189157253,...,zimran,zjaerya135,zombi_anon,zomgCoBfAce,zoom-the-flash,zoozles,zourock,zsharoarkbr,zvanik,zzoptimuszz
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Beave-,0.0,1.0,0.0,-0.27591,0.052244,0.0,0.0,0.0,0.0,-0.060882,...,0.051844,0.364994,-0.27591,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#Ahora se hace una funcion para que tenga de entrada el juego y el dataframe y busque los juegos más similares
def recommended_games_item(game, df_item_simil):
    similar_games = {}
    count = 1
    for item in df_item_simil.sort_values(by=game, ascending=False).index[1:6]:
        similar_games[f"Recomendación {count}"] = item
        count += 1
    return similar_games

In [15]:
recommended_games_item('0RBITALIS',df_item_simil)

{'Recomendación 1': 'Revelations 2012',
 'Recomendación 2': 'Resident Evil Revelations 2 / Biohazard Revelations 2',
 'Recomendación 3': 'Resident Evil™: Operation Raccoon City',
 'Recomendación 4': 'Resilience: Wave Survival',
 'Recomendación 5': 'Retention'}

In [16]:
'''
    Ahora vamos a crear una funcion para las recomendaciones basadas en un usuario, para esto
    se tomarán las calificaciones que tienen los usuarios similares 
    y las veces que se recomienda cada juego por los usuarios similares.

'''

def similar_user_recs(user):
    
    # Se verifica si el usuario está presente en las columnas de piv_table_norm
    if user not in piv_table_norm.columns:
        return {'message': 'El Usuario no tiene datos disponibles {}'.format(user)}

    # Se obtienen los usuarios más similares 
    sim_users = df_user_simil.sort_values(by=user, ascending=False).index[1:11]

    best = []  
    most_common = {}  

    # Por cada usuario similar, encuentra el juego mejor calificado y lo agrega a la lista 'best'
    for i in sim_users:
        max_score = piv_table_norm.loc[:, i].max()
        best.append(piv_table_norm[piv_table_norm.loc[:, i] == max_score].index.tolist())

    # Se cuenta cuántas veces se recomienda cada juego
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1

    # Se ordenan los juegos de mayor recomendacion
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)

    return dict(sorted_list[:5])

In [20]:
similar_user_recs('76561197970982479')

{'Killing Floor': 2,
 'Duke Nukem 3D: Megaton Edition': 1,
 'The Elder Scrolls V: Skyrim': 1,
 'Counter-Strike: Global Offensive': 1,
 'Metro: Last Light': 1}

In [22]:
pq.write_table(pa.Table.from_pandas(piv_table_norm), 'data/piv_table_norm.parquet')
pq.write_table(pa.Table.from_pandas(df_user_simil), 'data/df_user_simil.parquet')
pq.write_table(pa.Table.from_pandas(df_item_simil), 'data/df_item_simil.parquet')