In [36]:
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# Cargar los datasets
df_juegos = pd.read_csv('steam_games_api.csv')
df_horas_juego = pd.read_parquet('user_items.parquet')

In [38]:
df_juegos = df_juegos.rename(columns={'id_contenido': 'item_id'})

In [39]:
# Convertir la columna 'item_id' a tipo de datos compatible
df_juegos['item_id'] = df_juegos['item_id'].astype(int)
df_horas_juego['item_id'] = df_horas_juego['item_id'].astype(int)

In [40]:
# Unir datasets por ID de juego
df = pd.merge(df_juegos, df_horas_juego, on='item_id')

In [41]:
df_nuevo= df[['item_id', 'genero', 'titulo','playtime_forever']]

In [42]:
df_nuevo

Unnamed: 0,item_id,genero,titulo,playtime_forever
0,282010,"['Action', 'Indie', 'Racing']",Carmageddon Max Pack,5
1,282010,"['Action', 'Indie', 'Racing']",Carmageddon Max Pack,0
2,282010,"['Action', 'Indie', 'Racing']",Carmageddon Max Pack,0
3,282010,"['Action', 'Indie', 'Racing']",Carmageddon Max Pack,0
4,282010,"['Action', 'Indie', 'Racing']",Carmageddon Max Pack,13
...,...,...,...,...
4038475,80,['Action'],Counter-Strike: Condition Zero,0
4038476,80,['Action'],Counter-Strike: Condition Zero,0
4038477,80,['Action'],Counter-Strike: Condition Zero,0
4038478,80,['Action'],Counter-Strike: Condition Zero,9


# Achicamos el dataset

Para simplificar el sistema, ya que solo estamos realizando un MVP, vamos a tomar el top 100 de juegos mas jugados por los usuarios para realizar el sistema de recomendacion.

Para eso, necesitamos encontrar el top 100 de juegos mas jugados. Importamos el csv de user_items en donde podemos ver la cantidad de horas jugadas por usuario y juego. 

Filtramos el DataFrame unicamente por las columnas que necesitamos, para luego agrupar segun el item_id y la sumatoria de las horas jugadas, ordenamos los valores de mayor a menor y por ultimo filtramos por los primeros 200 items.

In [43]:
top_100_juegos = df_nuevo.sort_values(by='playtime_forever', ascending=False).drop_duplicates(subset=['item_id']).head(100)



In [44]:
top_100_juegos

Unnamed: 0,item_id,genero,titulo,playtime_forever
27742,4000,"['Indie', 'Simulation']",Garry's Mod,642773
841363,212200,"['Action', 'Adventure', 'Casual', 'Free to Pla...",Mabinogi,635295
3422625,8500,"['Action', 'Free to Play', 'Massively Multipla...",EVE Online,530882
3914660,320,['Action'],Half-Life 2: Deathmatch,493791
3898581,240,['Action'],Counter-Strike: Source,479117
...,...,...,...,...
240791,1280,"['Action', 'Simulation']",Darkest Hour: Europe '44-'45,130678
3256504,65800,"['Action', 'Indie', 'RPG', 'Strategy']",Dungeon Defenders,128428
3189945,104700,"['Free to Play', 'Action', 'Strategy', 'Indie'...",Super MNC,126519
1263835,313120,"['Adventure', 'Indie', 'Early Access']",Stranded Deep,124301


Convertimos la columna de item_id en una lista

# Normalizacion de generos

Ahora queremos crear un dataframe para poder comparar la similitud de los items por medio de los generos. Por eso, vamos a realizar el proceso de one-hot enconding para compararlos.

Para eso primero buscamos el listado de generos unicos.

In [45]:
df_generos = top_100_juegos[['genero']]
df_generos['genero'] = df_generos['genero'].apply(eval)
df_generos = df_generos.explode('genero')
df_generos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_generos['genero'] = df_generos['genero'].apply(eval)


Unnamed: 0,genero
27742,Indie
27742,Simulation
841363,Action
841363,Adventure
841363,Casual
...,...
1263835,Early Access
1462539,Action
1462539,Indie
1462539,RPG


In [46]:
lista_generos = df_generos['genero']
generos_unicos = lista_generos.unique()
generos_unicos

array(['Indie', 'Simulation', 'Action', 'Adventure', 'Casual',
       'Free to Play', 'Massively Multiplayer', 'RPG', 'Strategy',
       'Early Access', 'Utilities', 'Racing', 'Sports'], dtype=object)

In [47]:
# Convertimos la cadena en una lista
genres = top_100_juegos['genero'].apply(eval)

for genre in generos_unicos:
    top_100_juegos[genre] = 0  # Inicializamos todas las columnas en 0

for genre in generos_unicos:
    top_100_juegos.loc[genres.apply(lambda x: genre in x), genre] = 1  # Asignamos 1 a las filas donde el género está presente

top_100_juegos

Unnamed: 0,item_id,genero,titulo,playtime_forever,Indie,Simulation,Action,Adventure,Casual,Free to Play,Massively Multiplayer,RPG,Strategy,Early Access,Utilities,Racing,Sports
27742,4000,"['Indie', 'Simulation']",Garry's Mod,642773,1,1,0,0,0,0,0,0,0,0,0,0,0
841363,212200,"['Action', 'Adventure', 'Casual', 'Free to Pla...",Mabinogi,635295,0,1,1,1,1,1,1,1,0,0,0,0,0
3422625,8500,"['Action', 'Free to Play', 'Massively Multipla...",EVE Online,530882,0,0,1,0,0,1,1,1,1,0,0,0,0
3914660,320,['Action'],Half-Life 2: Deathmatch,493791,0,0,1,0,0,0,0,0,0,0,0,0,0
3898581,240,['Action'],Counter-Strike: Source,479117,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240791,1280,"['Action', 'Simulation']",Darkest Hour: Europe '44-'45,130678,0,1,1,0,0,0,0,0,0,0,0,0,0
3256504,65800,"['Action', 'Indie', 'RPG', 'Strategy']",Dungeon Defenders,128428,1,0,1,0,0,0,0,1,1,0,0,0,0
3189945,104700,"['Free to Play', 'Action', 'Strategy', 'Indie'...",Super MNC,126519,1,0,1,0,0,1,0,0,1,0,0,0,1
1263835,313120,"['Adventure', 'Indie', 'Early Access']",Stranded Deep,124301,1,0,0,1,0,0,0,0,0,1,0,0,0


Filtramos el DataFrame para quedarnos unicamente con el top 200 de juegos mas jugados

In [48]:
#Creamos una tabla pivot con los item_id y las columnas de generos
df_pivot =top_100_juegos.set_index('item_id').drop(['titulo', 'genero','playtime_forever'], axis=1)
df_pivot

Unnamed: 0_level_0,Indie,Simulation,Action,Adventure,Casual,Free to Play,Massively Multiplayer,RPG,Strategy,Early Access,Utilities,Racing,Sports
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4000,1,1,0,0,0,0,0,0,0,0,0,0,0
212200,0,1,1,1,1,1,1,1,0,0,0,0,0
8500,0,0,1,0,0,1,1,1,1,0,0,0,0
320,0,0,1,0,0,0,0,0,0,0,0,0,0
240,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1280,0,1,1,0,0,0,0,0,0,0,0,0,0
65800,1,0,1,0,0,0,0,1,1,0,0,0,0
104700,1,0,1,0,0,1,0,0,1,0,0,0,1
313120,1,0,0,1,0,0,0,0,0,1,0,0,0


In [49]:
genero_genero_sim_matrix = pd.DataFrame(cosine_similarity(df_pivot))
genero_genero_sim_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.267261,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.408248,0.000000,0.000000,0.000000,0.500000,0.500000,0.353553,0.316228,0.408248,0.707107
1,0.267261,1.000000,0.676123,0.377964,0.377964,0.436436,0.000000,0.654654,0.534522,0.377964,...,0.218218,0.534522,0.000000,0.534522,0.534522,0.534522,0.377964,0.338062,0.218218,0.566947
2,0.000000,0.676123,1.000000,0.447214,0.447214,0.774597,0.447214,0.774597,0.632456,0.447214,...,0.258199,0.316228,0.447214,0.632456,0.316228,0.316228,0.670820,0.600000,0.000000,0.447214
3,0.000000,0.377964,0.447214,1.000000,1.000000,0.000000,0.000000,0.577350,0.707107,1.000000,...,0.577350,0.000000,0.000000,0.000000,0.707107,0.707107,0.500000,0.447214,0.000000,0.500000
4,0.000000,0.377964,0.447214,1.000000,1.000000,0.000000,0.000000,0.577350,0.707107,1.000000,...,0.577350,0.000000,0.000000,0.000000,0.707107,0.707107,0.500000,0.447214,0.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.500000,0.534522,0.316228,0.707107,0.707107,0.000000,0.000000,0.408248,0.500000,0.707107,...,0.408248,0.000000,0.000000,0.000000,1.000000,1.000000,0.353553,0.316228,0.000000,0.707107
96,0.353553,0.377964,0.670820,0.500000,0.500000,0.288675,0.500000,0.288675,0.353553,0.500000,...,0.577350,0.353553,0.500000,0.353553,0.353553,0.353553,1.000000,0.670820,0.288675,0.750000
97,0.316228,0.338062,0.600000,0.447214,0.447214,0.516398,0.447214,0.516398,0.632456,0.447214,...,0.516398,0.000000,0.447214,0.000000,0.316228,0.316228,0.670820,1.000000,0.258199,0.447214
98,0.408248,0.218218,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.408248,0.000000,0.000000,0.000000,0.000000,0.288675,0.258199,1.000000,0.288675


In [50]:
genero_genero_sim_matrix.columns = df_pivot.index
genero_genero_sim_matrix.set_index(pd.Index(df_pivot.index), inplace=True)

In [51]:
genero_genero_sim_matrix

item_id,4000,212200,8500,320,240,47410,8930,113400,230410,730,...,220820,265610,22130,306130,201310,1280,65800,104700,313120,290080
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4000,1.000000,0.267261,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.408248,0.000000,0.000000,0.000000,0.500000,0.500000,0.353553,0.316228,0.408248,0.707107
212200,0.267261,1.000000,0.676123,0.377964,0.377964,0.436436,0.000000,0.654654,0.534522,0.377964,...,0.218218,0.534522,0.000000,0.534522,0.534522,0.534522,0.377964,0.338062,0.218218,0.566947
8500,0.000000,0.676123,1.000000,0.447214,0.447214,0.774597,0.447214,0.774597,0.632456,0.447214,...,0.258199,0.316228,0.447214,0.632456,0.316228,0.316228,0.670820,0.600000,0.000000,0.447214
320,0.000000,0.377964,0.447214,1.000000,1.000000,0.000000,0.000000,0.577350,0.707107,1.000000,...,0.577350,0.000000,0.000000,0.000000,0.707107,0.707107,0.500000,0.447214,0.000000,0.500000
240,0.000000,0.377964,0.447214,1.000000,1.000000,0.000000,0.000000,0.577350,0.707107,1.000000,...,0.577350,0.000000,0.000000,0.000000,0.707107,0.707107,0.500000,0.447214,0.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1280,0.500000,0.534522,0.316228,0.707107,0.707107,0.000000,0.000000,0.408248,0.500000,0.707107,...,0.408248,0.000000,0.000000,0.000000,1.000000,1.000000,0.353553,0.316228,0.000000,0.707107
65800,0.353553,0.377964,0.670820,0.500000,0.500000,0.288675,0.500000,0.288675,0.353553,0.500000,...,0.577350,0.353553,0.500000,0.353553,0.353553,0.353553,1.000000,0.670820,0.288675,0.750000
104700,0.316228,0.338062,0.600000,0.447214,0.447214,0.516398,0.447214,0.516398,0.632456,0.447214,...,0.516398,0.000000,0.447214,0.000000,0.316228,0.316228,0.670820,1.000000,0.258199,0.447214
313120,0.408248,0.218218,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.408248,0.000000,0.000000,0.000000,0.000000,0.288675,0.258199,1.000000,0.288675


In [52]:
genero_genero_sim_matrix.to_csv('df_ml.csv')

In [53]:
def recomendacion_juego(id_juego):
    juegos_similares = genero_genero_sim_matrix.loc[id_juego].sort_values(ascending=False)

    # Obtener el top 5 excluyendo el primero
    top_juegos_similares = juegos_similares.iloc[1:].nlargest(5)
    lista_de_ids = top_juegos_similares.index.tolist()

    titulos_top = top_100_juegos.loc[top_100_juegos['item_id'].isin(lista_de_ids), 'titulo']
    lista_de_juegos = [f"{i + 1}- {titulo}" for i, titulo in enumerate(titulos_top.values)]

    return f'Para el id introducido: {id_juego}, te recomendamos los siguientes juegos similares:\n' + '\n'.join(lista_de_juegos)

In [55]:
recomendacion_juego(4000)

'Para el id introducido: 4000, te recomendamos los siguientes juegos similares:\n1- Kerbal Space Program\n2- Shower With Your Dad Simulator 2015: Do You Still Shower With Your Dad\n3- Prison Architect\n4- Pony Island\n5- Guns of Icarus Online'