In [4]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from useful_tools import tipo_de_datos

Para generar nuestro modelo de recomendación utilizaremos como base el dataset "steam_games"; sin embargo, agregaremos datos importantes de los otros dos datasets ("users_items" y "user_reviews").

In [5]:
# Read the three parquet files into pandas DataFrames (same order as listed)
steam_games, users_items, user_reviews = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        "steam_games.parquet",
        "users_items.parquet",
        "user_reviews.parquet",
    )
)

Comencemos por user_reviews.

In [6]:
# Report, for each column of user_reviews, the Python types found in it
# (datatype_per_column comes from the project-local useful_tools package).
tipo_de_datos.datatype_per_column(user_reviews)


Columna 'user_id': [<class 'str'>]
Columna 'item_id': [<class 'int'>]
Columna 'recommend': [<class 'bool'>]
Columna 'sentiment_analysis': [<class 'int'>]


Vamos a cambiar el formato y realizar algunas agrupaciones para que los datos nos sean más útiles

In [7]:
# Encode the boolean 'recommend' flag as 1 (True) / 0 (False) so it can be summed
user_reviews = user_reviews.astype({'recommend': int})

# Aggregate per game (item_id, not user_id): add up 'recommend' and 'sentiment_analysis'
review_totals = user_reviews.groupby('item_id')[['recommend', 'sentiment_analysis']].sum()
user_reviews = review_totals.reset_index()

user_reviews

Unnamed: 0,item_id,recommend,sentiment_analysis
0,10,55,77
1,20,11,21
2,30,3,5
3,40,1,2
4,50,3,6
...,...,...,...
3677,521340,2,4
3678,521430,1,0
3679,521570,2,1
3680,521990,1,2


Continuemos con users_items

In [8]:
# Keep only the game id and its playtime, then total the playtime per game
# (the grouping key is item_id, i.e. one output row per game).
users_items = (
    users_items[['item_id', 'playtime_forever']]
    .groupby('item_id')['playtime_forever']
    .sum()
    .reset_index()
)

users_items

Unnamed: 0,item_id,playtime_forever
0,10,4752.182778
1,20,266.812222
2,30,210.104167
3,40,42.895556
4,50,201.818056
...,...,...
10045,527570,0.000556
10046,527810,0.000556
10047,527890,0.000278
10048,527900,0.012222


Por último, steam_games. Para que el modelo de similitud del coseno funcione correctamente, utilizaremos solamente el primer género de la lista para cada juego.

In [9]:
# One row per (game, genre), then keep only the first row of each id,
# which leaves every game with the first genre of its list
first_genre = steam_games.explode('genres').drop_duplicates(subset='id')

# Cast the game id to integer
first_genre = first_genre.astype({'id': int})

# Keep the three columns of interest and discard games labelled "no genre"
has_genre = first_genre['genres'] != 'no genre'
steam_games = first_genre.loc[has_genre, ['id', 'app_name', 'genres']]

steam_games


Unnamed: 0,id,app_name,genres
88310,761140,Lost Summoner Kitty,Action
88311,643980,Ironbound,Free to Play
88312,670290,Real Pool 3D - Poolians,Casual
88313,767400,弹炸人2222,Action
88314,773570,Log Challenge,Action
...,...,...,...
120440,773640,Colony On Mars,Casual
120441,733530,LOGistICAL: South Africa,Casual
120442,610660,Russian Roads,Indie
120443,658870,EXIT 2 - Directions,Casual


Ahora agregaremos los datos de users_items y user_reviews al dataframe steam_games

In [10]:
# Give the join key of both auxiliary tables the same name, 'user_item_id'
users_items = users_items.rename(columns={'item_id': 'user_item_id'})
user_reviews = user_reviews.rename(columns={'item_id': 'user_item_id'})

# Left-join playtime and then review totals onto steam_games (id == user_item_id);
# the two key columns come out suffixed _x / _y and are dropped afterwards
steam_games = (
    steam_games
    .merge(users_items, left_on='id', right_on='user_item_id', how='left')
    .merge(user_reviews, left_on='id', right_on='user_item_id', how='left')
    .drop(columns=['user_item_id_x', 'user_item_id_y'])
)

# Inspect the merged frame
steam_games


Unnamed: 0,id,app_name,genres,playtime_forever,recommend,sentiment_analysis
0,761140,Lost Summoner Kitty,Action,,,
1,643980,Ironbound,Free to Play,,,
2,670290,Real Pool 3D - Poolians,Casual,,,
3,767400,弹炸人2222,Action,,,
4,773570,Log Challenge,Action,,,
...,...,...,...,...,...,...
31201,773640,Colony On Mars,Casual,,,
31202,733530,LOGistICAL: South Africa,Casual,,,
31203,610660,Russian Roads,Indie,,,
31204,658870,EXIT 2 - Directions,Casual,,,


In [11]:
# Discard games lacking playtime or review data, then store the two review
# totals ('recommend' and 'sentiment_analysis') as integers
steam_games = (
    steam_games
    .dropna(subset=['playtime_forever', 'recommend', 'sentiment_analysis'])
    .astype({'recommend': int, 'sentiment_analysis': int})
)

steam_games

Unnamed: 0,id,app_name,genres,playtime_forever,recommend,sentiment_analysis
27,282010,Carmageddon Max Pack,Action,2.588611,1,1
28,70,Half-Life,Action,736.373889,60,82
41,2400,The Ship: Murder Party,Action,286.374167,22,29
55,3800,Advent Rising,Action,3.717778,1,0
56,1520,DEFCON,Indie,60.595556,6,8
...,...,...,...,...,...,...
31180,40,Deathmatch Classic,Action,42.895556,1,2
31181,130,Half-Life: Blue Shift,Action,121.343889,5,9
31186,30,Day of Defeat,Action,210.104167,3,5
31188,13230,Unreal Tournament 2004: Editor's Choice Edition,Action,95.859167,6,8


In [12]:
# Rank games from most to least accumulated playtime
steam_games_sorted = steam_games.sort_values('playtime_forever', ascending=False)

# Keep only the first 1000 rows of that ranking
steam_games = steam_games_sorted.iloc[:1000]

steam_games

Unnamed: 0,id,app_name,genres,playtime_forever,recommend,sentiment_analysis
1028,730,Counter-Strike: Global Offensive,Action,215532.978889,3429,4926
58,4000,Garry's Mod,Indie,122741.951667,1657,2322
30337,105600,Terraria,Action,42499.345556,711,1091
29365,230410,Warframe,Action,34090.792500,557,865
31084,240,Counter-Strike: Source,Action,30920.751667,226,351
...,...,...,...,...,...,...
388,42120,Lead and Gold: Gangs of the Wild West,Action,36.067222,0,0
27259,308420,Ziggurat,Action,35.911944,8,16
22439,449140,Istrolid,Free to Play,35.814722,5,9
27974,302380,Floating Point,Casual,35.686944,14,16


In [13]:
# Numeric features plus one indicator column per genre
numeric_columns = ['playtime_forever', 'recommend', 'sentiment_analysis']
genre_dummies = pd.get_dummies(steam_games['genres'])
juegos_features = pd.concat([steam_games[numeric_columns], genre_dummies], axis=1)

# Standardize every feature so they share a common scale
scaler = StandardScaler()
juegos_features_normalized = scaler.fit_transform(juegos_features)

# Pairwise cosine similarity between all games
cosine_sim = cosine_similarity(juegos_features_normalized, juegos_features_normalized)

# Label rows and columns of the similarity matrix with the game names
game_names = steam_games['app_name']
cosine_sim_df = pd.DataFrame(cosine_sim, index=game_names, columns=game_names)


In [14]:
# Display the game-by-game cosine similarity matrix
cosine_sim_df

app_name,Counter-Strike: Global Offensive,Garry's Mod,Terraria,Warframe,Counter-Strike: Source,Left 4 Dead 2,PAYDAY 2,Sid Meier's Civilization® V,Rust,Borderlands 2,...,Sol Survivor,Paranautical Activity: Deluxe Atonement Edition,LISA,Painkiller Hell & Damnation,Zeno Clash,Lead and Gold: Gangs of the Wild West,Ziggurat,Istrolid,Floating Point,How to Survive 2
app_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Counter-Strike: Global Offensive,1.000000,0.981383,0.993541,0.989205,0.886644,0.971509,0.941473,0.691600,0.949353,0.967311,...,-0.121167,-0.267809,-0.103261,-0.277923,-0.258579,-0.298759,-0.241921,-0.071680,-0.064754,-0.291909
Garry's Mod,0.981383,1.000000,0.962291,0.955754,0.882251,0.923937,0.886573,0.686949,0.893823,0.916626,...,0.050571,-0.344756,-0.095168,-0.353640,-0.336702,-0.371775,-0.322102,-0.065889,-0.059547,-0.365830
Terraria,0.993541,0.962291,1.000000,0.999263,0.895727,0.984295,0.956567,0.657609,0.966403,0.985288,...,-0.165467,-0.166725,-0.144084,-0.177330,-0.156876,-0.199522,-0.139202,-0.098870,-0.093435,-0.192213
Warframe,0.989205,0.955754,0.999263,1.000000,0.905519,0.982092,0.952296,0.645471,0.963698,0.985126,...,-0.179465,-0.130213,-0.157136,-0.140791,-0.120362,-0.162997,-0.102708,-0.107553,-0.102783,-0.155681
Counter-Strike: Source,0.886644,0.882251,0.895727,0.905519,1.000000,0.813165,0.735037,0.563370,0.763388,0.828051,...,-0.196922,0.028348,-0.176886,0.021775,0.034439,0.007917,0.045309,-0.120132,-0.120565,0.012468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lead and Gold: Gangs of the Wild West,-0.298759,-0.371775,-0.199522,-0.162997,0.007917,-0.192145,-0.227808,-0.460273,-0.187773,-0.143649,...,-0.354470,0.999132,-0.330132,0.999594,0.998524,1.000000,0.997030,-0.219918,-0.236652,0.999955
Ziggurat,-0.241921,-0.322102,-0.139202,-0.102708,0.045309,-0.125232,-0.156882,-0.428052,-0.117707,-0.076512,...,-0.371138,0.999351,-0.344757,0.998820,0.999737,0.997030,1.000000,-0.229835,-0.246178,0.997713
Istrolid,-0.071680,-0.065889,-0.098870,-0.107553,-0.120132,-0.104890,-0.095925,-0.057865,-0.106414,-0.116683,...,-0.015529,-0.225510,-0.014390,-0.223741,-0.227093,-0.219918,-0.229835,1.000000,-0.009638,-0.221202
Floating Point,-0.064754,-0.059547,-0.093435,-0.102783,-0.120565,-0.098479,-0.088720,-0.054008,-0.099781,-0.111211,...,-0.018045,-0.242022,-0.016549,-0.240342,-0.243549,-0.236652,-0.246178,-0.009638,1.000000,-0.237893


In [15]:
# Write the similarity matrix to a parquet file in the working directory
cosine_sim_df.to_parquet('cosine_sim_df.parquet')

In [16]:
def recomendar_juegos(juego, cosine_sim_df, n=5):
    """Return the ``n`` games most similar to ``juego``.

    Parameters
    ----------
    juego : str
        Label of the game to look up among the rows of ``cosine_sim_df``.
    cosine_sim_df : pandas.DataFrame
        Similarity matrix whose row and column labels are game names.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list or str
        Labels of the most similar games, ordered from most to least
        similar. If ``juego`` is not a row label, a message (str) saying so
        is returned instead of a list.
    """
    # Unknown game: report it instead of raising
    if juego not in cosine_sim_df.index:
        return(f"El juego '{juego}' no se encuentra en la base de datos.")

    sim_row = cosine_sim_df.loc[juego]

    # With duplicated row labels .loc yields a DataFrame (one row per
    # duplicate); use the first occurrence so the ranking below still works.
    if sim_row.ndim > 1:
        sim_row = sim_row.iloc[0]

    # Exclude the game itself; errors='ignore' avoids a KeyError when the
    # game is a row label but not a column label.
    ranked = sim_row.drop(juego, errors='ignore').sort_values(ascending=False)

    return list(ranked.head(n).index)

In [17]:
# Example usage: ask for the default 5 recommendations for one game
juego_elegido = 'Counter-Strike'
recomendar_juegos(juego_elegido, cosine_sim_df)


['Dungeon Defenders',
 'SMITE®',
 'Battlefield: Bad Company™ 2',
 'Borderlands',
 'TERA']