In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import gzip
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
with gzip.open('steam_games.json.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        df=pd.read_json(f, lines=True)

In [3]:
df.dropna(how='all', inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32135 entries, 88309 to 120443
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 3.4+ MB


In [5]:
df=df[['id','app_name','genres','tags','publisher']]

In [6]:
# Expandimos los géneros y las estiquétas en nuevas filas
df_exploded = df.explode('genres').explode('tags')

# Creamos columnas binarias para cada género y etiquétas
df_binary = pd.get_dummies(df_exploded, columns=['genres', 'tags'], prefix=['genres', 'tag'])

# Agrupamos por el índice original y sumamos para obtener 1 si el juego tiene el género o etiqueta, y 0 si no
df_binary_grouped = df_binary.groupby(level=0).sum()

# Combina el DataFrame binario con el original
df_final = df.merge(df_binary_grouped, left_index=True, right_index=True, how='left')

# Rellena los valores NaN con 0
df_final = df_final.fillna(0)

  df_binary_grouped = df_binary.groupby(level=0).sum()


In [7]:
del(df_exploded,df_binary,df_binary_grouped,df)

In [8]:
df_final.drop(columns=['id_y','genres','tags'], inplace=True)

In [9]:
df_final=df_final.reset_index()

In [10]:
df_modelo=df_final.drop(columns=['app_name','publisher'])

In [18]:
df_modelo.head()

Unnamed: 0,index,id_x,genres_Accounting,genres_Action,genres_Adventure,genres_Animation &amp; Modeling,genres_Audio Production,genres_Casual,genres_Design &amp; Illustration,genres_Early Access,...,tag_Warhammer 40K,tag_Web Publishing,tag_Werewolves,tag_Western,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_e-sports
0,88309,761140.0,0,5,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
1,88310,643980.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,88311,670290.0,0,0,0,0,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
3,88312,767400.0,0,3,3,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
4,88313,773570.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_modelo['id_x']=df_modelo['id_x'].astype(int)

In [20]:
#creamos la matriz de numpy para el modelo
recomendation_matrix = df_modelo.values
recomendation_matrix = np.where(recomendation_matrix > 1, 1, recomendation_matrix)

In [21]:
def build_recommendation_dict(df, matrix):
    # Dividimos la matriz en submatrices más pequeñas
    sub_len = 1000
    sub_matrices = [matrix[i:i+sub_len] for i in range(0, len(matrix), sub_len)]

    # Inicializamos la matriz de similitud del coseno
    cosine_sim = np.zeros((len(matrix), len(matrix)))

    # Calculamos la similitud del coseno para cada submatriz y actualizar la matriz
    for sub_matriz1 in sub_matrices:
        for sub_matriz2 in sub_matrices:
            sim_matriz = cosine_similarity(sub_matriz1, sub_matriz2)
            cosine_sim[:len(sub_matriz1), :len(sub_matriz2)] += sim_matriz

    # Creamos un diccionario para almacenar las recomendaciones
    recommendations_dict = {}

    # Calculamos y almacenamos las recomendaciones para cada juego
    for game_id in df['id_x']:
        game_row = df[df['id_x'] == game_id].index[0]

        # Obtenemos las recomendacioespara el juego dado
        sim_scores = list(enumerate(cosine_sim[game_row]))

        # Ordenamos por similitud descendente
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Filtramos el item para evitar la recomendación del mismo item
        sim_scores = [(index, score) for index, score in sim_scores if index != game_row]

        # Obtenemos los índices de los juegos recomendados
        recommended_indices = [index for index, _ in sim_scores[:5]]

        # Obtenemos los id de los juegos recomendados
        recommended_ids = df.loc[recommended_indices, 'id_x'].tolist()

        # Almacenamos las recomendaciones en el diccionario
        recommendations_dict[game_id] = recommended_ids

    return recommendations_dict

# Construimos el diccionario de recomendaciones
recommendations_dict = build_recommendation_dict(df_final, recomendation_matrix)

game_to_recommend_id = 123
recommendations = recommendations_dict.get(game_to_recommend_id, [])
print(f"Recomendaciones para el juego (ID {game_to_recommend_id}): {recommendations}")



Recomendaciones para el juego (ID 123): []


In [23]:
recomendaciones = pd.DataFrame(list(recommendations_dict.items()), columns=['game_id','recommendations'])

In [25]:
recomendaciones['game_id']=recomendaciones['game_id'].astype(int)

In [27]:
recomendaciones.to_csv('recomendaciones.csv', index=False)

In [29]:
game_to_recommend_id = 16868  # Reemplaza con el id real del juego
recommendations = recommendations_dict.get(game_to_recommend_id, [])
print(f"Recomendaciones para el juego (ID {game_to_recommend_id}): {recommendations}")

Recomendaciones para el juego (ID 16868): [773120.0, 213390.0, 55370.0, 108200.0, 97106.0]


In [34]:
del(df_final,df_modelo)

In [35]:
del(recommendations,recommendations_dict,recomendation_matrix)