In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
import pandas as pd
import joblib

In [3]:
# Leo la data necesaria
df_steam_games = pq.read_table('data2/df_steam_games_limpio_final.parquet').to_pandas()

In [4]:
# Selección de columnas relevantes
data = df_steam_games[['id', 'app_name', 'genres', 'tags', 'specs']]

In [None]:
# Limpieza y preprocesamiento de texto
data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']

In [14]:
# Elimino el explode realizado 
data = data.groupby('id').first().reset_index()

In [15]:
# Elimino los juegos que tienen id=0, si los hay.
data = data[data['id']!=0]

In [16]:
# Cantidad de juegos diferentes
data['app_name'].nunique()

28766

In [17]:
# Tomo una muesta aleatoria de 5000 juegos, para reducir el peso de la matriz
# de similitud del modelo.
data = data.sample(n=5000)

In [19]:
data.reset_index(drop=True, inplace=True)

In [20]:
data.head()

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,610221,night in the woodssoundtrack vol ii,adventure,adventure,singleplayer,adventure adventure singleplayer
1,241950,hacker evolution duality hardcore package part...,indie,indie,singleplayer,indie indie singleplayer
2,718090,princesslootpixelagain x2,action,early access,singleplayer,action early access singleplayer
3,744170,starfighter neon,action,action,singleplayer,action action singleplayer
4,548050,runes,casual,casual,singleplayer,casual casual singleplayer


In [22]:
# Guardo los 5000 juegos seleccionados para su posterior uso
data.to_csv('df_steam_games_selected.csv')

In [None]:
# Guardo el dataframe con compresión parquet
import pyarrow.parquet as pq
data.to_parquet('df_modelo.parquet')

In [25]:
# Instancio el modelo de similitud
CV = CountVectorizer()
converted_metrix = CV.fit_transform(data['combined_features'])

In [26]:
# Calculo la similitud entre los juegos con la distancia coseno
cosine_similarity = cosine_similarity(converted_metrix)

In [30]:
# Guardo la matriz con compresión .pkl
joblib.dump(cosine_similarity, 'Matriz.pkl', compress=1)

['Matriz.pkl']

### Verifico que la data guardada funcione

In [2]:
import joblib
# Leo la data guardada para ver que funciona
with open('data/Matriz.pkl', 'rb') as file:
        modelo = joblib.load(file)

data = pd.read_parquet('data/df_modelo.parquet')

In [3]:
data.head()

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,610221,night in the woodssoundtrack vol ii,adventure,adventure,singleplayer,adventure adventure singleplayer
1,241950,hacker evolution duality hardcore package part...,indie,indie,singleplayer,indie indie singleplayer
2,718090,princesslootpixelagain x2,action,early access,singleplayer,action early access singleplayer
3,744170,starfighter neon,action,action,singleplayer,action action singleplayer
4,548050,runes,casual,casual,singleplayer,casual casual singleplayer


In [7]:
# Defino función para obtener la recomenación
def get_recommendations(app_name, cosine_sim=modelo):
    idx = data[data['app_name'] == app_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:7]  # Top 6 juegos similares, incluido él mismo
    game_indices = [i[0] for i in sim_scores]
    return data['app_name'].iloc[game_indices]



In [8]:
# Obtengo la recomendación para alguno de los juegos de la muesta aleatoria
# En el endpoint corrijo el código, para sacar de la lista el juego consultado
recommendations = get_recommendations('pi')
print(recommendations)

20                           sin city
84                              red 2
199                                pi
228          gina yashere skinny btch
244    eddie griffinfreedom of speech
245           josh blue sticky change
Name: app_name, dtype: object
