Creación del modelo de ML  

In [1]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
import pandas as pd
import joblib

In [4]:
df_steam_games = pq.read_table(r'../Colab/df_steam_games_limpio_final.parquet').to_pandas()

In [5]:
# elijo las columnas
data = df_steam_games[['id', 'app_name', 'genres', 'tags', 'specs']]

In [7]:
data.head(5)

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,761140,lost summoner kitty,action,strategy,single player,action strategy single player
1,761140,lost summoner kitty,action,action,single player,action action single player
2,761140,lost summoner kitty,action,indie,single player,action indie single player
3,761140,lost summoner kitty,action,casual,single player,action casual single player
4,761140,lost summoner kitty,action,simulation,single player,action simulation single player


In [6]:
# Limpieza y preprocesamiento de texto
data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']


In [8]:
# Elimino el explode realizado y me quedo con el primero de cada 'id' como muestra para achicar el tamaño del archivo
data = data.groupby('id').first().reset_index()

In [9]:
data.head()

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,0,none,none,none,none,none none none
1,10,counter strike,action,action,multi player,action action multi player
2,20,team fortress classic,action,action,multi player,action action multi player
3,30,day of defeat,action,fps,multi player,action fps multi player
4,40,deathmatch classic,action,action,multi player,action action multi player


In [10]:
# Elimino los juegos que tienen id=0
data = data[data['id']!=0]

In [12]:
# Cantidad de juegos diferentes
data['app_name'].nunique()

30708

In [13]:
# Tomo una muesta aleatoria de 2500 juegos con fines didacticos para reducir la matriz
# y asi reducir el tamaño del archivo
data = data.sample(n=2500)

In [14]:
data.reset_index(drop=True, inplace=True)

In [None]:
# Guardo los 2500 juegos seleccionados para su posterior uso
data.to_csv('juegos_escogidos.csv')

In [None]:
# Guardo el dataframe con compresión parquet
import pyarrow.parquet as pq
data.to_parquet('df_modelo.parquet')

In [None]:
# Instancio el modelo de similitud
CV = CountVectorizer()
converted_metrix = CV.fit_transform(data['combined_features'])

In [None]:
# Calculo la similitud entre los juegos con la distancia coseno
cosine_similarity = cosine_similarity(converted_metrix)

In [None]:
# Guardo la matriz con compresión .pkl
joblib.dump(cosine_similarity, 'Matriz.pkl', compress=1)

In [None]:
# Defino función para obtener la recomenación
def get_recommendations(app_name, cosine_sim=modelo):
    idx = data[data['app_name'] == app_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:7]  # Top 6 juegos similares, incluido él mismo
    game_indices = [i[0] for i in sim_scores]
    return data['app_name'].iloc[game_indices]