In [1]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:
df_steam_games = pq.read_table('df_steam_games_limpio_final.parquet').to_pandas()

In [3]:
# Selección de columnas relevantes
data = df_steam_games[['id', 'app_name', 'genres', 'tags', 'specs']]

In [4]:
# Limpieza y preprocesamiento de texto
data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']


In [5]:
data.shape

(2020050, 6)

In [6]:
data = data.groupby('id').first().reset_index()

In [7]:
data = data[data['id']!=0]

In [8]:
data['app_name'].nunique()

30708

In [9]:
data = data.sample(n=3500)

In [10]:
data.shape

(3500, 6)

In [11]:
data.reset_index(drop=True, inplace=True)

In [12]:
data

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,446610,bunny bounce,adventure,nudity,single player,adventure nudity single player
1,382350,devils bluff,action,indie,multi player,action indie multi player
2,723880,new world: the tupis,action,early access,single player,action early access single player
3,80000,apox,action,strategy,single player,action strategy single player
4,620780,fantasy grounds blue dragons (token pack),indie,strategy,multi player,indie strategy multi player
...,...,...,...,...,...,...
3495,550335,arrange costume set elphelt,action,action,single player,action action single player
3496,706210,haul asteroid,action,indie,single player,action indie single player
3497,733230,cyber arena,none,action,single player,none action single player
3498,645690,fantasia of the wind 风之幻想曲,adventure,adventure,single player,adventure adventure single player


In [13]:
data.head(10)

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,446610,bunny bounce,adventure,nudity,single player,adventure nudity single player
1,382350,devils bluff,action,indie,multi player,action indie multi player
2,723880,new world: the tupis,action,early access,single player,action early access single player
3,80000,apox,action,strategy,single player,action strategy single player
4,620780,fantasy grounds blue dragons (token pack),indie,strategy,multi player,indie strategy multi player
5,341440,banzai pecan: the last hope for the young century,action,action,single player,action action single player
6,741090,mountains clouds jewel match,adventure,strategy,single player,adventure strategy single player
7,533601,riders of icarus: heroic grim riders package,adventure,adventure,single player,adventure adventure single player
8,346440,musclecar online,casual,racing,multi player,casual racing multi player
9,291690,empathy: path of whispers,adventure,adventure,single player,adventure adventure single player


In [14]:
data['app_name'].value_counts()

app_name
colony                           2
bunny bounce                     1
m.a.c.e. tower defense           1
reus                             1
moshe kasher: live in oakland    1
                                ..
supermagical                     1
fate of the world: denial        1
olsons boxing challenge          1
rogue warrior                    1
where is my heart                1
Name: count, Length: 3499, dtype: int64

In [16]:
import pyarrow.parquet as pq
data.to_parquet('df_reconocimiento.parquet')

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
CV = CountVectorizer()
converted_metrix = CV.fit_transform(data['combined_features'])

In [19]:
cosine_similarity = cosine_similarity(converted_metrix)

In [21]:
def get_recommendations(app_name, cosine_sim=cosine_similarity ):
    idx = data[data['app_name'] == app_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Top 10 juegos similares
    game_indices = [i[0] for i in sim_scores]
    return data['app_name'].iloc[game_indices]



In [23]:
# guardo la matriz

import joblib #pickle es alternativa

joblib.dump(cosine_similarity, 'Matriz.pkl', compress=1)

['Matriz.pkl']

In [25]:
import joblib
with open('Matriz.pkl', 'rb') as file:
        modelo = joblib.load(file)

data = pd.read_parquet('df_reconocimiento.parquet')

In [26]:
data.shape

(3500, 6)

In [27]:
recommendations = get_recommendations('colony')
print(recommendations)

40     moe mekuri sp  moe mekuri 1 vocal collection 
83                                     wobbly jungle
95                             the depths of tolagal
98                         madballs in babo:invasion
120                                   warlocks tower
Name: app_name, dtype: object
