In [1]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:
df_steam_games = pq.read_table('df_steam_games_limpio_final.parquet').to_pandas()

In [3]:
# Selección de columnas relevantes
data = df_steam_games[['id', 'app_name', 'genres', 'tags', 'specs']]

In [4]:
# Limpieza y preprocesamiento de texto
data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['combined_features'] = data['genres'] + ' ' + data['tags'] + ' ' + data['specs']


In [5]:
data.shape

(2020050, 6)

In [6]:
data = data.groupby('id').first().reset_index()

In [7]:
data = data[data['id']!=0]

In [8]:
data['app_name'].nunique()

30708

In [9]:
data = data.sample(n=3500)

In [10]:
data.shape

(3500, 6)

In [11]:
data.reset_index(drop=True, inplace=True)

In [12]:
data

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,509711,rocksmith® 2014 edition – remastered – evanesc...,casual,casual,single player,casual casual single player
1,647460,adventures in comedy,none,movie,captions available,none movie captions available
2,669340,the royal cosmonautical society,action,adventure,single player,action adventure single player
3,643460,tane dlc: bnsf gp38 2 pumpkins (2 pack),simulation,simulation,single player,simulation simulation single player
4,261490,bedlam,action,action,single player,action action single player
...,...,...,...,...,...,...
3495,670130,jam studio vr,none,casual,single player,none casual single player
3496,15310,the settlers®: heritage of kings,strategy,strategy,single player,strategy strategy single player
3497,357370,return null episode 1,indie,interactive fiction,single player,indie interactive fiction single player
3498,647740,airmen,action,action,multi player,action action multi player


In [13]:
data.head(10)

Unnamed: 0,id,app_name,genres,tags,specs,combined_features
0,509711,rocksmith® 2014 edition – remastered – evanesc...,casual,casual,single player,casual casual single player
1,647460,adventures in comedy,none,movie,captions available,none movie captions available
2,669340,the royal cosmonautical society,action,adventure,single player,action adventure single player
3,643460,tane dlc: bnsf gp38 2 pumpkins (2 pack),simulation,simulation,single player,simulation simulation single player
4,261490,bedlam,action,action,single player,action action single player
5,296210,brigade e5: new jagged union,simulation,strategy,single player,simulation strategy single player
6,444290,28 waves later,action,action,single player,action action single player
7,206152,rocksmith finger eleven paralyzer,casual,casual,single player,casual casual single player
8,406920,monsterland,action,indie,single player,action indie single player
9,487370,akin,casual,casual,single player,casual casual single player


In [14]:
data['app_name'].value_counts()

app_name
rocksmith® 2014 edition – remastered – evanescence   “going under”    1
madballs campaign deserters pack                                      1
slimebrawl                                                            1
ski sport: jumping vr                                                 1
marvel vs. capcom: infinite   ryu wanderer costume                    1
                                                                     ..
rocksmith® 2014 – tom petty   “i won’t back down”                     1
rocksmith® 2014 edition – remastered – halestorm   “mz. hyde”         1
darkest hour: europe 44 45                                            1
gearcity                                                              1
displayfusion                                                         1
Name: count, Length: 3500, dtype: int64

In [15]:
import pyarrow.parquet as pq
data.to_parquet('df_endopoint6.parquet')

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
CV = CountVectorizer()
converted_metrix = CV.fit_transform(data['combined_features'])

In [18]:
cosine_similarity = cosine_similarity(converted_metrix)

In [19]:
def get_recommendations(app_name, cosine_sim=cosine_similarity ):
    idx = data[data['app_name'] == app_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Top 10 juegos similares
    game_indices = [i[0] for i in sim_scores]
    return data['app_name'].iloc[game_indices]



In [20]:
# guardo la matriz

import joblib #pickle es alternativa

joblib.dump(cosine_similarity, 'Matriz1.pkl', compress=1)

['Matriz1.pkl']

In [23]:
import joblib
with open('Matriz1.pkl', 'rb') as file:
        modelo = joblib.load(file)

data = pd.read_parquet('df_endpoint6.parquet')

In [24]:
data.shape

(3500, 6)

In [26]:
recommendations = get_recommendations('marvel vs. capcom: infinite   ryu wanderer costume')
print(recommendations)

6                              28 waves later
10                              fleeting ages
12                  tomb raider: pistol burst
13               kung fu panda: panda village
41    dead rising 2   soldier of fortune pack
Name: app_name, dtype: object
