# Convertir valores a Dummies para entrenamiento de modelo

Para llegar a este paso es importante haber realizado todos los pasos previos en orden (si se desea revisar todo en orden):
1. Ejecutar todos los archivos de la carpeta ETL sin importar el orden.
2. Ejecutar el archivo ../Feature Engineering/sentiment_analysis.ipynb
3. Ejecutar mínimo el archivo ../EDA/EDA.ipynb (un ETL final, los otros archivos permiten realizar una visualización del comportamiento de los datos).

## Importar Librerías

In [18]:
import os
import sys
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Ignorar advertencias
warnings.filterwarnings('ignore')

In [19]:
# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Navegar hacia el directorio raíz del proyecto
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Agregar la ruta del proyecto al sys.path
sys.path.append(project_root)

In [20]:
# importar función personalizada de ./VideoGameRecommender/functions/EDA.py
# Esta función permite traer el archivo en el formato que se encuentre (CSV o Parquet)
from functions.EDA import get_file
from functions.ETL import export

## Revisiones previas

In [21]:
# se utiliza la función get_files() del archivo ../functions/EDA.py
df = get_file('games')

# visualizar
df.head()

Unnamed: 0,id,app_name,genres,specs,price,developer,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos
0,761140,Lost Summoner Kitty,action,single-player,4.99,kotoshiro,2018.0,0.0,0.0,0.0,0.0,0.0,0.0
1,761140,Lost Summoner Kitty,simulation,single-player,4.99,kotoshiro,2018.0,0.0,0.0,0.0,0.0,0.0,0.0
2,761140,Lost Summoner Kitty,indie,single-player,4.99,kotoshiro,2018.0,0.0,0.0,0.0,0.0,0.0,0.0
3,761140,Lost Summoner Kitty,casual,single-player,4.99,kotoshiro,2018.0,0.0,0.0,0.0,0.0,0.0,0.0
4,761140,Lost Summoner Kitty,strategy,single-player,4.99,kotoshiro,2018.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# convertir los datos de playtime_total a int64
df['playtime_total'] = df['playtime_total'].astype('int64')

# convertir los datos de release_year a int16
df['release_year'] = df['release_year'].astype('int16')

# convertir los datos de recommend_pos, recommend_neg, review_neg, review_neu y review_pos a int
df['recommend_pos'] = df['recommend_pos'].astype('int16')
df['review_neu'] = df['review_neu'].astype('int16')
df['recommend_neg'] = df['recommend_neg'].astype('int16')
df['review_neg'] = df['review_neg'].astype('int16')
df['review_pos'] = df['review_pos'].astype('int16')

In [23]:
#Se reduce levemente el dataset eliminando desarrolladores con menos de 10 juegos
developer_counts = df['developer'].value_counts()

# Mantener solo desarrolladores con más de 10 juegos
common_developers = developer_counts[developer_counts > 10].index
df = df[df['developer'].isin(common_developers)]

# contar cantidad de registros
len(df)

164149

In [24]:
# ver tipos de datos
print(df.isna().sum())
df.info()

id                   0
app_name             0
genres             420
specs             5099
price              446
developer            0
release_year         0
playtime_total       0
recommend_pos        0
recommend_neg        0
review_neg           0
review_neu           0
review_pos           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 164149 entries, 5 to 198486
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              164149 non-null  int64  
 1   app_name        164149 non-null  object 
 2   genres          163729 non-null  object 
 3   specs           159050 non-null  object 
 4   price           163703 non-null  float64
 5   developer       164149 non-null  object 
 6   release_year    164149 non-null  int16  
 7   playtime_total  164149 non-null  int64  
 8   recommend_pos   164149 non-null  int16  
 9   recommend_neg   164149 non-null  int16  
 10  review_neg      164149 no

In [25]:
# eliminar filas donde tanto genres, specs y price sean nulos
df.dropna(subset=['genres', 'specs', 'price'], how='all', inplace=True)
print(df.isna().sum())

id                   0
app_name             0
genres             406
specs             5085
price              432
developer            0
release_year         0
playtime_total       0
recommend_pos        0
recommend_neg        0
review_neg           0
review_neu           0
review_pos           0
dtype: int64


In [26]:
# Tratar nulos. Condición para identificar las filas a eliminar
condition = (
    (df['playtime_total'] == 0) &
    (df['recommend_pos'] == 0) &
    (df['recommend_neg'] == 0) &
    (df['review_neg'] == 0) &
    (df['review_neu'] == 0) &
    (df['review_pos'] == 0)
)

# Eliminar filas donde las columnas especificadas son nulas y las condiciones son verdaderas
df_cleaned = df[~(condition & df[['genres', 'specs', 'price']].isnull().any(axis=1))]

# Verifica los resultados
df_cleaned.isna().sum()

id                  0
app_name            0
genres              6
specs             906
price             208
developer           0
release_year        0
playtime_total      0
recommend_pos       0
recommend_neg       0
review_neg          0
review_neu          0
review_pos          0
dtype: int64

In [27]:
#convertir las columnas genres, specs y developer a string
df['genres'] = df['genres'].astype('str')
df['specs'] = df['specs'].astype('str')

In [28]:
# Eliminar developer
df.drop(['developer'], axis=1, inplace=True)
df.drop_duplicates(inplace=True)
df.head(3)

Unnamed: 0,id,app_name,genres,specs,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos
5,643980,Ironbound,tactical,multi-player,0.0,2018,0,0,0,0,0,0
6,643980,Ironbound,tactical,cross-platform multiplayer,0.0,2018,0,0,0,0,0,0
7,643980,Ironbound,tactical,single-player,0.0,2018,0,0,0,0,0,0


## Dummies

In [29]:
# agrupar por id concatenando genres y specs
redux = df.groupby(['id', 'price', 'release_year', 'playtime_total', 'recommend_pos', 'recommend_neg', 'review_neg', 'review_neu', 'review_pos'], as_index=False).agg({
    'genres': lambda x: sorted(set(x)),
    'specs': lambda x: sorted(set(x)),
})

redux.head(3)

Unnamed: 0,id,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos,genres,specs
0,10,9.99,2000,17386015,56,1,0,4,53,"[action, classic, competitive, e-sports, first...",[multi-player]
1,20,4.99,1999,961702,11,6,2,4,11,"[action, adventure, casual, class-based, class...",[multi-player]
2,30,4.99,2003,758991,3,1,0,1,3,"[action, class-based, classic, co-op, first-pe...",[multi-player]


In [30]:
# crear dummies para genres y specs
genre_dummies = redux['genres'].explode().str.get_dummies().groupby(level=0).sum().reset_index()
specs_dummies = redux['specs'].explode().str.get_dummies().groupby(level=0).sum().reset_index()

# concatenar las columnas categóricas a los datos originales y eliminar las columnas de genres y specs
dummies = pd.concat([redux, genre_dummies], axis=1).drop('genres', axis=1)
dummies = pd.concat([dummies, specs_dummies], axis=1).drop('specs', axis=1)

In [31]:
dummies.head()

Unnamed: 0,id,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos,index,...,visual novel,war,wrestling,index.1,cross-platform multiplayer,local multi-player,multi-player,nan,online multi-player,single-player
0,10,9.99,2000,17386015,56,1,0,4,53,0,...,0,0,0,0,0,0,1,0,0,0
1,20,4.99,1999,961702,11,6,2,4,11,1,...,0,0,0,1,0,0,1,0,0,0
2,30,4.99,2003,758991,3,1,0,1,3,2,...,0,1,0,2,0,0,1,0,0,0
3,40,4.99,2001,154486,1,0,0,0,1,3,...,0,0,0,3,0,0,1,0,0,0
4,50,4.99,1999,734562,3,1,0,1,3,4,...,0,0,0,4,0,0,1,0,0,1


In [32]:
# llenar dummies vacíos con 0
dummies.fillna(0, inplace=True)

In [33]:
# Combinar las columnas duplicadas
dummies = dummies.groupby(dummies.columns, axis=1).agg(lambda x: x.ffill().bfill().iloc[:, 0])

In [34]:
# exportar los dummies con la función export en ../functions/ETL.py
export(dummies, project_root, 'dummies')

Archivos exportados exitosamente.
