# Convertir valores a Dummies para entrenamiento de modelo

Para llegar a este paso es importante haber realizado todos los pasos previos en orden (si se desea revisar todo en orden):
1. Ejecutar todos los archivos de la carpeta ETL sin importar el orden.
2. Ejecutar el archivo ../Feature Engineering/sentiment_analysis.ipynb
3. Ejecutar mínimo el archivo ../EDA/EDA.ipynb (un ETL final, los otros archivos permiten realizar una visualización del comportamiento de los datos).

## Importar Librerías

In [1]:
import os
import sys
import pandas as pd
import warnings
import gzip
import numpy as np

# Ignorar advertencias
warnings.filterwarnings('ignore')

In [2]:
# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Navegar hacia el directorio raíz del proyecto
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Agregar la ruta del proyecto al sys.path
sys.path.append(project_root)

In [3]:
# importar función personalizada de ./VideoGameRecommender/functions/EDA.py
# Esta función permite traer el archivo en el formato que se encuentre (CSV o Parquet)
from functions.EDA import get_file
from functions.ETL import export

## Revisiones previas

In [4]:
# se utiliza la función get_files() del archivo ../functions/EDA.py
df = get_file('games')

# visualizar
df.head()

Unnamed: 0,id,app_name,genres,specs,price,developer,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos
0,282010,Carmageddon Max Pack,classic,multi-player,9.99,stainless,1997.0,466.0,1.0,0.0,0.0,0.0,1.0
1,282010,Carmageddon Max Pack,classic,single-player,9.99,stainless,1997.0,466.0,1.0,0.0,0.0,0.0,1.0
2,282010,Carmageddon Max Pack,indie,multi-player,9.99,stainless,1997.0,466.0,1.0,0.0,0.0,0.0,1.0
3,282010,Carmageddon Max Pack,indie,single-player,9.99,stainless,1997.0,466.0,1.0,0.0,0.0,0.0,1.0
4,282010,Carmageddon Max Pack,action,multi-player,9.99,stainless,1997.0,466.0,1.0,0.0,0.0,0.0,1.0


In [5]:
# convertir los datos de playtime_total a int64
df['playtime_total'] = df['playtime_total'].astype('int64')

# convertir los datos de release_year a int16
df['release_year'] = df['release_year'].astype('int16')

# convertir los datos de recommend_pos, recommend_neg, review_neg, review_neu y review_pos a int
df['recommend_pos'] = df['recommend_pos'].astype('int16')
df['review_neu'] = df['review_neu'].astype('int16')
df['recommend_neg'] = df['recommend_neg'].astype('int16')
df['review_neg'] = df['review_neg'].astype('int16')
df['review_pos'] = df['review_pos'].astype('int16')

In [6]:
#Se reduce levemente el dataset eliminando desarrolladores con menos de 10 juegos
developer_counts = df['developer'].value_counts()

# Mantener solo desarrolladores con más de 10 juegos
common_developers = developer_counts[developer_counts > 10].index
df = df[df['developer'].isin(common_developers)]

# contar cantidad de registros
len(df)

29287

In [7]:
# ver tipos de datos
print(df.isna().sum())
# se eliminan todos los nulos por el bien de render
df.dropna(inplace=True)
df.info()

id                0
app_name          0
genres            0
specs             0
price             0
developer         0
release_year      0
playtime_total    0
recommend_pos     0
recommend_neg     0
review_neg        0
review_neu        0
review_pos        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 29287 entries, 0 to 35248
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29287 non-null  int64  
 1   app_name        29287 non-null  object 
 2   genres          29287 non-null  object 
 3   specs           29287 non-null  object 
 4   price           29287 non-null  float64
 5   developer       29287 non-null  object 
 6   release_year    29287 non-null  int16  
 7   playtime_total  29287 non-null  int64  
 8   recommend_pos   29287 non-null  int16  
 9   recommend_neg   29287 non-null  int16  
 10  review_neg      29287 non-null  int16  
 11  review_neu      29287 non-null  i

In [8]:
# eliminar filas donde tanto genres, specs y price sean nulos
df.dropna(subset=['genres', 'specs', 'price'], how='all', inplace=True)
print(df.isna().sum())

id                0
app_name          0
genres            0
specs             0
price             0
developer         0
release_year      0
playtime_total    0
recommend_pos     0
recommend_neg     0
review_neg        0
review_neu        0
review_pos        0
dtype: int64


In [9]:
# Tratar nulos. Condición para identificar las filas a eliminar
condition = (
    (df['playtime_total'] == 0) &
    (df['recommend_pos'] == 0) &
    (df['recommend_neg'] == 0) &
    (df['review_neg'] == 0) &
    (df['review_neu'] == 0) &
    (df['review_pos'] == 0)
)

# Eliminar filas donde las columnas especificadas son nulas y las condiciones son verdaderas
df_cleaned = df[~(condition & df[['genres', 'specs', 'price']].isnull().any(axis=1))]

# Verifica los resultados
df_cleaned.isna().sum()

id                0
app_name          0
genres            0
specs             0
price             0
developer         0
release_year      0
playtime_total    0
recommend_pos     0
recommend_neg     0
review_neg        0
review_neu        0
review_pos        0
dtype: int64

In [10]:
#convertir las columnas genres, specs y developer a string
df['genres'] = df['genres'].astype('str')
df['specs'] = df['specs'].astype('str')

In [11]:
# Eliminar developer
df.drop(['developer'], axis=1, inplace=True)
df.drop_duplicates(inplace=True)
df.head(3)

Unnamed: 0,id,app_name,genres,specs,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos
0,282010,Carmageddon Max Pack,classic,multi-player,9.99,1997,466,1,0,0,0,1
1,282010,Carmageddon Max Pack,classic,single-player,9.99,1997,466,1,0,0,0,1
2,282010,Carmageddon Max Pack,indie,multi-player,9.99,1997,466,1,0,0,0,1


## Dummies

In [12]:
# agrupar por id concatenando genres y specs
redux = df.groupby(['id', 'price', 'release_year', 'playtime_total', 'recommend_pos', 'recommend_neg', 'review_neg', 'review_neu', 'review_pos'], as_index=False).agg({
    'genres': lambda x: sorted(set(x)),
    'specs': lambda x: sorted(set(x)),
})

redux.head(3)

Unnamed: 0,id,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos,genres,specs
0,10,9.99,2000,747975,44,1,0,3,42,"[action, classic, competitive, e-sports, first...",[multi-player]
1,20,4.99,1999,72492,10,5,2,3,10,"[action, adventure, casual, class-based, class...",[multi-player]
2,30,4.99,2003,2392,3,1,0,1,3,"[action, class-based, classic, co-op, first-pe...",[multi-player]


In [13]:
# crear dummies para genres y specs
genre_dummies = redux['genres'].explode().str.get_dummies().groupby(level=0).sum().reset_index()
specs_dummies = redux['specs'].explode().str.get_dummies().groupby(level=0).sum().reset_index()

# concatenar las columnas categóricas a los datos originales y eliminar las columnas de genres y specs
dummies = pd.concat([redux, genre_dummies], axis=1).drop('genres', axis=1)
dummies = pd.concat([dummies, specs_dummies], axis=1).drop('specs', axis=1)

In [14]:
dummies.head()

Unnamed: 0,id,price,release_year,playtime_total,recommend_pos,recommend_neg,review_neg,review_neu,review_pos,index,...,utilities,visual novel,war,wrestling,index.1,cross-platform multiplayer,local multi-player,multi-player,online multi-player,single-player
0,10,9.99,2000,747975,44,1,0,3,42,0,...,0,0,0,0,0,0,0,1,0,0
1,20,4.99,1999,72492,10,5,2,3,10,1,...,0,0,0,0,1,0,0,1,0,0
2,30,4.99,2003,2392,3,1,0,1,3,2,...,0,0,1,0,2,0,0,1,0,0
3,40,4.99,2001,145,1,0,0,0,1,3,...,0,0,0,0,3,0,0,1,0,0
4,50,4.99,1999,1234,2,1,0,1,2,4,...,0,0,0,0,4,0,0,1,0,1


In [15]:
# llenar dummies vacíos con 0
dummies.fillna(0, inplace=True)

In [16]:
# Combinar las columnas duplicadas
dummies = dummies.groupby(dummies.columns, axis=1).agg(lambda x: x.ffill().bfill().iloc[:, 0])

In [17]:

# exportar los dummies con la función export en ../functions/ETL.py
export(dummies, project_root, 'dummies')

Archivos exportados exitosamente.
