# ETL (Ingeniería de Datos)

### Importar Librerías

In [1]:
from warnings import filterwarnings

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides warnings raised by later
# cells (e.g. from pandas) — confirm this is intended.
filterwarnings("ignore")

import pandas as pd
import sys
import os

In [2]:
# Resolve the project root as the parent of the current working directory.
# NOTE(review): presumes the kernel's working directory sits one level below
# the project root — confirm for other launch setups.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

# Make the project's packages importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

Importar funciones para ETL (definidas en la carpeta `functions`, archivo `ETL.py`)

In [3]:
from functions.ETL import load_data, normalize, export # project helpers defined in functions/ETL.py

## Extracción de datos

Ruta del archivo:

In [4]:
# Relative location of the compressed raw dataset (../data/users_items.json.gz).
# Built with os.path.join so the separator matches the host OS; the previous
# hard-coded backslashes only formed a valid path on Windows.
path = os.path.join(os.pardir, 'data', 'users_items.json.gz')

Extracción y Visualización de datos

In [5]:
# Read the raw dataset through the project's load_data helper and preview
# the first rows of the resulting frame.
df = load_data(path) # helper defined in functions/ETL.py
df.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


## Transformación de los Datos

In [6]:
# Per-column tally of missing entries in the raw frame.
df.isnull().sum(axis=0)

user_id        0
items_count    0
steam_id       0
user_url       0
items          0
dtype: int64

Desanidación de la columna 'items' con la función normalize(df, col)

In [7]:
# Un-nest the 'items' column with the project's normalize helper and preview
# the first rows of the result.
items = normalize(df, 'items') # helper defined in functions/ETL.py
items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0


In [8]:
# Per-column tally of missing entries after un-nesting.
# Author's note: the nulls correspond to users with 0 items and no playtime.
items.isnull().sum(axis=0)

user_id                 0
items_count             0
steam_id                0
user_url                0
item_id             16806
item_name           16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

In [9]:
# Sanity check: list the rows whose two-week playtime is larger than the
# all-time playtime.
items.loc[items['playtime_forever'].lt(items['playtime_2weeks'])]

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
6406,devvonst,68,76561198062380099,http://steamcommunity.com/id/devvonst,17390,Spore,110.0,111.0
23668,PabloSanches,116,76561198047569011,http://steamcommunity.com/id/PabloSanches,105600,Terraria,579.0,759.0
41636,76561198066465411,38,76561198066465411,http://steamcommunity.com/profiles/76561198066...,462930,AdVenture Communist,473.0,474.0
42930,Albolah,11,76561198127777505,http://steamcommunity.com/id/Albolah,245470,Democracy 3,3180.0,3187.0
58092,76561198095380303,68,76561198095380303,http://steamcommunity.com/profiles/76561198095...,346110,ARK: Survival Evolved,8140.0,8168.0
...,...,...,...,...,...,...,...,...
4240375,76561198038659583,97,76561198038659583,http://steamcommunity.com/profiles/76561198038...,49520,Borderlands 2,1784.0,1812.0
4341163,76561198044420779,17,76561198044420779,http://steamcommunity.com/profiles/76561198044...,386360,SMITE,642.0,650.0
4412813,cholamaaisdaquindaprateouvir,36,76561198048478953,http://steamcommunity.com/id/cholamaaisdaquind...,57690,Tropico 4,11.0,17.0
4767770,vortixia,114,76561198070728487,http://steamcommunity.com/id/vortixia,359580,Uncanny Valley,11.0,13.0


In [10]:
# Reconcile playtime_forever with playtime_2weeks: the all-time figure should
# not be lower than the last-two-weeks figure, so each row keeps the larger of
# the two (the value is replaced, not summed).
# DataFrame.max skips NaN by default, so a missing playtime_forever falls back
# to a non-null playtime_2weeks; rows where both are missing stay NaN.
# This vectorised form replaces the former row-wise items.apply(..., axis=1),
# which ran a Python lambda once per row and left a NaN playtime_forever as is.
items['playtime_forever'] = items[['playtime_forever', 'playtime_2weeks']].max(axis=1)

In [11]:
# Repeat the consistency check; an empty frame means no row still has
# playtime_2weeks above playtime_forever.
items.loc[lambda frame: frame['playtime_2weeks'] > frame['playtime_forever']]

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks


### Eliminar columnas que no sirven

In [19]:
# Columns considered irrelevant: remove them and preview the slimmer frame.
# errors='ignore' skips any listed column that is no longer present.
cols = [
    'steam_id',
    'user_url',
    'item_name',
    'playtime_2weeks',
]
items = items.drop(cols, axis=1, errors='ignore')
items.head()

user_id                 0
items_count             0
item_id             16806
playtime_forever    16806
dtype: int64

Visualización de tipos de datos

In [20]:
# Print each column's dtype, then display the per-column count of missing values.
# A plain loop over items.dtypes replaces the former list comprehension, which
# was used only for its print side effect and built a throwaway list of None.
# Iterating the dtypes Series also works when column labels are duplicated.
for column, dtype in items.dtypes.items():
    print(column, dtype)
items.isna().sum()

user_id object
items_count int64
item_id object
playtime_forever float64


user_id                 0
items_count             0
item_id             16806
playtime_forever    16806
dtype: int64

In [14]:
# Drop every row that has a missing value. Per the author's note these are the
# rows with a null item_id, which coincide with the null playtime_forever rows.
# Rebinding the result (instead of inplace=True) avoids mutating the frame
# that earlier cells produced and displayed.
items = items.dropna()

In [15]:
# Final number of rows after cleaning.
items.shape[0]

5153209

## Carga de Datos

Se guarda el conjunto de datos procesado en formato Parquet y CSV, cada uno en su carpeta correspondiente, para utilizar uno u otro según la situación. Si los directorios no existen, se crean.

In [16]:
# Hand the cleaned frame to the project's export helper under the name 'user_items'.
export(items, project_root, 'user_items') # export helper from ./functions/ETL.py

Archivos exportados exitosamente.
