# ETL (Ingeniería de Datos)

### Importar Librerías

In [1]:
import warnings

# Ignorar todas las advertencias
warnings.filterwarnings("ignore")

import pandas as pd
import sys
import os

In [2]:
# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Navegar hacia el directorio raíz del proyecto
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Agregar la ruta del proyecto al sys.path
sys.path.append(project_root)

Importar funciones para ETL (funciones creadas en la carpeta function, archivo ETL)

In [3]:
from functions.ETL import load_data, normalize, export # (funciones creadas en la carpeta function, archivo ETL)

## Extracción de datos

Ruta del archivo:

In [4]:
path = r'..\data\user_reviews.json.gz'

Extracción y Visualización de datos

In [5]:
df = load_data(path)# (funciones creadas en la carpeta function, archivo ETL)
df.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


## Transformación de los Datos

Normalización del DataFrame:

Desanidación de la columna 'reviews' con la función normalize(df, col)

In [6]:
reviews = normalize(df, 'reviews') # (funciones creadas en la carpeta function, archivo ETL)
reviews.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


Comprobar tipos de datos

In [7]:
[print(c, reviews[c].dtype) for c in reviews.columns]

user_id object
user_url object
funny object
posted object
last_edited object
item_id object
helpful object
recommend object
review object


[None, None, None, None, None, None, None, None, None]

In [8]:
# Contar valores nulos. los 28 valores nulos corresponden a usuarios que no han realizado reviews.
reviews.isna().sum()

user_id         0
user_url        0
funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

### Eliminar Columnas

In [9]:
# guardamos los nmbres de las columnas a eliminar, las eliminamos y las visualizamos
cols = ['user_url', 'funny', 'last_edited', 'helpful']

reviews.drop(columns=cols, inplace=True)
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
2,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...


### Eiminar filas cuyo único dato es user_id

In [10]:
# Identificar filas que cumplen con la condición
rows_to_drop = reviews[['posted', 'item_id', 'recommend', 'review']].isna().all(axis=1)

# Eliminar las filas que cumplen con la condición y visualizar valores nulos
reviews.drop(index=reviews[rows_to_drop].index, inplace=True)
reviews.isna().sum()

user_id      0
posted       0
item_id      0
recommend    0
review       0
dtype: int64

### Normalizar columna posted para darle formato de fecha

In [11]:
# Reemplazar los substrings 'Posted ' y '.' por '' y visualizar
reviews['posted'] = reviews['posted'].str.replace('Posted ', '').str.replace('.', '')
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"November 5, 2011",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,"July 15, 2011",22200,True,It's unique and worth a playthrough.
2,76561197970982479,"April 21, 2011",43110,True,Great atmosphere. The gunplay can be a bit chu...


In [12]:
# Detectar las fechas incompletas (sin año)
reviews[reviews['posted'].str.len() < 11]

Unnamed: 0,user_id,posted,item_id,recommend,review
6,evcentric,February 3,248820,True,A suitably punishing roguelike platformer. Wi...
27,76561198079601835,May 20,730,True,ZIKA DO BAILE
28,MeaTCompany,July 24,730,True,BEST GAME IN THE BLOODY WORLD
31,76561198156664158,June 16,252950,True,love it
32,76561198077246154,June 11,440,True,mt bom
...,...,...,...,...,...
59328,76561198312638244,July 10,70,True,a must have classic from steam definitely wort...
59329,76561198312638244,July 8,362890,True,this game is a perfect remake of the original ...
59330,LydiaMorley,July 3,273110,True,had so much fun plaing this and collecting res...
59331,LydiaMorley,July 20,730,True,:D


In [13]:
# Crear una columna con datos booleanos indicando si la fecha está completa o incompleta
reviews['incomplete_date'] = reviews['posted'].apply(lambda x: len(x.split()) == 2 if x else False)
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review,incomplete_date
0,76561197970982479,"November 5, 2011",1250,True,Simple yet with great replayability. In my opi...,False
1,76561197970982479,"July 15, 2011",22200,True,It's unique and worth a playthrough.,False
2,76561197970982479,"April 21, 2011",43110,True,Great atmosphere. The gunplay can be a bit chu...,False


In [14]:
# Extraer años de las filas que tienen una fecha completa
# Usar regex para encontrar el año dentro del string de 'posted'
reviews['year'] = reviews['posted'].str.extract(r'(\d{4})').astype(float)

# Encontrar el último año registrado
last_year = int(reviews['year'].max())
last_year

2015

In [15]:
# Llenar las fechas incompletas ('incomplete_date' True) con el año siguiente
reviews.loc[reviews['incomplete_date'], 'posted'] = reviews.loc[reviews['incomplete_date'], 'posted'] + f", {last_year + 1}"

# Verificar las actualizaciones
reviews[reviews['incomplete_date']].head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review,incomplete_date,year
6,evcentric,"February 3, 2016",248820,True,A suitably punishing roguelike platformer. Wi...,True,
27,76561198079601835,"May 20, 2016",730,True,ZIKA DO BAILE,True,
28,MeaTCompany,"July 24, 2016",730,True,BEST GAME IN THE BLOODY WORLD,True,


In [16]:
# cambiar el formato de posted a tipo fecha y visualizar
reviews['posted'] = pd.to_datetime(reviews['posted'])
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review,incomplete_date,year
0,76561197970982479,2011-11-05,1250,True,Simple yet with great replayability. In my opi...,False,2011.0
1,76561197970982479,2011-07-15,22200,True,It's unique and worth a playthrough.,False,2011.0
2,76561197970982479,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...,False,2011.0


In [17]:
# Eliminar columna incomplete_date y year y visualizar
reviews.drop(columns=['incomplete_date', 'year'], inplace=True)
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011-11-05,1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,2011-07-15,22200,True,It's unique and worth a playthrough.
2,76561197970982479,2011-04-21,43110,True,Great atmosphere. The gunplay can be a bit chu...


### Reemplazar valores en la columna recommend (True = 1, False = 0)

In [18]:
# reemplazar los valores de la columna recommend y passarlo a valores numéricos
reviews['recommend'] = reviews['recommend'].astype(int)
reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011-11-05,1250,1,Simple yet with great replayability. In my opi...
1,76561197970982479,2011-07-15,22200,1,It's unique and worth a playthrough.
2,76561197970982479,2011-04-21,43110,1,Great atmosphere. The gunplay can be a bit chu...


### Terminar definición de tipos de datos

In [19]:
# visualizar ti
[print(c, reviews[c].dtype) for c in reviews.columns]

user_id object
posted datetime64[ns]
item_id object
recommend int64
review object


[None, None, None, None, None]

In [20]:
# convertir la columna item_id a int, y review y user_id a str
reviews['item_id'] = reviews['item_id'].astype(int)
reviews['review'] = reviews['review'].astype(str)
reviews['user_id'] = reviews['user_id'].astype(str)

# visualizar tipos de datoa
[print(c, reviews[c].dtype) for c in reviews.columns]

user_id object
posted datetime64[ns]
item_id int64
recommend int64
review object


[None, None, None, None, None]

## Carga de Datos

Se guarda el archivo trabajado en formato parquet y CSV en sus carpetas correspondientes para ser trabajados de acuerdo a la situación. Si los directorios no existen, se crean.

In [21]:
# Exportar el DataFrame a un archivo parquet y CSV.
# Función export() en ./functions/ETL.js
export(reviews, project_root, 'user_reviews')

Archivos exportados exitosamente.
