# ETL - User items

In [1]:
# Import the required libraries.
import gzip
import ast
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq

### Abrimos el archivo

In [2]:
# Location of the gzip-compressed users/items dump.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it from a configurable data directory.
items_json = 'C:\\Users\\fedez\\OneDrive\\Escritorio\\PI-MLOps\\users_items.json.gz'
# Parse the file into a list with one Python object per non-blank line.
with gzip.open(items_json, 'rt', encoding='utf-8') as archivo:
    # Iterate over the handle directly so lines are decompressed and parsed one at a time,
    # instead of first materialising the whole file with readlines().
    # Each line is evaluated with ast.literal_eval (presumably because the records are Python-literal
    # dicts rather than strict JSON - confirm against the file). Blank lines are skipped because
    # literal_eval raises on empty input.
    items = [ast.literal_eval(line) for line in archivo if line.strip()]

In [3]:
# Build the working DataFrame: one row per parsed record, one column per record key.
df_items = pd.DataFrame(data=items)
# Leave the frame as the cell's last expression so the notebook renders a (truncated) preview.
df_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [4]:
# Print a concise summary of the freshly loaded frame: column names, non-null counts, dtypes and memory usage.
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


### Explotación de la columna "items".

In [5]:
# Start un-nesting: explode() emits one row per element of each "items" list,
# repeating the user-level columns (and the original index label) for every element.
df_items_explotado = df_items.explode('items')
# Flatten every exploded element into its own columns, on a fresh 0..n-1 index.
df_items_normalizado = pd.json_normalize(df_items_explotado['items']).reset_index(drop=True)
# Keep only the user-level columns, also on a fresh 0..n-1 index, so that both
# frames line up row by row when they are concatenated.
df_drop_items = df_items_explotado.drop(columns=['items']).reset_index(drop=True)
# Put the user-level columns and the flattened item columns side by side.
df_items = pd.concat([df_drop_items, df_items_normalizado], axis='columns')
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0


### Eliminación de columnas innecesarias.

In [6]:
# Drop the columns this notebook treats as unnecessary:
# "steam_id", "user_url", "item_name" and "playtime_2weeks".
df_items = df_items.drop(columns=['steam_id', 'user_url', 'item_name', 'playtime_2weeks'])
# Show one random row as a sanity check. The fixed random_state makes the displayed row
# reproducible across "Restart & Run All" executions (the value 42 itself is arbitrary).
df_items.sample(random_state=42)

Unnamed: 0,user_id,items_count,item_id,playtime_forever
3715956,wookielol,779,58540,516.0


### Eliminación de valores nulos.

In [12]:
# Discard every row that has a missing value in any column (the dropna defaults, spelled out).
df_items = df_items.dropna(axis='index', how='any')

### Eliminación de valores duplicados.

In [14]:
# Keep a single row per ("user_id", "item_id") pair; the first occurrence is the one retained.
df_items = df_items.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [15]:
# Re-inspect the frame after the cleaning steps: number of entries, column dtypes and memory usage.
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5094082 entries, 0 to 5170013
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           object 
 3   playtime_forever  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 194.3+ MB


### Manejo de outliers.

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns,
# as a first look at potential outliers.
df_items.describe()

Unnamed: 0,items_count,playtime_forever
count,5094082.0,5094082.0
mean,315.7332,990.507
std,696.9087,5414.098
min,1.0,0.0
25%,71.0,0.0
50%,137.0,34.0
75%,275.0,355.0
max,7762.0,642773.0
