## Importamos librerías necesarias

In [4]:
import json
import ast
import re
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [10]:
#cargamos los datos 
df_items = pd.read_parquet('D:\Potato\LABS Henry\PI_ML_OPS\ETL-EDA\Archivos\\australian_users_items.parquet')
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [11]:
df_items.shape

(88310, 5)

## Transformamos el dataset

Desanidamos la lista de la columna 'items'

In [12]:
#normalizamos la columna 'items'
clean_items = pd.json_normalize(df_items['items'].explode())
clean_items.head()

Unnamed: 0,item_id,item_name,playtime_2weeks,playtime_forever
0,10,Counter-Strike,0.0,6.0
1,20,Team Fortress Classic,0.0,0.0
2,30,Day of Defeat,0.0,7.0
3,40,Deathmatch Classic,0.0,0.0
4,50,Half-Life: Opposing Force,0.0,0.0


In [16]:
# unimos el df con el original
df_items = df_items.drop('items', axis= 1)
df_items = clean_items.join(df_items)

In [17]:
df_items.head(1)

Unnamed: 0,item_id,item_name,playtime_2weeks,playtime_forever,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,0.0,6.0,76561197970982479,277.0,76561197970982479,http://steamcommunity.com/profiles/76561197970...


## Buscamos duplicados

In [18]:
df_items.duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
5170010     True
5170011    False
5170012    False
5170013     True
5170014     True
Length: 5170015, dtype: bool

In [19]:
#eliminamos los duplicados 
df_items = df_items.drop_duplicates(keep='first')
df_items.head(5)

Unnamed: 0,item_id,item_name,playtime_2weeks,playtime_forever,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,0.0,6.0,76561197970982479,277.0,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0.0,0.0,js41637,888.0,76561198035864385,http://steamcommunity.com/id/js41637
2,30,Day of Defeat,0.0,7.0,evcentric,137.0,76561198007712555,http://steamcommunity.com/id/evcentric
3,40,Deathmatch Classic,0.0,0.0,Riot-Punch,328.0,76561197963445855,http://steamcommunity.com/id/Riot-Punch
4,50,Half-Life: Opposing Force,0.0,0.0,doctr,541.0,76561198002099482,http://steamcommunity.com/id/doctr


In [20]:
#eliminamos columnas que no nos serán utiles para el deploy, así liberamos espacio
df_items.drop(columns=['playtime_2weeks', 'user_url'], inplace= True)

In [21]:
#eliminamos nulos si existen
df_items.dropna(inplace=True)

In [30]:
df_items.head(5)

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count,steam_id
0,10,Counter-Strike,6,76561197970982479,277,76561197970982479
1,20,Team Fortress Classic,0,js41637,888,76561198035864385
2,30,Day of Defeat,7,evcentric,137,76561198007712555
3,40,Deathmatch Classic,0,Riot-Punch,328,76561197963445855
4,50,Half-Life: Opposing Force,0,doctr,541,76561198002099482


In [29]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88176 entries, 0 to 88309
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_id           88176 non-null  int32 
 1   item_name         88176 non-null  string
 2   playtime_forever  88176 non-null  int32 
 3   user_id           88176 non-null  object
 4   items_count       88176 non-null  int32 
 5   steam_id          88176 non-null  object
dtypes: int32(3), object(2), string(1)
memory usage: 3.7+ MB


In [25]:
# convertimos a tipo de datos las columnas para el modelo

df_items['item_id'] = df_items['item_id'].astype(int)

df_items = df_items.astype({'item_name': 'string'})

In [28]:
df_items['items_count'] = df_items['items_count'].astype(int)
df_items['playtime_forever'] = df_items['playtime_forever'].astype(int)


### Guardamos el DataFrame en archivo parquet para otpimizar su uso

In [31]:
df_items.to_csv('Archivos/user_items.csv', index=False, encoding='utf-8')

In [33]:
#pasamos el csv a parquet
df_items = pd.read_csv(r'D:\Potato\LABS Henry\PI_ML_OPS\ETL-EDA\Archivos\user_items.csv')

tabla = pa.Table.from_pandas(df_items)
pq.write_table(tabla, 'Archivos/user_items.parquet')