Proceso ETL del archivo JSON output_steam_games

In [2]:
#Importamos las librerias necesarias para poder realizar el proceso de ETL
import pandas as pd
import json
import numpy as np

In [3]:
# --- Extraction ---
# The source is a JSON Lines file (one JSON document per line).
# Forward slashes are used so the path works on Windows, Linux and macOS; a
# backslash inside a normal string literal is parsed as an escape sequence
# ("\o" is an invalid one and triggers a warning on recent Python versions).
file_path = 'Datasets/output_steam_games.json'

# Number of leading records discarded before any further processing.
# NOTE(review): presumably these rows are empty placeholders — confirm against the raw file.
ROWS_TO_SKIP = 88310

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        # Each line is parsed independently with json.loads.
        data = [json.loads(line) for line in file]

    # Build the DataFrame from the list of dictionaries and discard the first
    # ROWS_TO_SKIP rows by position. A freshly built frame has a 0..n-1 index,
    # so this removes exactly the rows the label-based drop used to remove.
    data_output_games = pd.DataFrame(data).iloc[ROWS_TO_SKIP:]

# The handlers only report the problem; when one of them fires,
# data_output_games is not defined and the following cells cannot run.
except FileNotFoundError:
    print(f"El archivo '{file_path}' no existe.")

except json.JSONDecodeError as e:
    print(f"Error al decodificar JSON: {e}")

except Exception as e:
    print(f"Ocurrió un error: {e}")

In [4]:
# Starting from the extracted frame, drop the columns that will not help the
# machine-learning stage and renumber the rows from 0.
data_games = (
    data_output_games
    .drop(columns=['url', 'reviews_url', 'publisher', 'title'])
    .reset_index(drop=True)
)
# Take the first run of four digits in 'release_date' as the release year.
# expand=False makes str.extract return a Series rather than a one-column frame.
data_games['Year'] = data_games['release_date'].str.extract(r'(\d{4})', expand=False)
# Columns to keep, in display order. 'release_date' is intentionally left out:
# 'Year' takes its place.
new_columns_order = ['id', 'app_name', 'developer', 'genres', 'tags', 'specs', 'Year', 'early_access', 'price']
# Final column names. 'Year' already has its final name, so it is not listed.
new_names_columns = {
    'id': 'Id_item',
    'app_name': 'App_name',
    'developer': 'Developer',
    'genres': 'Genres',
    'tags': 'Tags',
    'specs': 'Specifications',
    'early_access': 'Early_access',
    'price': 'Price',
}
# Select and rename in one step, producing a new frame instead of renaming a
# column subset in place.
data_games = data_games[new_columns_order].rename(columns=new_names_columns)

In [5]:
# Count the missing values in each column.
data_games.isna().sum()

Id_item              2
App_name             2
Developer         3299
Genres            3283
Tags               163
Specifications     670
Year              2168
Early_access         0
Price             1377
dtype: int64

In [6]:
# Preview the first five rows of the renamed frame.
data_games.head(5)

Unnamed: 0,Id_item,App_name,Developer,Genres,Tags,Specifications,Year,Early_access,Price
0,761140,Lost Summoner Kitty,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]","[Strategy, Action, Indie, Casual, Simulation]",[Single-player],2018.0,False,4.99
1,643980,Ironbound,Secret Level SRL,"[Free to Play, Indie, RPG, Strategy]","[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",2018.0,False,Free To Play
2,670290,Real Pool 3D - Poolians,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]","[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",2017.0,False,Free to Play
3,767400,弹炸人2222,彼岸领域,"[Action, Adventure, Casual]","[Action, Adventure, Casual]",[Single-player],2017.0,False,0.99
4,773570,Log Challenge,,,"[Action, Indie, Casual, Sports]","[Single-player, Full controller support, HTC V...",,False,2.99


In [7]:
# Select every row that has at least one missing value.
rows_with_nan = data_games.loc[data_games.isna().any(axis=1)]

# Report how many rows are affected.
num_rows_with_nan = len(rows_with_nan)
print(num_rows_with_nan)


4967


In [8]:
# Work on a copy so data_games stays available as a backup of the state so
# far, and discard every row that has a missing value in any column.
data_games_cleaned = data_games.copy().dropna()
# List the distinct values of Price; the output shows text labels mixed in
# with the numeric prices, which are handled in the next step.
data_games_cleaned['Price'].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 3.99, 9.99, 18.99,
       29.99, 10.99, 2.99, 1.59, 14.99, 1.99, 59.99, 8.99, 6.99, 7.99,
       39.99, 'Free', 19.99, 7.49, 12.99, 5.99, 2.49, 15.99, 1.25, 24.99,
       17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo', 'Play for Free!',
       34.99, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99, 79.99, 49.99, 5.0,
       13.98, 109.99, 149.99, 771.71, 'Install Now', 21.99, 89.99,
       'Play WARMACHINE: Tactics Demo', 0.98, 139.92, 4.29, 'Free Mod',
       54.99, 64.99, 74.99, 'Install Theme', 0.89, 'Third-party', 0.5,
       'Play Now', 299.99, 1.29, 119.99, 44.99, 3.0, 15.0, 5.49, 23.99,
       1.39, 'Free HITMAN™ Holiday Pack', 36.99, 4.49, 2.0, 4.0, 1.95,
       1.5, 6.66, 27.99, 26.99, 399.99, 31.99, 20.0, 40.0, 3.33, 22.99,
       38.85, 71.7, 995.0, 27.49, 3.39, 6.0, 19.95, 20.99, 499.99, 199.99,
       16.06, 4.68, 131.4, 44.98, 202.76, 2.3, 0.95, 172.24, 249.99, 2.97,
       10.96, 10.0, 30.0, 2.66, 6.48, 1.0, 11.15, 'Play the Demo

In [9]:
# --- Cleaning of the Price column ---
# Price mixes numbers with free-text labels. Build a text-only view (non-string
# values become '') so the .str accessor works even if every value is numeric,
# e.g. when this cell is executed a second time.
price_text = data_games_cleaned['Price'].map(
    lambda value: value if isinstance(value, str) else ''
)
is_free = price_text.str.contains('Free', case=False)   # label mentions "free"
is_label = price_text.str.contains(r'[a-zA-Z:]')        # label contains a letter or ':'

# Step 1: drop the rows whose price is a text label that does not mention "free".
rows_to_drop = data_games_cleaned[is_label & ~is_free]
keep = ~is_label | is_free
data_games_cleaned = data_games_cleaned[keep].copy()

# Step 2: every label that mentions "free" becomes 0. The same case-insensitive
# test as in step 1 is used, so no kept "free" label can fall through and be
# coerced to NaN below (a fixed list of spellings would miss unseen variants).
price_values = data_games_cleaned['Price'].mask(is_free[keep], 0)

# Step 3: convert to numeric; anything still non-numeric becomes NaN ('coerce').
data_games_cleaned['Price'] = pd.to_numeric(price_values, errors='coerce')



In [10]:
# Year transformation: cast the Year column to integers, leaving the other
# columns untouched.
data_games_cleaned = data_games_cleaned.astype({'Year': int})

In [11]:
# --- Cleaning and transformation of the Genres column ---
def _first_genre(value):
    """Return the first entry of a genre list.

    An empty list yields '' and any non-list value is returned unchanged, so
    the cell can be executed again on an already-transformed column.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if value else ''
    return value


# Take the first genre straight from each list. Going through the list's text
# representation (strip brackets, split on ',', remove quotes) would corrupt
# genre names that contain an apostrophe or a comma.
data_games_cleaned['Genres'] = data_games_cleaned['Genres'].map(_first_genre)

# Show the resulting DataFrame
data_games_cleaned


Unnamed: 0,Id_item,App_name,Developer,Genres,Tags,Specifications,Year,Early_access,Price
0,761140,Lost Summoner Kitty,Kotoshiro,Action,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],2018,False,4.99
1,643980,Ironbound,Secret Level SRL,Free to Play,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",2018,False,0.00
2,670290,Real Pool 3D - Poolians,Poolians.com,Casual,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",2017,False,0.00
3,767400,弹炸人2222,彼岸领域,Action,"[Action, Adventure, Casual]",[Single-player],2017,False,0.99
5,772540,Battle Royale Trainer,Trickjump Games Ltd,Action,"[Action, Adventure, Simulation, FPS, Shooter, ...","[Single-player, Steam Achievements]",2018,False,3.99
...,...,...,...,...,...,...,...,...,...
32129,745400,Kebab it Up!,Bidoniera Games,Action,"[Action, Indie, Casual, Violent, Adventure]","[Single-player, Steam Achievements, Steam Cloud]",2018,False,1.99
32130,773640,Colony On Mars,"Nikita ""Ghost_RUS""",Casual,"[Strategy, Indie, Casual, Simulation]","[Single-player, Steam Achievements]",2018,False,1.99
32131,733530,LOGistICAL: South Africa,Sacada,Casual,"[Strategy, Indie, Casual]","[Single-player, Steam Achievements, Steam Clou...",2018,False,4.99
32132,610660,Russian Roads,Laush Dmitriy Sergeevich,Indie,"[Indie, Simulation, Racing]","[Single-player, Steam Achievements, Steam Trad...",2018,False,1.99


In [12]:
# Keep only the variables selected as most influential and remove rows that
# are exact duplicates across those columns.
selected_columns = ['Id_item', 'App_name', 'Developer', 'Genres', 'Year', 'Early_access', 'Price']
data_games_load = data_games_cleaned.loc[:, selected_columns].drop_duplicates()

In [13]:
# Preview the first five rows of the frame that will be exported.
data_games_load.head(5)

Unnamed: 0,Id_item,App_name,Developer,Genres,Year,Early_access,Price
0,761140,Lost Summoner Kitty,Kotoshiro,Action,2018,False,4.99
1,643980,Ironbound,Secret Level SRL,Free to Play,2018,False,0.0
2,670290,Real Pool 3D - Poolians,Poolians.com,Casual,2017,False,0.0
3,767400,弹炸人2222,彼岸领域,Action,2017,False,0.99
5,772540,Battle Royale Trainer,Trickjump Games Ltd,Action,2018,False,3.99


In [15]:
# Export the DataFrame to a CSV file without the index column.
# Forward slashes keep the path portable across operating systems; with
# backslashes, "\C" and "\G" are invalid escape sequences and the path only
# resolves to nested folders on Windows.
data_games_load.to_csv('Load_data/CSV/Games_output.csv', index=False)