In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the file paths used below live under it.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json

# Read a JSON Lines file (one JSON document per line), parse every line and
# collect the parsed objects in the list json_lista.

ruta_archivo = '/content/drive/My Drive/PI MLOps - STEAM/output_steam_games.json'

json_lista = []
lineas_invalidas = 0

# Stream the file line by line instead of reading it whole and splitting the
# string, so the raw text is never held in memory next to the parsed objects.
# The encoding is stated explicitly rather than left to the platform default
# (assumes the export is UTF-8 -- TODO confirm against the data source).
with open(ruta_archivo, 'r', encoding='utf-8') as file:
    for linea in file:
        if not linea.strip():
            # Blank lines (e.g. the one after the final newline) hold no record.
            continue
        try:
            json_lista.append(json.loads(linea))
        except json.JSONDecodeError:
            # Loading stays best-effort, but malformed lines are counted so
            # that dropped records are visible instead of vanishing silently.
            lineas_invalidas += 1

if lineas_invalidas:
    print(f'Skipped {lineas_invalidas} line(s) that could not be parsed as JSON')


In [None]:
# Turn the list of parsed JSON objects into a single DataFrame, steam_games.
# Each object becomes one row and the columns are the union of the objects'
# keys (assumes every element of json_lista is a dict -- TODO confirm).
#
# The whole list is handed to the DataFrame constructor in one call. Building a
# one-row DataFrame per record and concatenating them all does the same job
# with far more intermediate objects, and pd.concat raises on an empty list.

import pandas as pd

steam_games = pd.DataFrame(json_lista)

In [None]:
# Drop the columns that are not needed for the analysis.

eliminar_columnas = [
    'specs', 'url', 'reviews_url', 'discount_price', 'publisher',
    'price', 'metascore', 'tags', 'developer', 'app_name',
    'items_count', 'steam_id', 'early_access', 'title', 'id',
]
steam_games = steam_games.drop(columns=eliminar_columnas)

In [None]:
# The 'items' column holds lists: explode() emits one row per list element and
# repeats the values of the other columns for each of those rows.
# json_normalize() then turns each exploded element into a row of a new
# DataFrame with one column per key.

steam_games_exploded_items = steam_games.explode(column='items')
steam_games_normalized_items = pd.json_normalize(
    steam_games_exploded_items.loc[:, 'items']
)

In [None]:
# Attach the normalized item fields to the rows they were extracted from.
#
# pd.concat(axis=1) aligns its operands on index labels. The normalized frame
# has one row per exploded item, so it must be joined with the exploded frame
# (also one row per item), not with the pre-explode steam_games, which has one
# row per original record: that pairing would put unrelated items next to each
# record and leave the remaining item rows without their parent columns.
# Both indexes are reset so the two frames line up row by row by position.

df_games = pd.concat(
    [
        steam_games_exploded_items.reset_index(drop=True),
        steam_games_normalized_items.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Drop the 'items' column, whose nested content now lives in separate columns.

eliminar_columnas = ['items']
df_games = df_games.drop(columns=eliminar_columnas)

In [None]:
# Count the missing values in each column.

print(df_games.isna().sum())

genres              5173298
title               5172065
release_date        5172082
id                  5170017
user_id             5113840
item_id               48941
item_name             48941
playtime_forever      48941
playtime_2weeks       48941
dtype: int64


In [None]:
# Discard the rows that have no item_id.
df_games = df_games.dropna(axis=0, how='any', subset=['item_id'])

In [None]:
# Discard the rows that have no release_date.
df_games = df_games.dropna(axis=0, how='any', subset=['release_date'])

In [None]:
# Drop a column that is not needed.

eliminar_columnas = ['playtime_2weeks']
df_games = df_games.drop(columns=eliminar_columnas)

In [None]:

# Helper that lower-cases the text held in a single cell.
def to_lower(x):
    """Return ``x`` with its text lower-cased.

    Args:
        x: A cell value. Strings are lower-cased; lists are processed element
            by element (recursively, so nested lists work too); any other
            value is passed through.

    Returns:
        The lower-cased string, a new list with every string element
        lower-cased, or ``x`` itself when it is neither a string nor a list.
    """
    if isinstance(x, str):
        return x.lower()
    if isinstance(x, list):
        # Recurse instead of calling .lower() on every element directly, so
        # that non-string items (None, numbers, nested lists) do not raise
        # AttributeError and are kept as they are.
        return [to_lower(item) for item in x]
    return x

# Apply to_lower to every cell of the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use the new name when this pandas version has it and fall back otherwise.
if hasattr(pd.DataFrame, 'map'):
    df_games = df_games.map(to_lower)
else:
    df_games = df_games.applymap(to_lower)





In [None]:
# Print a summary of df_games: index, column names and dtypes, memory usage.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5202150 entries, 0 to 5202149
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   genres            object 
 1   title             object 
 2   release_date      object 
 3   id                object 
 4   user_id           object 
 5   item_id           object 
 6   item_name         object 
 7   playtime_forever  float64
dtypes: float64(1), object(7)
memory usage: 317.5+ MB


In [None]:
# Only the 'genres' column of this DataFrame is of interest, so discard the
# rows where it is missing.
df_games = df_games.dropna(axis=0, how='any', subset=['genres'])

In [None]:
# Display the 'genres' column.
df_games.loc[:, 'genres']

88310         [action, casual, indie, simulation, strategy]
88311                  [free to play, indie, rpg, strategy]
88312     [casual, free to play, indie, simulation, sports]
88313                           [action, adventure, casual]
88315                       [action, adventure, simulation]
                                ...                        
120439                   [action, adventure, casual, indie]
120440                [casual, indie, simulation, strategy]
120441                            [casual, indie, strategy]
120442                          [indie, racing, simulation]
120443                                      [casual, indie]
Name: genres, Length: 30034, dtype: object

In [None]:
# Save the result as a CSV file.
ruta_guardar = '/content/drive/My Drive/PI MLOps - STEAM/Games.csv'
# index=False keeps the DataFrame index out of the written file.
df_games.to_csv(path_or_buf=ruta_guardar, index=False)