# Imports

In [1]:
import sys
import importlib

from utils.data_cleaning_utils import *


# Configuración de Datasets a leer

In [2]:
datasets_dicts= [
    {
        "name": "australia",
        "format": "csv",
        "options": {
            "header": [0,1]
        },
        "final_name" : "australia_data"
    },
    {
        "name": "cardiff",
        "format": "csv",
        "options":{
        },
        "final_name" : "cardiff_data"       
    },
    {
        "name": "chile",
        "format": "csv",
        "options":{
            "delimiter": ";",
            "encoding": "latin1",
            "usecols": ["IdEstablecimiento", "NEstablecimiento", "Total", "Menores_1", "De_1_a_4", "De_5_a_14", "De_15_a_64", "De_65_y_mas", "fecha", "semana"]
        },
        "final_name" : "chile_data"    
    },
    {
        "name": "colombia",
        "format": "csv",
        "options":{
        },
        "final_name" : "colombia_data"       
    },
    {
        "name": "col_betania",
        "format": "csv",
        "options":{   
        },
        "final_name" : "colombia_data"   
    },
    {
        "name": "esp_canarias",
        "format": "csv",
        "options":{   
        },
        "final_name" : "spain_data"   
    },
    {
        "name": "esp_castilla_y_leon",
        "format": "csv",
        "options":{
            "delimiter": ";",
            "encoding": "utf-8-sig"
        },
        "final_name" : "spain_data"   
    },
    {
        "name": "iowa",
        "format": "xlsx",
        "options":{
            "header": 3
        },
        "final_name" : "usa_data"   
    },
    {
        "name": "iran",
        "format": "csv",
        "options":{
        },
        "final_name" : "iran_data"   
    },
    {
        "name": "mexico_2009",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None,
            "dtype":{
                2: str, 7: str, 13: str, 18: str, 19: str, 20: str, 21: str
            }
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2010",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2011",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2012",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2013",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2014",
        "format": "csv",
        "options":{
            "delimiter": ";",  
            "header": None
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2015",
        "format": "csv",
        "options":{
            "delimiter": ",",
            "usecols": ["CLUES", "FECHAINGRESO", ",HORAINIATE", "MININIATE"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2016",
        "format": "csv",
        "options":{
            "delimiter": "|",
            "usecols": ["CLUES", "FECHAINGRESO", ",HORAINIATE", "MININIATE"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2017",
        "format": "csv",
        "options":{
            "delimiter": "|",
            "encoding": "latin1",
            "usecols": ["CLUES", "FECHAINGRESO", ",HORAINIATE", "MININIATE"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2018",
        "format": "csv",
        "options":{
            "delimiter": ",",
            "usecols": ["CLUES", "FECHAINGRESO", ",HORAINIATE", "MININIATE"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2019",
        "format": "csv",
        "options":{
            "delimiter": ",",
            "usecols": ["CLUES", "FECHAINGRESO", ",HORAINIATE", "MININIATE"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2020",
        "format": "txt",
        "options":{
            "delimiter": "|",
            "usecols": ["CLUES", "fechaingreso", "hora_ingreso"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2021",
        "format": "txt",
        "options":{
            "delimiter": "|",
            "usecols": ["CLUES", "FECHAINGRESO", "HORA_INGRESO"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2022",
        "format": "txt",
        "options":{
            "delimiter": "|",
            "usecols": ["CLUES", "fechaingreso", "hora_ingreso"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "mexico_2023",
        "format": "txt",
        "options":{
            "delimiter": "|",
            "usecols": ["CLUES", "fechaingreso", "hora_ingreso"]
        },
        "final_name" : "mexico_data"   
    },
    {
        "name": "pak_",
        "format": "xlsx",
        "options":{
        },
        "final_name" : "pakistan_data"   
    },
    {
        "name": "usa_",
        "format": "xlsx",
        "options":{
        },
        "final_name" : "usa_data"   
    },
    {
        "name": "nl_",
        "format": "xlsx",
        "options":{
        },
        "final_name" : "netherlands_data"   
    },
    {
        "name": "bwa_",
        "format": "xlsx",
        "options":{
        },
        "final_name" : "botswana_data"   
    },
    {
        "name": "aus_",
        "format": "xlsx",
        "options":{
        },
        "final_name" : "australia_data"   
    },
    {
        "name": "wales",
        "format": "csv",
        "options":{
            "encoding": "latin1",
            "skiprows": 2
        },
        "final_name" : "wales_data"   
    }
    
]

# Procesado de los Datasets

In [3]:
for dataset in datasets_dicts:
    name = dataset["name"]
    format = dataset["format"]
    options = dataset["options"]
    final_name = dataset["final_name"]

    matching_files = read_multi_file_paths(format, name)
    if not matching_files:
        raise ValueError(f"No matching files found for dataset '{name}'")
    
    df_list = []

    for path in matching_files:
        # if "mexico_2021" not in path:
        #     continue
        # Se leen los datos
        df = read_raw_data(format, path, options)
        
        process_func_name = f"process_{name}"
        process_func = globals().get(process_func_name)

        if process_func is None:
            raise ValueError(f"No se encontró la función '{process_func_name}'")

        # Procesado del DataFrame
        processed_df = process_func(df)
        
        df_list.append(processed_df)
        df_final = pd.concat(df_list, ignore_index=True)
        
        # Se guardan los datos procesados
        save_clean_data(df_final, final_name)

Leyendo archivo: ../.gitignore/raw_datasets\australia_data.csv
DataFrame procesado con 3285 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/australia_data.parquet
Leyendo archivo: ../.gitignore/raw_datasets\cardiff_1_data.csv
DataFrame procesado con 43080 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/cardiff_data.parquet
Leyendo archivo: ../.gitignore/raw_datasets\cardiff_2_data.csv
DataFrame procesado con 43081 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/cardiff_data.parquet
Leyendo archivo: ../.gitignore/raw_datasets\chile_2008_data.csv
DataFrame procesado con 10574 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/chile_data.parquet
Leyendo archivo: ../.gitignore/raw_datasets\chile_2009_data.csv
DataFrame procesado con 55108 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/chile_data.parquet
Leyendo archivo: ../.gitig

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df["FECHAINGRESO"] = pd.to_datetime(df["FECHAINGRESO"], errors="coerce")

# Crea columna 'datetime' combinando la fecha y la hora
df["datetime"] = pd.to_datetime(
    df["FECHAINGRESO"].dt.strftime("%Y-%m-%d") + " " + df["HORA_INGRESO"],
    errors="coerce"
)
df

Unnamed: 0,CLUES,FECHAINGRESO,HORA_INGRESO,datetime
0,NLSSA014295,2021-11-13,10:01,2021-11-13 10:01:00
1,NLSSA014295,2021-02-11,12:11,2021-02-11 12:11:00
2,NLSSA014295,2021-09-25,99:99,NaT
3,NLSSA014295,2021-09-30,99:99,NaT
4,NLSSA014295,2021-11-23,99:99,NaT
...,...,...,...,...
5383300,DFSSA004084,2021-12-19,18:08,2021-12-19 18:08:00
5383301,DFSSA004084,2021-12-06,07:10,2021-12-06 07:10:00
5383302,DFSSA004084,2021-04-25,17:12,2021-04-25 17:12:00
5383303,DFSSA004084,2021-07-09,08:50,2021-07-09 08:50:00


In [None]:
df

KeyError: "None of [Index(['FECHAINGRESO', 'CLUES'], dtype='object')] are in the [columns]"

In [None]:

# # Se normalizan los nombres de las columnas
# df.columns = df.columns.str.strip()

# # Se filtran las filas que contienen datos numéricos en la columna "Wales" para evitar seleccionar los metadatos
# df_filtrado = df[pd.to_numeric(df["Wales"], errors="coerce").notna()]

# df_filtrado = df_filtrado.copy()

# df_filtrado = df_filtrado.rename(columns={df_filtrado.columns[0]: "Date"})


# df_filtrado["Date"] = df_filtrado["Date"].astype(str).str.strip().str.replace('"', '')
# df_filtrado = df_filtrado[df_filtrado["Date"].str.match(r"^\d")]

# df_final = df_filtrado.melt(
#     id_vars=["Date"],                 
#     value_vars=[col for col in df_filtrado.columns if col != "Date"],
#     var_name="Hospital",
#     value_name="Admissions"
# )
# df_final = df_final[df_final["Hospital"] != "Wales"]
