# Imports

In [1]:
import sys
import importlib

from utils.data_cleaning_utils import *


# Configuración de Datasets a leer

In [2]:
def _dataset(name, fmt, final_name, options=None, large_file=False):
    """Build one dataset descriptor.

    The descriptor always carries the keys "name", "format", "options" and
    "final_name" (in that order). "large_file" is only present, set to True,
    when ``large_file`` is requested; otherwise the key is omitted.
    ``options`` defaults to a fresh empty dict for every descriptor.
    """
    descriptor = {
        "name": name,
        "format": fmt,
        "options": {} if options is None else options,
        "final_name": final_name,
    }
    if large_file:
        descriptor["large_file"] = True
    return descriptor


def _mexico(year, fmt, **options):
    """Descriptor for one yearly Mexico file.

    Every Mexico entry is flagged as a large file and targets "mexico_data";
    the keyword arguments become the entry's options, in the order given.
    """
    return _dataset(f"mexico_{year}", fmt, "mexico_data", options, large_file=True)


def _mexico_cols_2015_2019():
    """Fresh list of the columns selected for the 2015-2019 Mexico files."""
    return ["CLUES", "FECHAINGRESO", "HORAINIATE", "MININIATE"]


def _mexico_cols_lowercase():
    """Fresh list of the columns selected for the 2020, 2022 and 2023 Mexico files."""
    return ["CLUES", "fechaingreso", "hora_ingreso"]


# One descriptor per raw dataset (or per family of files sharing a name prefix).
# "options" holds reader options consumed downstream — presumably pandas
# read_* keyword arguments; confirm against the reading helper.
datasets_dicts = [
    _dataset("australia", "csv", "australia_data", {"header": [0, 1]}),
    _dataset("cardiff", "csv", "cardiff_data"),
    _dataset(
        "chile",
        "csv",
        "chile_data",
        {
            "delimiter": ";",
            "encoding": "latin1",
            "usecols": [
                "IdEstablecimiento",
                "NEstablecimiento",
                "Total",
                "Menores_1",
                "De_1_a_4",
                "De_5_a_14",
                "De_15_a_64",
                "De_65_y_mas",
                "fecha",
                "semana",
            ],
        },
        large_file=True,
    ),
    _dataset("colombia", "csv", "colombia_data"),
    _dataset("col_betania", "csv", "betania_data"),
    _dataset("esp_canarias", "csv", "spain_data"),
    _dataset(
        "esp_castilla_y_leon",
        "csv",
        "spain_data",
        {"delimiter": ";", "encoding": "utf-8-sig"},
    ),
    _dataset("iowa", "xlsx", "usa_data", {"header": 3}),
    _dataset("iran", "csv", "iran_data"),
    # Mexico 2009-2014: headerless, semicolon-separated, everything read as text.
    # NOTE(review): 2009 uses dtype "string" while 2010-2014 use "str" —
    # confirm whether the difference is intentional.
    _mexico(2009, "csv", delimiter=";", header=None, dtype="string"),
    *(
        _mexico(year, "csv", delimiter=";", header=None, dtype="str")
        for year in range(2010, 2015)
    ),
    # Mexico 2015-2019: files with a header row; delimiter varies by year.
    _mexico(2015, "csv", delimiter=",", usecols=_mexico_cols_2015_2019()),
    _mexico(2016, "csv", delimiter="|", usecols=_mexico_cols_2015_2019()),
    _mexico(
        2017,
        "csv",
        delimiter="|",
        encoding="latin1",
        usecols=_mexico_cols_2015_2019(),
    ),
    _mexico(2018, "csv", delimiter=",", usecols=_mexico_cols_2015_2019()),
    _mexico(2019, "csv", delimiter=",", usecols=_mexico_cols_2015_2019()),
    # Mexico 2020-2023: pipe-separated .txt files; 2021 spells its columns in upper case.
    _mexico(2020, "txt", delimiter="|", usecols=_mexico_cols_lowercase()),
    _mexico(
        2021,
        "txt",
        delimiter="|",
        usecols=["CLUES", "FECHAINGRESO", "HORA_INGRESO"],
    ),
    _mexico(2022, "txt", delimiter="|", usecols=_mexico_cols_lowercase()),
    _mexico(2023, "txt", delimiter="|", usecols=_mexico_cols_lowercase()),
    _dataset("pak_", "xlsx", "pakistan_data"),
    _dataset("usa_", "xlsx", "usa_data"),
    _dataset("nl_", "xlsx", "netherlands_data"),
    _dataset("bwa_", "xlsx", "botswana_data"),
    _dataset("aus_", "xlsx", "australia_data"),
    _dataset("wales", "csv", "wales_data", {"encoding": "latin1", "skiprows": 2}),
]

# Procesado de los Datasets

In [3]:
# Process every configured dataset: locate its raw files, read each one,
# clean it with the dataset-specific `process_<name>` function, cast the date
# column, and save the concatenated result under the entry's `final_name`.
#
# NOTE(review): several entries share a `final_name` (e.g. all `mexico_*`
# entries target "mexico_data") and `save_clean_data` is called once per entry.
# Confirm that it merges with, rather than overwrites, a previous save.
for dataset in datasets_dicts:
    name = dataset["name"]
    file_format = dataset["format"]  # not `format`: that would shadow the builtin
    options = dataset["options"]
    final_name = dataset["final_name"]
    large_file = dataset.get("large_file", False)

    matching_files = read_multi_file_paths(file_format, name)
    if not matching_files:
        raise ValueError(f"No matching files found for dataset '{name}'")

    # The processing function depends only on the dataset name, so resolve it
    # once here; a missing function then fails before any file is read.
    process_func_name = f"process_{name}"
    process_func = globals().get(process_func_name)
    if process_func is None:
        raise ValueError(f"No se encontró la función '{process_func_name}'")

    df_list = []

    for path in matching_files:
        # Read the raw data.
        df = read_raw_data(file_format, path, options, large_file)

        # Dataset-specific processing of the DataFrame.
        processed_df = process_func(df)

        # Drop our reference to the raw frame so it can be reclaimed before the
        # next file is read; large files have exhausted memory in this loop.
        del df

        # Keep the frame returned by the date cast. The original assigned it to
        # a misspelled name and appended the un-reassigned frame instead. If the
        # helper only mutates in place and returns None, keep the mutated frame.
        casted_df = cast_date_column(processed_df)
        if casted_df is not None:
            processed_df = casted_df

        df_list.append(processed_df)

    if df_list:
        df_final = pd.concat(df_list, ignore_index=True)

        # Save the processed data.
        save_clean_data(df_final, final_name)

Leyendo archivo: ../datasets/raw_datasets\australia_data.csv
DataFrame procesado con 3285 filas y 3 columnas.


  df['date'] = pd.to_datetime(df['date'], errors='coerce')


Archivo guardado exitosamente en: ../datasets/clean_datasets/australia_data.parquet
Leyendo archivo: ../datasets/raw_datasets\cardiff_1_data.csv
DataFrame procesado con 43080 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\cardiff_2_data.csv
DataFrame procesado con 43081 filas y 3 columnas.
Archivo guardado exitosamente en: ../datasets/clean_datasets/cardiff_data.parquet
Leyendo archivo: ../datasets/raw_datasets\chile_2008_data.csv
DataFrame procesado con 10574 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\chile_2009_data.csv
DataFrame procesado con 55108 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\chile_2010_data.csv
DataFrame procesado con 116456 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\chile_2011_data.csv
DataFrame procesado con 129775 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\chile_2012_data.csv
DataFrame procesado con 140599 filas y 3 columnas.
Leyendo archivo: ../datasets/raw_datasets\chile_2

MemoryError: Unable to allocate 1.84 GiB for an array with shape (26, 9510442) and data type object