In [479]:
import pandas as pd
import os

# Path of the raw input file
file_path = '../files/input/solicitudes_de_credito.csv'

# Folder and full path where the cleaned file is written
output_dir = '../files/output'
output_path = os.path.join(output_dir, 'solicitudes_de_credito.csv')

# The input file is semicolon-separated
data = pd.read_csv(file_path, sep=';', encoding='utf-8')

# Drop every row that has at least one missing value
data = data.dropna()

# Drop the unneeded "Unnamed: 0" column. drop() returns a new frame, so no
# separate copy() of `data` is required beforehand.
data_cleaned = data.drop(columns=["Unnamed: 0"])

# Remove '$', ',' and spaces from the amount text. The pattern is a raw string
# so that the regex escape `\$` is not an invalid Python string escape.
data_cleaned['monto_del_credito'] = data_cleaned['monto_del_credito'].replace(
    {r'\$': '', ',': '', ' ': ''}, regex=True
)

# Text normalisation: lowercase and strip surrounding whitespace
data_cleaned['sexo'] = data_cleaned['sexo'].str.lower().str.strip()
data_cleaned['tipo_de_emprendimiento'] = (
    data_cleaned['tipo_de_emprendimiento'].str.lower().str.strip()
)

# Text normalisation: '_' and '-' become spaces (literal replacement, hence
# regex=False), then lowercase and strip
data_cleaned['idea_negocio'] = (
    data_cleaned['idea_negocio']
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.lower()
    .str.strip()
)
data_cleaned['línea_credito'] = (
    data_cleaned['línea_credito']
    .str.replace('-', ' ', regex=False)
    .str.replace('_', ' ', regex=False)
    .str.lower()
    .str.strip()
)

# NOTE(review): unlike the other text columns, 'barrio' is lowercased but NOT
# stripped; kept that way to preserve the current results - confirm intended.
data_cleaned['barrio'] = (
    data_cleaned['barrio']
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.lower()
)



def es_convertible_a_entero(valor):
    """Tell whether ``int(valor)`` succeeds.

    Args:
        valor: Any object; typically a string or number taken from a column.

    Returns:
        True if ``int(valor)`` runs without error, False otherwise.

    Values that ``int`` rejects with ``TypeError`` (e.g. ``None``) or
    ``OverflowError`` (infinite floats) are reported as not convertible
    instead of propagating the exception, just like non-numeric text
    (``ValueError``).
    """
    try:
        int(valor)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Keep only the rows whose 'comuna_ciudadano' value can be converted to an integer
data_cleaned = data_cleaned.loc[
    data_cleaned['comuna_ciudadano'].apply(es_convertible_a_entero)
]

# Prepare the date text: replace '-' with '/' and trim surrounding whitespace
data_cleaned['fecha_de_beneficio'] = (
    data_cleaned['fecha_de_beneficio']
    .str.replace("-","/")
    .str.strip()
)


def ordenarfecha(fecha_de_beneficio):
    """Parse a '/'-separated date string into a pandas Timestamp.

    The first field decides the layout: when it is greater than 31 the string
    is parsed as year/month/day, otherwise as day/month/year.

    Args:
        fecha_de_beneficio: Date text whose fields are separated by '/'.

    Returns:
        The value produced by ``pd.to_datetime`` for the chosen format.
    """
    primer_campo = int(fecha_de_beneficio.split("/", 1)[0])
    formato = '%Y/%m/%d' if primer_campo > 31 else '%d/%m/%Y'
    return pd.to_datetime(fecha_de_beneficio, format=formato)
    
# Convert every date string with ordenarfecha
data_cleaned['fecha_de_beneficio'] = data_cleaned['fecha_de_beneficio'].apply(ordenarfecha)

# Cast the columns to their final dtypes in a single pass
data_cleaned = data_cleaned.astype(
    {
        'monto_del_credito': float,
        'comuna_ciudadano': int,
        'estrato': int,
        'barrio': str,
    }
)

# Drop duplicated rows
data_cleaned = data_cleaned.drop_duplicates()

# Create the output folder if it is missing; exist_ok=True replaces the
# separate existence check and tolerates the folder already being there.
os.makedirs(output_dir, exist_ok=True)

# Write the cleaned file without the index column
data_cleaned.to_csv(output_path, index=False)

# Summary of the cleaning run
print("Forma después de la limpieza:", data_cleaned.shape)
print("Valores faltantes restantes:")
print(data_cleaned.isnull().sum())
print(f"Archivo limpio guardado en: {output_path}")

Forma después de la limpieza: (10206, 9)
Valores faltantes restantes:
sexo                      0
tipo_de_emprendimiento    0
idea_negocio              0
barrio                    0
estrato                   0
comuna_ciudadano          0
fecha_de_beneficio        0
monto_del_credito         0
línea_credito             0
dtype: int64
Archivo limpio guardado en: ../files/output\solicitudes_de_credito.csv


In [480]:
# import pandas as pd
# import os

# # Crear la carpeta de salida si no existe
# output_dir = "../files/output"
# os.makedirs(output_dir, exist_ok=True)

# # Cargar el archivo CSV
# df = pd.read_csv("../files/input/solicitudes_de_credito.csv", sep=';')

# # Eliminar registros duplicados
# df = df.drop_duplicates()

# # Eliminar filas con datos faltantes
# df = df.dropna()

# # Normalizar nombres de columnas
# df.columns = df.columns.str.lower().str.replace(' ', '_')

# # Normalizar valores de texto
# df['sexo'] = df['sexo'].str.lower()

# # Guardar el DataFrame limpio en un nuevo archivo CSV
# df.to_csv(f"{output_dir}/solicitudes_de_credito_limpio.csv", index=False)



# df['fecha_de_beneficio'] = pd.to_datetime(df['fecha_de_beneficio'], errors='coerce')
# df['monto_del_credito'] = df['monto_del_credito'].replace({'\$': '', ',': ''}, regex=True).astype(float)
# Q1 = df['monto_del_credito'].quantile(0.25)
# Q3 = df['monto_del_credito'].quantile(0.75)
# IQR = Q3 - Q1
# df = df[(df['monto_del_credito'] >= (Q1 - 1.5 * IQR)) & (df['monto_del_credito'] <= (Q3 + 1.5 * IQR))]
# df['tipo_de_emprendimiento'] = df['tipo_de_emprendimiento'].str.lower().str.replace(' ', '_')
# df = df.drop_duplicates(subset=['sexo', 'tipo_de_emprendimiento'])
# df['estrato'] = df['estrato'].fillna(df['estrato'].median())



In [481]:
# Frequencies of the 'sexo' values, as returned by value_counts(), compared
# with the reference counts
data_cleaned["sexo"].value_counts().tolist() == [6617, 3589]

True

In [482]:
# Frequencies of the 'tipo_de_emprendimiento' values compared with the
# reference counts
data_cleaned["tipo_de_emprendimiento"].value_counts().tolist() == [5636, 2205, 2201, 164]

True

In [483]:
# Frequencies of the 'idea_negocio' values compared with the reference counts
data_cleaned["idea_negocio"].value_counts().tolist() == [
    1844, 1671, 983, 955, 584, 584, 273, 216, 164, 160,
    159, 151, 142, 140, 134, 127, 106, 102, 93, 91,
    90, 85, 79, 74, 68, 58, 57, 55, 54, 45,
    42, 40, 40, 40, 39, 37, 36, 34, 33, 32,
    32, 30, 29, 28, 26, 23, 23, 22, 22, 21,
    20, 19, 19, 18, 14, 12, 12, 11, 10, 9,
    9, 9, 8, 7, 7, 7, 6, 6, 6, 5,
    5, 5, 4, 3, 2,
]

True

In [484]:
len( data_cleaned.barrio.value_counts().to_list()) == len( [
        990,
        483,
        423,
        383,
        376,
        372,
        361,
        348,
        328,
        308,
        270,
        255,
        255,
        247,
        234,
        232,
        231,
        202,
        174,
        170,
        169,
        124,
        117,
        115,
        114,
        90,
        89,
        89,
        86,
        85,
        78,
        72,
        70,
        67,
        65,
        59,
        55,
        52,
        50,
        49,
        48,
        48,
        48,
        47,
        45,
        44,
        43,
        43,
        43,
        40,
        38,
        37,
        36,
        36,
        34,
        34,
        33,
        33,
        32,
        30,
        27,
        27,
        27,
        26,
        26,
        25,
        25,
        24,
        24,
        24,
        24,
        23,
        21,
        21,
        21,
        20,
        20,
        20,
        20,
        17,
        17,
        17,
        16,
        14,
        14,
        14,
        14,
        13,
        13,
        12,
        11,
        11,
        11,
        11,
        10,
        10,
        10,
        9,
        9,
        9,
        9,
        8,
        8,
        8,
        8,
        8,
        8,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        5,
        5,
        5,
        5,
        5,
        5,
        4,
        4,
        4,
        4,
        4,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
    ])

True

In [485]:
# Frequencies of the 'estrato' values compared with the reference counts
data_cleaned["estrato"].value_counts().tolist() == [5023, 3151, 2029, 3]

True

In [486]:
# Frequencies of the 'comuna_ciudadano' values compared with the reference counts
data_cleaned["comuna_ciudadano"].value_counts().tolist() == [
    1326, 1133, 968, 830, 830, 729, 667,
    636, 588, 559, 426, 391, 296, 267,
    227, 191, 64, 29, 27, 12, 10,
]

True

In [487]:
data_cleaned.fecha_de_beneficio.value_counts().to_list() == [
        61,
        58,
        41,
        39,
        39,
        39,
        38,
        37,
        37,
        37,
        37,
        36,
        35,
        34,
        34,
        34,
        33,
        33,
        32,
        32,
        31,
        31,
        31,
        31,
        30,
        30,
        30,
        30,
        30,
        30,
        29,
        29,
        29,
        29,
        29,
        29,
        28,
        28,
        28,
        28,
        27,
        27,
        27,
        27,
        27,
        27,
        27,
        27,
        27,
        26,
        26,
        26,
        26,
        25,
        25,
        25,
        25,
        25,
        25,
        25,
        25,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        23,
        23,
        23,
        23,
        23,
        23,
        23,
        23,
        23,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        22,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        21,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        20,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        19,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        18,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        17,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        16,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        15,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        14,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        13,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        11,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
    ]

True

In [488]:
data_cleaned.monto_del_credito.value_counts().to_list() == [
        1174,
        1056,
        1048,
        942,
        757,
        618,
        583,
        482,
        469,
        385,
        285,
        181,
        148,
        147,
        143,
        131,
        130,
        125,
        106,
        95,
        93,
        70,
        60,
        59,
        51,
        43,
        27,
        24,
        22,
        19,
        17,
        17,
        16,
        16,
        15,
        14,
        14,
        12,
        11,
        11,
        11,
        10,
        10,
        10,
        10,
        10,
        10,
        10,
        9,
        9,
        9,
        9,
        8,
        8,
        8,
        8,
        8,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        6,
        6,
        6,
        6,
        6,
        6,
        6,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        5,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        4,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
    ]


True

In [489]:
# Frequencies of the 'línea_credito' values compared with the reference counts
data_cleaned["línea_credito"].value_counts().tolist() == [10020, 70, 55, 33, 21, 4, 1, 1, 1]

True