In [46]:
import pandas as pd
import numpy as np
import sklearn as sk

In [47]:
df = pd.read_csv('Data/ListaCobroDetalle2025.csv')
print (df.shape)
df.head()


(2114172, 9)


Unnamed: 0,idListaCobro,idCredito,consecutivoCobro,idBanco,montoExigible,montoCobrar,montoCobrado,fechaCobroBanco,idRespuestaBanco
0,155938,738973,41396434,2,622.87,622.87,0.0,,4.0
1,155938,739017,41396435,2,1069.11,1069.11,0.0,,4.0
2,155939,739185,41396436,2,4340.83,4340.83,4340.83,02/01/2025,0.0
3,155940,732324,41396437,2,2134.21,2134.21,0.0,,4.0
4,155940,737028,41396438,2,815.76,815.76,0.0,,4.0


In [48]:
df_selected = df[['idListaCobro', 'idCredito', 'consecutivoCobro', 'idBanco', 'montoExigible', 'montoCobrar', 'montoCobrado', 'fechaCobroBanco', 'idRespuestaBanco']]
df_selected = df_selected.astype({
    'idListaCobro': 'int64',
    'idCredito': 'int64',
    'consecutivoCobro': 'int64',
    'idBanco': 'int64',
    'montoExigible': 'float64',
    'montoCobrar': 'float64',
    'montoCobrado': 'float64',
    'idRespuestaBanco': 'float64'
})
df_selected['fechaCobroBanco'] = pd.to_datetime(df_selected['fechaCobroBanco'], errors='coerce')
df_selected.head()

Unnamed: 0,idListaCobro,idCredito,consecutivoCobro,idBanco,montoExigible,montoCobrar,montoCobrado,fechaCobroBanco,idRespuestaBanco
0,155938,738973,41396434,2,622.87,622.87,0.0,NaT,4.0
1,155938,739017,41396435,2,1069.11,1069.11,0.0,NaT,4.0
2,155939,739185,41396436,2,4340.83,4340.83,4340.83,2025-02-01,0.0
3,155940,732324,41396437,2,2134.21,2134.21,0.0,NaT,4.0
4,155940,737028,41396438,2,815.76,815.76,0.0,NaT,4.0


In [49]:
df_grouped = df_selected.sort_values('fechaCobroBanco').groupby('idCredito')
print(df_grouped.size())
print(df_grouped.size().max())
print(df_grouped.size().min())

idCredito
9872      18
10983      3
30466     18
31375      2
33591      3
          ..
755683     1
755695     1
755726     1
755859    16
756595     5
Length: 28602, dtype: int64
1116
1


In [50]:
# Imprimeme cuantas ids hay y cuantas son unicas
print(df_grouped.size().count())

28602


In [51]:
# Ahora imprime las variables y sus tipos
print(df_selected.dtypes)

idListaCobro                 int64
idCredito                    int64
consecutivoCobro             int64
idBanco                      int64
montoExigible              float64
montoCobrar                float64
montoCobrado               float64
fechaCobroBanco     datetime64[ns]
idRespuestaBanco           float64
dtype: object


In [52]:
# Transformar variables adecuadas a tipo categórico
categorical_columns = ['idBanco', 'idRespuestaBanco']
df_selected[categorical_columns] = df_selected[categorical_columns].astype('category')
df_selected.dtypes

idListaCobro                 int64
idCredito                    int64
consecutivoCobro             int64
idBanco                   category
montoExigible              float64
montoCobrar                float64
montoCobrado               float64
fechaCobroBanco     datetime64[ns]
idRespuestaBanco          category
dtype: object

In [56]:
# aplica dummies a las variables categóricas y pasalos a 01
df_dummies = pd.get_dummies(df_selected, columns=categorical_columns)
# Todos los bools pasan a 0 y 1, checa todas las variables bool a int8
df_dummies = df_dummies.astype({col: np.int8 for col in df_dummies.select_dtypes(include=[bool]).columns})


In [54]:
print(df_dummies.dtypes)

idListaCobro                      int64
idCredito                         int64
consecutivoCobro                  int64
montoExigible                   float64
montoCobrar                     float64
montoCobrado                    float64
fechaCobroBanco          datetime64[ns]
idBanco_2                          bool
idBanco_12                         bool
idBanco_14                         bool
idBanco_21                         bool
idBanco_30                         bool
idBanco_36                         bool
idBanco_44                         bool
idBanco_58                         bool
idBanco_62                         bool
idBanco_72                         bool
idBanco_127                        bool
idBanco_132                        bool
idBanco_137                        bool
idBanco_166                        bool
idRespuestaBanco_0.0               bool
idRespuestaBanco_1.0               bool
idRespuestaBanco_2.0               bool
idRespuestaBanco_3.0               bool


In [55]:
# Guarda la version limpia como Data/CleanedData.csv
df_dummies.to_csv('Data/CleanedData.csv', index=False)