# Notebook para el modelo de predecir los pagos en 2025

In [82]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil import parser
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [72]:
df = pd.read_csv('../data/df_final_joined_2025.csv')

In [73]:
df.head()

Unnamed: 0,idCredito,montoExigible,montoCobrar,montoCobrado,vecesPagadas,totalCobros,ultimoCobro,proporcionPagos,pagare,capital,fechaAperturaCredito,montoCobrado2025,proporcionPagada2025
0,6,375.47,375.47,0.0,0.0,1,14/08/2023,0.0,27033.84,12000.0,12/03/2012,0.0,0.0
1,3674,344.18,344.18,0.0,0.0,1,10/05/2024,0.0,24780.96,11000.0,29/03/2012,,
2,4635,1441.17,1441.17,160.13,160.13,9,13/03/2023,17.792222,32665.68,14500.0,02/04/2012,320.26,0.009804
3,4914,3949.8,3949.8,384.98,384.98,10,13/03/2023,38.498,34918.56,15500.0,03/04/2012,0.0,0.0
4,8947,2316.06,2316.06,0.0,0.0,9,13/03/2023,0.0,33380.64,12500.0,25/04/2012,0.0,0.0


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69301 entries, 0 to 69300
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   idCredito             69301 non-null  int64  
 1   montoExigible         69301 non-null  float64
 2   montoCobrar           69301 non-null  float64
 3   montoCobrado          69301 non-null  float64
 4   vecesPagadas          69301 non-null  float64
 5   totalCobros           69301 non-null  int64  
 6   ultimoCobro           69301 non-null  object 
 7   proporcionPagos       69301 non-null  float64
 8   pagare                69301 non-null  float64
 9   capital               69301 non-null  float64
 10  fechaAperturaCredito  69301 non-null  object 
 11  montoCobrado2025      63849 non-null  float64
 12  proporcionPagada2025  63849 non-null  float64
dtypes: float64(9), int64(2), object(2)
memory usage: 6.9+ MB


In [75]:
# Find nulls
print(df.isnull().sum())

idCredito                  0
montoExigible              0
montoCobrar                0
montoCobrado               0
vecesPagadas               0
totalCobros                0
ultimoCobro                0
proporcionPagos            0
pagare                     0
capital                    0
fechaAperturaCredito       0
montoCobrado2025        5452
proporcionPagada2025    5452
dtype: int64


In [76]:
# Add a 0 for the nulls
df['proporcionPagada2025'].fillna(0, inplace=True)
df['montoCobrado2025'].fillna(0, inplace=True)
print(df.isnull().sum())

idCredito               0
montoExigible           0
montoCobrar             0
montoCobrado            0
vecesPagadas            0
totalCobros             0
ultimoCobro             0
proporcionPagos         0
pagare                  0
capital                 0
fechaAperturaCredito    0
montoCobrado2025        0
proporcionPagada2025    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['proporcionPagada2025'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['montoCobrado2025'].fillna(0, inplace=True)


In [77]:
X= df.drop(columns=['idCredito', 'montoCobrado2025', 'proporcionPagada2025','ultimoCobro'])

In [52]:
"""
# Transform fechaEnvioCobro into month and day
X['ultimoCobro'] = pd.to_datetime(X['ultimoCobro'], format='%d/%m/%Y')
X['month_uc'] = X['ultimoCobro'].dt.month
X['day_uc'] = X['ultimoCobro'].dt.day
X['year_uc'] = X['ultimoCobro'].dt.year
# Drop the column 'ultimoCobro'
X.drop(columns=['ultimoCobro'], inplace=True)
"""

"\n# Transform fechaEnvioCobro into month and day\nX['ultimoCobro'] = pd.to_datetime(X['ultimoCobro'], format='%d/%m/%Y')\nX['month_uc'] = X['ultimoCobro'].dt.month\nX['day_uc'] = X['ultimoCobro'].dt.day\nX['year_uc'] = X['ultimoCobro'].dt.year\n# Drop the column 'ultimoCobro'\nX.drop(columns=['ultimoCobro'], inplace=True)\n"

In [78]:
# Transform fechaEnvioCobro into month and day
X['fechaAperturaCredito'] = pd.to_datetime(X['fechaAperturaCredito'], format='%d/%m/%Y')
X['month_fa'] = X['fechaAperturaCredito'].dt.month
X['day_fa'] = X['fechaAperturaCredito'].dt.day
X['year_fa'] = X['fechaAperturaCredito'].dt.year
# Drop the column 'fechaAperturaCredito'
X.drop(columns=['fechaAperturaCredito'], inplace=True)

In [79]:
df.columns

Index(['idCredito', 'montoExigible', 'montoCobrar', 'montoCobrado',
       'vecesPagadas', 'totalCobros', 'ultimoCobro', 'proporcionPagos',
       'pagare', 'capital', 'fechaAperturaCredito', 'montoCobrado2025',
       'proporcionPagada2025'],
      dtype='object')

In [80]:
y = df['proporcionPagada2025']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [81]:
model = LinearRegression()
model.fit(X_train, y_train)

In [83]:
y_pred = model.predict(X_test)

print("Mean Squared Error:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 0.029754756305311752
R² Score: 0.5229346574228417


# Xgboost

In [84]:
from xgboost import XGBRegressor

model = XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 0.01648540549612157
R² Score: 0.7772325912464912


In [60]:
# Save the model
import joblib
joblib.dump(model, 'xgboostPorcentaje_model.pkl')

['xgboostPorcentaje_model.pkl']

In [61]:
X_test.iloc[0]

montoExigible       8858.01
montoCobrar         8858.01
montoCobrado        8858.01
vecesPagadas        8858.01
totalCobros            7.00
proporcionPagos     1265.43
pagare             60740.64
capital            21619.00
month_fa              11.00
day_fa                16.00
year_fa             2024.00
Name: 64958, dtype: float64

In [62]:
# Test the model with X_test[0]

model.predict(X_test.iloc[0].values.reshape(1, -1))

array([0.10517754], dtype=float32)

# Modelo para predicción de gasto

In [87]:
y = df['montoCobrado2025']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [88]:
model = XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

In [89]:
y_pred = model.predict(X_test)

print("Mean Squared Error:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 981.1255613077931
R² Score: 0.6595696219129716


In [69]:
model.predict(X_test.iloc[0].values.reshape(1, -1))

array([6486.9575], dtype=float32)

In [70]:
y_test.iloc[0]

np.float64(5061.72)

In [66]:
# Calculate the sum of all the predictions on test set
y_pred_sum = y_pred.sum()
# COmpare with the sum of the actual values
y_test_sum = y_test.sum()
print("Sum of predictions:", y_pred_sum)
print("Sum of actual values:", y_test_sum)

Sum of predictions: 41666064.0
Sum of actual values: 41338696.82


In [67]:
joblib.dump(model, 'xgboostTotal_model.pkl')

['xgboostTotal_model.pkl']