In [1]:
# Load the data

import zipfile
import pandas as pd

def load_data(zip_path):
    """Read the CSV stored inside a zip archive into a DataFrame.

    The first archive member whose name ends in ``.csv`` is used (entries
    under ``__MACOSX/`` are skipped). If no member has that extension, the
    first member is read, as before.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the ``.zip`` file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the archive could not be read
        (the error is printed rather than raised, so callers must check for
        ``None`` before using the result).
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as z:
            members = z.namelist()
            # `members[0]` is evaluated eagerly, so an empty archive still
            # raises IndexError here and is reported by the handler below.
            csv_filename = next(
                (
                    member for member in members
                    if member.lower().endswith(".csv")
                    and not member.startswith("__MACOSX/")
                ),
                members[0],
            )

            with z.open(csv_filename) as f:
                df = pd.read_csv(f)
        print(f"File {csv_filename} uploaded succesfully")
        return df
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f" Error loagind the file {e}")
        return None




In [2]:
# Read the held-out and the training splits from their zipped CSV files.
test_data = load_data(zip_path="../files/input/test_data.csv.zip")
train_data = load_data(zip_path="../files/input/train_data.csv.zip")

File test_default_of_credit_card_clients.csv uploaded succesfully
File train_default_of_credit_card_clients.csv uploaded succesfully


In [3]:
# Preview the first ten raw test rows.
test_data.head(n=10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0
5,19,360000,2,1,1,49,1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
6,21,130000,2,3,2,39,0,0,0,0,...,20616,11802,930,3000,1537,1000,2000,930,33764,0
7,28,50000,2,3,2,30,0,0,0,0,...,17878,18931,19617,1300,1300,1000,1500,1000,1012,0
8,30,50000,1,1,2,26,0,0,0,0,...,17907,18375,11400,1500,1500,1000,1000,1600,0,0
9,31,230000,2,1,2,27,-1,-1,-1,-1,...,15339,14307,36923,17270,13281,15339,14307,37292,0,0


In [4]:
# Step 1
def data_cleaning(dataset, name="dataset"):
    """Return a cleaned copy of a raw credit-default table.

    Steps, in order:
      1. rename ``"default payment next month"`` to ``"default"``;
      2. drop the ``"ID"`` column if it is present;
      3. drop every row that contains a missing value;
      4. replace ``EDUCATION`` values greater than 4 with 4.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely; calling it on an already cleaned frame is a no-op.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; must contain an ``EDUCATION`` column.
    name : str
        Label used only in the confirmation message.

    Returns
    -------
    pandas.DataFrame
        New, cleaned frame (original index labels of surviving rows are kept).
    """
    cleaned = (
        dataset
        .rename(columns={"default payment next month": "default"})
        # errors="ignore" keeps the function idempotent once ID is gone.
        .drop(columns="ID", errors="ignore")
        .dropna()
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].mask(frame["EDUCATION"] > 4, 4))
    )
    print(f"Data {name} cleaned successfully")
    return cleaned


In [5]:
# Clean both splits with the same rules; the names are rebound to the results.
train_data = data_cleaning(dataset=train_data, name="train_data")
test_data = data_cleaning(dataset=test_data, name="test_data")

Data train_data cleaned successfully
Data test_data cleaned successfully


In [6]:
# Preview the first five cleaned training rows.
train_data.head(n=5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [7]:
# Step 2
def split_data(train_data,test_data):
    """Separate features and target for the train and test tables.

    Both frames must contain a ``"default"`` column, which becomes the target;
    every other column is kept as a feature. The inputs are not modified.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    target = "default"

    (X_train, y_train), (X_test, y_test) = (
        (frame.drop(columns=target), frame[target])
        for frame in (train_data, test_data)
    )

    print("Correctly split dataset")

    return X_train, y_train, X_test, y_test


In [8]:
# Feature matrices and target vectors for both splits.
X_train, y_train, X_test, y_test = split_data(
    train_data=train_data,
    test_data=test_data,
)

Correctly split dataset


In [9]:
# Step 3

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif,chi2

def classification_pipeline(X_train, y_train):
    """Build and fit the preprocessing + selection + logistic-regression pipeline.

    Only the columns listed below are routed to a transformer: the categorical
    ones are one-hot encoded (``handle_unknown='ignore'``) and the numeric ones
    are min-max scaled. ``SelectKBest`` then keeps the 20 features with the
    best ``f_classif`` score before a ``LogisticRegression`` (seeded with
    ``random_state=42``) is trained.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column named below.
    y_train : array-like
        Training target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline, already fitted on ``(X_train, y_train)``.
    """
    one_hot_features = [
        'SEX', 'EDUCATION', 'MARRIAGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    ]
    scaled_features = [
        'LIMIT_BAL', 'AGE',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    ]

    column_steps = [
        ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_features),
        ('num', MinMaxScaler(), scaled_features),
    ]
    pipeline_steps = [
        ('preprocessor', ColumnTransformer(column_steps)),
        ('selection', SelectKBest(score_func=f_classif, k=20)),
        ('model', LogisticRegression(random_state=42)),
    ]

    fitted_pipeline = Pipeline(pipeline_steps)
    fitted_pipeline.fit(X_train, y_train)
    return fitted_pipeline





In [10]:
# Baseline pipeline fitted with its default hyperparameters.
model = classification_pipeline(X_train=X_train, y_train=y_train)

In [11]:
# Step 4
from sklearn.model_selection import GridSearchCV
import numpy as np

def Hyperparameter_optimization(model, X_train, y_train, param_grid=None,
                                scoring='balanced_accuracy', cv=10):
    """Tune the pipeline with an exhaustive, cross-validated grid search.

    Parameters
    ----------
    model : estimator
        Pipeline exposing the ``selection`` and ``model`` steps referenced by
        the default grid.
    X_train, y_train
        Training features and target used for the search.
    param_grid : dict or list of dict, optional
        Grid to explore. When omitted, the default grid below is used
        (100 candidates). Pass a custom grid to try alternatives such as the
        ``lbfgs`` solver, which should be paired with the ``l2`` penalty only.
    scoring : str, optional
        Metric used to rank candidates (default ``'balanced_accuracy'``).
    cv : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object; with ``refit=True`` it can be used directly
        for ``predict``.

    Notes
    -----
    Author's rationale for the default grid: too many one-hot dummy features
    tend to produce false positives, so ``k`` is kept small. A small ``C``
    means strong regularisation, which penalises large coefficients and makes
    the model more conservative about predicting class 1 (default): fewer
    false positives and higher precision. The ``{0: 1.5, 1: 1}`` class weight
    adds a slight bias towards class 0 for the same reason.
    """
    if param_grid is None:
        param_grid = {
            "selection__k": [8, 10, 12, 15, 20],
            "model__solver": ["liblinear"],         # fast and supports L1
            "model__penalty": ["l1", "l2"],         # both work with liblinear
            "model__C": [1e-4, 3e-4, 1e-3, 3e-3, 1e-2],
            "model__max_iter": [2000],
            "model__class_weight": [None, {0: 1.5, 1: 1}],
        }

    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               scoring=scoring,
                               cv=cv,
                               n_jobs=-1,
                               verbose=1,
                               refit=True)

    grid_search.fit(X_train, y_train)

    return grid_search



In [12]:
# NOTE: `model` is rebound from the pipeline to the fitted GridSearchCV here,
# so re-run the previous cell before re-running this one.
model = Hyperparameter_optimization(model=model, X_train=X_train, y_train=y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [13]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

# === Compute the predictions ===
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# === Compute the training metrics ===
precision_train, balanced_acc_train, recall_train, f1_train = (
    precision_score(y_train, y_pred_train, zero_division=0),
    balanced_accuracy_score(y_train, y_pred_train),
    recall_score(y_train, y_pred_train, zero_division=0),
    f1_score(y_train, y_pred_train, zero_division=0),
)

# === Compute the test metrics ===
precision_test, balanced_acc_test, recall_test, f1_test = (
    precision_score(y_test, y_pred_test, zero_division=0),
    balanced_accuracy_score(y_test, y_pred_test),
    recall_score(y_test, y_pred_test, zero_division=0),
    f1_score(y_test, y_pred_test, zero_division=0),
)

# === Print the results in the requested format ===
print(dict(
    type='metrics',
    dataset='train',
    precision=round(precision_train, 4),
    balanced_accuracy=round(balanced_acc_train, 4),
    recall=round(recall_train, 4),
    f1_score=round(f1_train, 4),
))

print(dict(
    type='metrics',
    dataset='test',
    precision=round(precision_test, 4),
    balanced_accuracy=round(balanced_acc_test, 4),
    recall=round(recall_test, 4),
    f1_score=round(f1_test, 4),
))


{'type': 'metrics', 'dataset': 'train', 'precision': 0.6831, 'balanced_accuracy': np.float64(0.6204), 'recall': 0.2782, 'f1_score': 0.3954}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.6883, 'balanced_accuracy': np.float64(0.6341), 'recall': 0.3054, 'f1_score': 0.4231}


In [14]:
# Step 5
import gzip 
import pickle
import os

def save_model(model,path="file path.gzip"):
    """Pickle ``model`` into a gzip-compressed file at ``path``.

    Missing parent directories are created first. A bare filename (no
    directory component, as in the default value) is written to the current
    working directory.

    Parameters
    ----------
    model : object
        Any picklable object.
    path : str
        Destination file.
    """
    directory = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with gzip.open(path, "wb") as f:
        pickle.dump(model, f)
    print("Model saved!")



In [15]:
# Persist the fitted search object as a gzip-compressed pickle.
save_model(model, path="../files/models/model.pkl.gz")

Model saved!


In [16]:
# Step 6

from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
import os
import json

def metrics_evaluation(model,X_train,y_train,X_test,y_test,data_set_name="train or test"):
    """Score ``model`` on one split and return the metrics as a plain dict.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X_train, y_train, X_test, y_test
        Features and targets of both splits; only the split selected by
        ``data_set_name`` is used.
    data_set_name : str
        ``"train"`` or ``"test"``.

    Returns
    -------
    dict
        Keys ``type``, ``dataset``, ``precision``, ``balanced_accuracy``,
        ``recall`` and ``f1_score``; every score is a built-in ``float`` so
        the record can be serialised with ``json``.

    Raises
    ------
    ValueError
        If ``data_set_name`` is neither ``"train"`` nor ``"test"``.
    """
    if data_set_name == "train":
        features, y_true = X_train, y_train
    elif data_set_name == "test":
        features, y_true = X_test, y_test
    else:
        raise ValueError("data_set_name must be 'train' o 'test'")

    y_pred = model.predict(features)

    # zero_division=0 matches the ad-hoc metric code earlier in the notebook
    # and avoids a warning when the model predicts no positives.
    precision = precision_score(y_true, y_pred, zero_division=0)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return {
        "type": "metrics",
        'dataset': data_set_name,
        'precision': float(precision),
        'balanced_accuracy': float(balanced_accuracy),
        'recall': float(recall),
        'f1_score': float(f1)
    }

def load_metrics(path,metrics_train,metrics_test):
    """Write the train and test metric records to ``path`` as JSON lines.

    The file is opened in write mode, so any previous content is replaced.
    Missing parent directories are created; a bare filename is written to the
    current working directory.

    Parameters
    ----------
    path : str
        Destination file.
    metrics_train, metrics_test : dict
        JSON-serialisable records, written in that order, one per line.
    """
    directory = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        for record in (metrics_train, metrics_test):
            json.dump(record, f)
            f.write('\n')
    print("Metrics Saved!!")



In [17]:
# Score both splits, then start the metrics file with those two records.
metrics_train = metrics_evaluation(model, X_train, y_train, X_test, y_test, data_set_name="train")
metrics_test = metrics_evaluation(model, X_train, y_train, X_test, y_test, data_set_name="test")

load_metrics(
    path="../files/output/metrics.json",
    metrics_train=metrics_train,
    metrics_test=metrics_test,
)

Metrics Saved!!


In [18]:
# Step 7 
from sklearn.metrics import confusion_matrix

def cm_metrics(model,X_train,y_train,X_test,y_test,data_set_name="train or test"):
    """Build the binary confusion-matrix record for one split.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X_train, y_train, X_test, y_test
        Features and targets of both splits; only the split selected by
        ``data_set_name`` is used.
    data_set_name : str
        ``"train"`` or ``"test"``.

    Returns
    -------
    dict
        ``{"type": "cm_matrix", "dataset": ..., "true_0": {...}, "true_1": {...}}``
        with built-in ``int`` counts, ready for ``json`` serialisation.

    Raises
    ------
    ValueError
        If ``data_set_name`` is neither ``"train"`` nor ``"test"``.
    """
    if data_set_name == "train":
        features, y_true = X_train, y_train
    elif data_set_name == "test":
        features, y_true = X_test, y_test
    else:
        raise ValueError("data_set_name must be 'train' o 'test'")

    y_pred = model.predict(features)

    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from both y_true and y_pred; otherwise the unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Row-major order of a binary matrix: TN, FP, FN, TP.
    tn, fp, fn, tp = cm.ravel()

    return {
        "type": "cm_matrix",
        "dataset": data_set_name,
        "true_0": {"predicted_0": int(tn), "predicted_1": int(fp)},
        "true_1": {"predicted_0": int(fn), "predicted_1": int(tp)}
    }

def load_cm(path,cm_train,cm_test):
    """Append the train and test confusion-matrix records to ``path``.

    The file is opened in append mode, so existing lines are kept and calling
    this function again adds the two records again. Missing parent
    directories are created; a bare filename is written to the current
    working directory.

    Parameters
    ----------
    path : str
        Destination file (JSON lines).
    cm_train, cm_test : dict
        JSON-serialisable records, appended in that order, one per line.
    """
    directory = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'a', encoding='utf-8') as f:
        for record in (cm_train, cm_test):
            json.dump(record, f)
            f.write("\n")
        print("Metrics saved!")

In [19]:
from sklearn.metrics import confusion_matrix

# Sanity check: raw confusion-matrix counts on the test split.
y_pred = model.predict(X_test)
test_confusion = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = test_confusion.ravel()
print("TN:", tn, "FP:", fp, "FN:", fn, "TP:", tp)
# Counts noted by the author, presumably from an earlier run:
# TN: 6758 FP: 333 FN: 1196 TP: 713

TN: 6827 FP: 264 FN: 1326 TP: 583


In [20]:
# Confusion-matrix records for both splits, appended to the metrics file.
cm_train = cm_metrics(model, X_train, y_train, X_test, y_test, "train")

cm_test = cm_metrics(model, X_train, y_train, X_test, y_test, "test")

load_cm(path="../files/output/metrics.json", cm_train=cm_train, cm_test=cm_test)

Metrics saved!


In [21]:
# Show the training-split confusion-matrix record built in the previous cell.
print(cm_train)

{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 15663, 'predicted_1': 610}, 'true_1': {'predicted_0': 3412, 'predicted_1': 1315}}
