In [76]:
# Load de data

import zipfile
import pandas as pd

def load_data(zip_path):
    try:
        with zipfile.ZipFile(zip_path,"r") as z:
            csv_filename=z.namelist()[0]

            with z.open(csv_filename) as f:
                df=pd.read_csv(f)
        print(f"File {csv_filename} uploaded succesfully")
        return df
    except Exception as e:
        print(f" Error loagind the file {e}")




In [77]:
test_data=load_data("../files/input/test_data.csv.zip")
train_data=load_data("../files/input/train_data.csv.zip")

File test_default_of_credit_card_clients.csv uploaded succesfully
File train_default_of_credit_card_clients.csv uploaded succesfully


In [78]:
test_data.shape
train_data.shape


(21000, 25)

In [79]:
# Step 1
def data_cleaning(dataset, name="dataset"):
    dataset.rename(columns={"default payment next month":"default"}, inplace=True)
    dataset.drop(columns="ID",inplace=True)
    dataset.dropna(inplace=True)
    dataset.loc[dataset["EDUCATION"]>4,"EDUCATION"]=4
    print(f"Data {name} cleaned successfully")
    return dataset


In [80]:
train_data=data_cleaning(train_data,"train_data")
test_data=data_cleaning(test_data,"test_data")

Data train_data cleaned successfully
Data test_data cleaned successfully


In [81]:
train_data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [82]:
# Step 2
def split_data(train_data,test_data):
    X_train=train_data.drop(columns="default")
    X_test=test_data.drop(columns="default")

    y_train=train_data["default"]
    y_test=test_data["default"]

    print("Correctly split dataset")

    return X_train,y_train,X_test,y_test


In [83]:
X_train, y_train, X_test, y_test = split_data(train_data,test_data)

Correctly split dataset


In [84]:
# Step 3
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

def classification_pipeline(X_train, y_train):

    categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
    numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('scaler', StandardScaler(with_mean=True, with_std=True), numeric_cols),
        ]
    )

    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA()),  
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ('classifier', SVC( random_state=42))
    ])

    
    model.fit(X_train, y_train)
    
    return model



In [85]:
model=classification_pipeline(X_train,y_train)

In [None]:
# Step #4

from sklearn.model_selection import GridSearchCV, KFold
import numpy as np

def Hyperparameter_optimization(model, X_train, y_train):

    param_grid = {
    'pca__n_components':[20],
    'feature_selection__k':[12], 
    'classifier__kernel': ['rbf'], 
    'classifier__gamma': [0.099],
}


    
    cv = KFold(n_splits=10, shuffle=True, random_state=132)

    # GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='balanced_accuracy',
        cv=cv,
        n_jobs=-1,
        verbose=1,
        refit=True
    )

    # Ajustar el modelo
    grid_search.fit(X_train, y_train)

    return grid_search


In [87]:
model=Hyperparameter_optimization(model,X_train,y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [88]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

#predicciones
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

#métricas para entrenamiento
precision_train = precision_score(y_train, y_pred_train, zero_division=0)
balanced_acc_train = balanced_accuracy_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train, zero_division=0)
f1_train = f1_score(y_train, y_pred_train, zero_division=0)

#métricas para prueba
precision_test = precision_score(y_test, y_pred_test, zero_division=0)
balanced_acc_test = balanced_accuracy_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test, zero_division=0)
f1_test = f1_score(y_test, y_pred_test, zero_division=0)

#resultados
print({'type': 'metrics', 'dataset': 'train',
       'precision': round(precision_train, 4),
       'balanced_accuracy': round(balanced_acc_train, 4),
       'recall': round(recall_train, 4),
       'f1_score': round(f1_train, 4)})

print({'type': 'metrics', 'dataset': 'test',
       'precision': round(precision_test, 4),
       'balanced_accuracy': round(balanced_acc_test, 4),
       'recall': round(recall_test, 4),
       'f1_score': round(f1_test, 4)})


{'type': 'metrics', 'dataset': 'train', 'precision': 0.7016, 'balanced_accuracy': 0.6644, 'recall': 0.3751, 'f1_score': 0.4888}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.6743, 'balanced_accuracy': 0.6675, 'recall': 0.385, 'f1_score': 0.4902}


In [89]:
# Step 5
import gzip 
import pickle
import os

def save_model(model,path="file path.gzip"):
    os.makedirs(os.path.dirname(path),exist_ok=True)

    with gzip.open(path,"wb") as f:
        pickle.dump(model,f)
    print("Model saved!")



In [90]:
save_model(model,"../files/models/model.pkl.gz")

Model saved!


In [91]:
# Step 6

from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
import os
import json

def metrics_evaluation(model,X_train,y_train,X_test,y_test,data_set_name="train or test"):
        
          if data_set_name == "train":
               y_true = y_train
               y_pred = model.predict(X_train)
          elif data_set_name == "test":
               y_true = y_test
               y_pred = model.predict(X_test)
          else:
               raise ValueError("data_set_name must be 'train' o 'test'")

          #Metrics 
          precision = precision_score(y_true, y_pred)
          balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
          recall = recall_score(y_true, y_pred)
          f1 = f1_score(y_true, y_pred)

          return {
          "type": "metrics",
          'dataset': data_set_name,
          'precision': float(precision),
          'balanced_accuracy': float(balanced_accuracy),
          'recall': float(recall),
          'f1_score': float(f1)
          }

def load_metrics(path,metrics_train,metrics_test):
      os.makedirs(os.path.dirname(path),exist_ok=True)

      with open(path,'w',encoding='utf-8') as f:
            json.dump(metrics_train,f)
            f.write('\n')
            json.dump(metrics_test,f)
            f.write('\n')
      print("Metrics Saved!!")



In [92]:
metrics_train=metrics_evaluation(model,X_train,y_train,X_test,y_test,"train")
metrics_test=metrics_evaluation(model,X_train,y_train,X_test,y_test,"test")

load_metrics("../files/output/metrics.json",metrics_train,metrics_test)

Metrics Saved!!


In [93]:
# Step 7 
from sklearn.metrics import confusion_matrix

def cm_metrics(model,X_train,y_train,X_test,y_test,data_set_name="train or test"):
        
        if data_set_name == "train":
             y_true = y_train
             y_pred = model.predict(X_train)
        elif data_set_name == "test":
             y_true = y_test
             y_pred = model.predict(X_test)
        else:
            raise ValueError("data_set_name must be 'train' o 'test'")
        
        #Confusion matriz 
        cm = confusion_matrix(y_true, y_pred)

          # Desempaquetar valores (para binario)
        tn, fp, fn, tp = cm.ravel()

        return {
        "type": "cm_matrix",
        "dataset": data_set_name,
        "true_0": {"predicted_0": int(tn), "predicted_1": int(fp)},
        "true_1": {"predicted_0": int(fn), "predicted_1": int(tp)}
        }

def load_cm(path,cm_train,cm_test):
      os.makedirs(os.path.dirname(path),exist_ok=True)

      with open(path, 'a', encoding='utf-8') as f:
        json.dump(cm_train, f)
        f.write("\n")
        json.dump(cm_test, f)
        f.write("\n")
        print("Metrics saved!")

In [94]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("TN:", tn, "FP:", fp, "FN:", fn, "TP:", tp)
#TN: 6758 FP: 333 FN: 1196 TP: 713

TN: 6736 FP: 355 FN: 1174 TP: 735


In [95]:
cm_train=cm_metrics(model,X_train,y_train,X_test,y_test,data_set_name="train")

cm_test=cm_metrics(model,X_train,y_train,X_test,y_test,data_set_name="test")

load_cm("../files/output/metrics.json",cm_train,cm_test)

Metrics saved!


In [96]:
print(cm_train)

{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 15519, 'predicted_1': 754}, 'true_1': {'predicted_0': 2954, 'predicted_1': 1773}}
