In [2]:
import pandas as pd

from homework.homework import (
    load_data,
    clean_data,
    split,
    build_pipeline,
    tune_hyperparameters,
    compute_metrics,
    compute_confusion,
    save_model,
    save_metrics,
)


In [3]:
# load_data() returns two frames, unpacked here as the train and test sets.
df_train, df_test = load_data()

# Run the same cleaning step on each partition (train first, then test).
df_train_clean, df_test_clean = clean_data(df_train), clean_data(df_test)

# split() yields a pair per frame -- presumably (features, target); confirm
# against homework.homework.split.
(X_train, y_train), (X_test, y_test) = (
    split(df_train_clean),
    split(df_test_clean),
)

# Preview the first rows of the cleaned training frame.
df_train_clean.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [4]:
# Build the untuned pipeline; tune_hyperparameters receives it as `model`.
pipeline = build_pipeline()

# All arguments are passed by keyword: training data first, search options after.
grid = tune_hyperparameters(
    x_train=X_train,
    y_train=y_train,
    model=pipeline,
    scoring="balanced_accuracy",
    n_splits=10,
)

# Report the best cross-validated score and the parameter set that achieved it.
print("Mejor balanced accuracy (CV):", grid.best_score_)
print("Mejores hiperparámetros:", grid.best_params_)


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Mejor balanced accuracy (CV): 0.6501697298087941
Mejores hiperparámetros: {'classifier__gamma': 0.099, 'classifier__kernel': 'rbf', 'pca__n_components': 20, 'selectkbest__k': 12}


In [5]:
# Predict on both partitions with the tuned search object (train first, then test).
y_pred_train, y_pred_test = grid.predict(X_train), grid.predict(X_test)

In [6]:
# Score the predictions for each partition; the first argument labels the
# dataset in the returned record.
train_metrics, test_metrics = (
    compute_metrics("train", y_train, y_pred_train),
    compute_metrics("test", y_test, y_pred_test),
)

# Confusion-matrix records for the same two partitions.
train_cm, test_cm = (
    compute_confusion("train", y_train, y_pred_train),
    compute_confusion("test", y_test, y_pred_test),
)

# Display all four records together as the cell output.
(train_metrics, test_metrics, train_cm, test_cm)


({'type': 'metrics',
  'dataset': 'train',
  'precision': 0.7025742574257425,
  'balanced_accuracy': 0.6645858487453262,
  'recall': 0.3754497354497354,
  'f1_score': 0.48937931034482757},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.6742909423604757,
  'balanced_accuracy': 0.6681707063602494,
  'recall': 0.38667366211962223,
  'f1_score': 0.4914971657219073},
 {'type': 'cm_matrix',
  'dataset': 'train',
  'true_0': {'predicted_0': 15477, 'predicted_1': 751},
  'true_1': {'predicted_0': 2951, 'predicted_1': 1774}},
 {'type': 'cm_matrix',
  'dataset': 'test',
  'true_0': {'predicted_0': 6717, 'predicted_1': 356},
  'true_1': {'predicted_0': 1169, 'predicted_1': 737}})

In [7]:
# Collect the records in the order they are handed to save_metrics:
# train metrics, test metrics, train confusion matrix, test confusion matrix.
results = [train_metrics, test_metrics, train_cm, test_cm]

# Persist the fitted search object first, then the metrics file.
save_model(grid, "files/models/model.pkl.gz")
save_metrics(results, "files/output/metrics.json")
