In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
import gzip
import json
import os
import pickle


In [2]:
# Load the train and test splits; read_csv is handed the zipped CSV paths directly
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../files/input/train_data.csv.zip',
        '../files/input/test_data.csv.zip',
    )
)

# Step 1: Data Cleaning
def clean_data(df):
    """Return a cleaned copy of a raw credit-default frame.

    Steps, in order:
      1. Rename ``'default payment next month'`` to ``'default'``.
      2. Drop the ``'ID'`` column.
      3. Drop every row that contains a missing value.
      4. Drop rows where ``EDUCATION`` or ``MARRIAGE`` equals 0.
      5. Cap ``EDUCATION`` at 4 (any larger code becomes 4).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain the ``ID``, ``EDUCATION`` and ``MARRIAGE``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified). Surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing.
    """
    df = df.rename(columns={'default payment next month': 'default'})
    df = df.drop(columns=['ID'])
    df = df.dropna()
    # Filter with a boolean mask through .loc. Once dropna() has removed rows,
    # index labels no longer equal row positions, so passing labels to .iloc
    # would select the wrong rows or raise IndexError.
    valid_rows = (df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)
    # .copy() gives an independent frame, so the assignment below never writes
    # into a view of the pre-filter data.
    df = df.loc[valid_rows].copy()
    df['EDUCATION'] = df['EDUCATION'].clip(upper=4)
    return df

# Apply the same cleaning rules to both splits (train first, then test).
# Note: the names are rebound in place, so re-running this cell alone fails
# because clean_data drops the 'ID' column that is then already gone.
train_data, test_data = clean_data(train_data), clean_data(test_data)


In [3]:
# Step 2: Separate the feature columns from the 'default' target in each split
x_train, y_train = train_data.drop(columns=['default']), train_data['default']
x_test, y_test = test_data.drop(columns=['default']), test_data['default']

In [4]:
# Step 3: Create pipeline
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
# Every remaining column. NOTE(review): this list is not referenced again in the
# visible cells; the ColumnTransformer reaches these columns through
# remainder='passthrough' instead.
numeric_features = [
    column for column in x_train.columns if column not in categorical_features
]

# One-hot encode the categorical columns; all other columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

# Keep only the k highest-scoring features according to f_classif (k=1 here).
k_best_selector = SelectKBest(score_func=f_classif, k=1)

# Full pipeline: preprocessing -> feature selection -> scaling -> classifier.
# The MinMaxScaler sits after SelectKBest, so it only scales the selected feature(s).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('kbest', k_best_selector),
        ('num', MinMaxScaler()),
        (
            'estimator',
            LogisticRegression(n_jobs=-1, random_state=666, class_weight=None),
        ),
    ],
    verbose=False,
)


In [5]:
# Step 4: Hyperparameter optimization
# The grid holds a single candidate, so the search amounts to one 10-fold
# cross-validated fit scored by balanced accuracy.
param_grid = {
    'estimator__C': [1],
    'estimator__solver': ['lbfgs'],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
)
grid_search.fit(x_train, y_train)

# Report the selected parameters and their cross-validated score
print("Best parameters found: ", grid_search.best_params_)
print("Best balanced accuracy score: ", grid_search.best_score_)


Best parameters found:  {'estimator__C': 1, 'estimator__solver': 'lbfgs'}
Best balanced accuracy score:  0.6392688664250823


In [6]:
# Persist the fitted GridSearchCV object as a gzip-compressed pickle,
# creating the target directory first if it does not exist yet
output_dir = '../files/models'
os.makedirs(output_dir, exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [7]:
# Step 5: Calculate metrics
def calculate_metrics(model, x, y, dataset_type):
    """Score ``model`` on one data split and return the results as a dict.

    Parameters
    ----------
    model : object exposing ``predict(x)``
    x : features passed to ``model.predict``
    y : true labels compared against the predictions
    dataset_type : value stored under the ``'dataset'`` key (e.g. 'train')

    Returns
    -------
    dict
        Keys, in order: ``'type'`` (always ``'metrics'``), ``'dataset'``,
        ``'precision'``, ``'balanced_accuracy'``, ``'recall'``, ``'f1_score'``.
    """
    predictions = model.predict(x)

    report = {'type': 'metrics', 'dataset': dataset_type}
    # zero_division=0 is forwarded to the precision/recall/F1 scorers
    report['precision'] = precision_score(y, predictions, zero_division=0)
    report['balanced_accuracy'] = balanced_accuracy_score(y, predictions)
    report['recall'] = recall_score(y, predictions, zero_division=0)
    report['f1_score'] = f1_score(y, predictions, zero_division=0)
    return report

# Score the best estimator found by the grid search on both splits
train_metrics, test_metrics = (
    calculate_metrics(grid_search.best_estimator_, features, target, split_name)
    for features, target, split_name in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

In [8]:
# Step 6: Save metrics
# Write one JSON object per line (train first, then test), replacing any
# existing metrics.json
output_dir = '../files/output'
os.makedirs(output_dir, exist_ok=True)
metrics = [train_metrics, test_metrics]
with open('../files/output/metrics.json', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in metrics)


In [9]:
# Step 7: Calculate confusion matrices
def calculate_confusion_matrix(model, x, y, dataset_type):
    """Build a JSON-serializable confusion-matrix record for one data split.

    Parameters
    ----------
    model : object exposing ``predict(x)``
    x : features passed to ``model.predict``
    y : true labels; assumed to be the binary classes 0 and 1, matching the
        ``true_0`` / ``true_1`` keys of the result
    dataset_type : value stored under the ``'dataset'`` key (e.g. 'train')

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': dataset_type,
        'true_0': {'predicted_0': int, 'predicted_1': int},
        'true_1': {'predicted_0': int, 'predicted_1': int}}``
    """
    y_pred = model.predict(x)
    # Pin the class order so the matrix is always 2x2 with row/column 0 = class 0
    # and row/column 1 = class 1. Without `labels`, a split in which only one
    # class appears yields a 1x1 matrix and the cm[0, 1] lookup raises IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset_type,
        # int(...) converts the matrix entries to plain ints for json.dumps
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
    }

# Build the confusion-matrix records for both splits with the best estimator
train_cm, test_cm = (
    calculate_confusion_matrix(grid_search.best_estimator_, features, target, split_name)
    for features, target, split_name in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

In [10]:
# Save confusion matrices.
# Rewrite the whole file (metric records first, then the confusion matrices)
# rather than opening it in append mode: with append, re-running this cell
# added the confusion-matrix lines to metrics.json a second time. A full
# Run All produces the same file contents as before.
metrics_extend = [train_cm, test_cm]
with open('../files/output/metrics.json', 'w') as f:
    for metric in metrics + metrics_extend:
        f.write(json.dumps(metric) + '\n')