In [62]:
import pandas as pd 
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

In [63]:
# Lectura de bases de datos
train = pd.read_csv('../files/input/train_data.csv.zip')
test = pd.read_csv('../files/input/test_data.csv.zip')
train.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [64]:
# Rename
train = train.rename(columns={'default payment next month': 'default'})
test = test.rename(columns={'default payment next month': 'default'})

# Drop ID
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])


In [65]:
## variables con valores nulos para test
test.isnull().sum()

LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64

In [66]:
## variables con valores nulos para train
train.isnull().sum()

LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64

In [67]:
## Eliminando en EDUCATION y MARRIEAGE los valores en 0 que representan NA
train = train[(train['EDUCATION'] != 0) & (train['MARRIAGE'] != 0)]
test = test[(test['EDUCATION'] != 0) & (test['MARRIAGE'] != 0)]

In [68]:
def niveles_educacion(df, column = "EDUCATION"):
    df[column] = df[column].apply(lambda x: 4 if x > 4 else x)
    return df

train = niveles_educacion(train)
test = niveles_educacion(test)

In [69]:
test['EDUCATION'].value_counts()

EDUCATION
2    4268
1    3105
3    1477
4     129
Name: count, dtype: int64

In [70]:
## variables categoricas
def convertir_categoricas(dataset):
    df = dataset.copy()
    df['SEX'] = df['SEX'].astype('category')
    df['EDUCATION'] = df['EDUCATION'].astype('category')
    df['MARRIAGE'] = df['MARRIAGE'].astype('category')
    df['default'] = df['default'].astype('category')
    return df

train = convertir_categoricas(dataset=train)
test = convertir_categoricas(dataset=test)

In [71]:
## División de datos para train y test
x_train = train.drop(columns=['default'], axis=1)
y_train = train['default']

x_test = test.drop(columns=['default'], axis=1)
y_test = test['default']

In [72]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler  
from sklearn.feature_selection import SelectKBest, f_classif

In [73]:
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

preprocessor = ColumnTransformer(
    transformers = [
        ('categorias', OneHotEncoder(drop='if_binary'), categorical_features)
    ],
    remainder=MinMaxScaler()
)

k_best = SelectKBest(score_func=f_classif, k=10)

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('feature_selection', k_best),
        ('classifier', LogisticRegression(penalty='l2', n_jobs=-1, random_state=666, max_iter=7000))
        
    ]
)


In [74]:
x_transformed = preprocessor.fit_transform(x_train)

x_transformed = pd.DataFrame(x_transformed, columns=preprocessor.get_feature_names_out())
len(x_transformed.columns)

28

In [75]:
k_best.fit(x_train, y_train)

feature_names = x_train.columns

scores = pd.DataFrame({
    'Feature': feature_names,
    'F-Score': k_best.scores_,
    'P-Value': k_best.pvalues_
})

scores = scores.sort_values(by='F-Score', ascending=False)
scores

Unnamed: 0,Feature,F-Score,P-Value
5,PAY_0,2422.194387,0.0
6,PAY_2,1505.460155,4.290767e-318
7,PAY_3,1227.709807,1.969449e-261
8,PAY_4,1019.048942,2.174002e-218
9,PAY_5,902.639845,3.403415e-194
10,PAY_6,743.220697,7.614765e-161
0,LIMIT_BAL,497.499225,6.164117e-109
17,PAY_AMT1,101.295942,8.971592e-24
20,PAY_AMT4,80.341831,3.4072739999999998e-19
21,PAY_AMT5,67.66876,2.045689e-16


In [78]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

param_grid = {
    'feature_selection__k': [1, 2, 4, 7],
    'classifier__C':[1, 100],
    'classifier__solver':['lbfgs', 'sag']
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True
)

model.fit(x_train, y_train)


In [79]:
print('mejores parámetros:', model.best_params_)
print('mejor exactitud validación cruzada', model.best_score_)
print('exactitud dataset de entrenamiento:', model.score(x_train, y_train))
print('exactitud dataset de prueba:', model.score(x_test, y_test))

mejores parámetros: {'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__k': 1}
mejor exactitud validación cruzada 0.6392688664250823
exactitud dataset de entrenamiento: 0.6392682710528409
exactitud dataset de prueba: 0.6547057822566611


In [80]:
import pickle
import gzip
import os

os.makedirs('../files/models', exist_ok=True)

with gzip.open("../files/models/model.pkl.gz","wb") as file:
    pickle.dump(model, file)

In [81]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

def calcular_metricas(modelo, x, y, tipo):
    y_pred = modelo.predict(x)
    metrics = {
    "type": "metrics",
    'dataset': tipo,
    'precision': precision_score(y, y_pred),
    'balanced_accuracy': balanced_accuracy_score(y, y_pred),
    'recall': recall_score(y, y_pred),
    'f1_score': f1_score(y, y_pred)
    }
    return metrics

train_metrics = calcular_metricas(modelo=model , x=x_train, y=y_train, tipo='train')
test_metrics = calcular_metricas(modelo=model , x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

output_dir = '../files/output'
os.makedirs(output_dir, exist_ok=True)

with open('../files/output/metrics.json', 'w') as file:
    for metrica in metricas:
        file.write(json.dumps(metrica)+ '\n')

In [82]:
from sklearn.metrics import confusion_matrix

def matriz_confusion(modelo, x, y, tipo):
    y_pred = modelo.predict(x)
    cm = confusion_matrix(y, y_pred)
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': tipo,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
    }
    return cm_dict

train_cm = matriz_confusion(modelo=model, x=x_train, y=y_train, tipo='train')
test_cm = matriz_confusion(modelo=model, x=x_test, y=y_test, tipo='test')

metricas_extendidas = [train_cm, test_cm]

with open('../files/output/metrics.json', 'a') as file:
    for metrica in metricas_extendidas:
        file.write(json.dumps(metrica) + '\n')