In [450]:
import pandas as pd
import numpy as np
import pickle

df1 = pd.read_csv("../files/input/train_data.csv.zip", index_col=False, compression="zip")
df1_ = df1.copy()
df1_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          21000 non-null  int64
 1   LIMIT_BAL                   21000 non-null  int64
 2   SEX                         21000 non-null  int64
 3   EDUCATION                   21000 non-null  int64
 4   MARRIAGE                    21000 non-null  int64
 5   AGE                         21000 non-null  int64
 6   PAY_0                       21000 non-null  int64
 7   PAY_2                       21000 non-null  int64
 8   PAY_3                       21000 non-null  int64
 9   PAY_4                       21000 non-null  int64
 10  PAY_5                       21000 non-null  int64
 11  PAY_6                       21000 non-null  int64
 12  BILL_AMT1                   21000 non-null  int64
 13  BILL_AMT2                   21000 non-null  int64
 14  BILL_A

In [451]:
df2 = pd.read_csv("../files/input/test_data.csv.zip", index_col=False, compression="zip")
df2_=df2.copy()
df2_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          9000 non-null   int64
 1   LIMIT_BAL                   9000 non-null   int64
 2   SEX                         9000 non-null   int64
 3   EDUCATION                   9000 non-null   int64
 4   MARRIAGE                    9000 non-null   int64
 5   AGE                         9000 non-null   int64
 6   PAY_0                       9000 non-null   int64
 7   PAY_2                       9000 non-null   int64
 8   PAY_3                       9000 non-null   int64
 9   PAY_4                       9000 non-null   int64
 10  PAY_5                       9000 non-null   int64
 11  PAY_6                       9000 non-null   int64
 12  BILL_AMT1                   9000 non-null   int64
 13  BILL_AMT2                   9000 non-null   int64
 14  BILL_AMT

In [452]:
def preprocess_data(df):
    df.rename(columns={"default payment next month": "default"}, inplace=True)
    df.drop(columns=["ID"], inplace=True)
    df.dropna(inplace=True)
    # - Para la columna EDUCATION, valores > 4 indican niveles superiores
    #   de educación, agrupe estos valores en la categoría "others".
    # - Para la columna MARRIAGE, valores iguales a 0 no tienen un significado
    #   conocido, agrupe estos valores en la categoría "others".

    df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4
    df = df.loc[df["EDUCATION"] !=0]
    df = df.loc[df["MARRIAGE"] !=0]
    df.dropna(inplace=True)
    
    return df
df1_ = preprocess_data(df1)
df2_ = preprocess_data(df2)

In [453]:
X_train = df1_.drop(columns=["default"])
y_train = df1_["default"]
X_test = df2_.drop(columns=["default"])
y_test = df2_["default"]

In [454]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método one-hot-encoding.
#   one-hot-encoding.
# - Escala las demas variables al intervalo [0, 1].
# - Selecciona las K mejores caracteristicas.
# - Ajusta un modelo de regresion logistica.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = ["LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numerical_features),  # Escalar variables numéricas al rango [0, 1]
        ("cat", OneHotEncoder(dtype=int, sparse_output=False), categorical_features),  # Codificación categórica
    ],
    remainder="passthrough",
)

selectkbest = SelectKBest(score_func=f_classif)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),  # Preprocesamiento
        ("feature_selection", SelectKBest(score_func=f_classif, k=20)),  # Selección de las 20 mejores características
        ("classifier", LogisticRegression(random_state=42)),  # Clasificador con pesos balanceados
    ]
)

pipeline.fit(X_train, y_train)
print("Precisión:", pipeline.score(X_test, y_test))


Precisión: 0.8236997438467535


In [455]:
'''from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
# - Transforma las variables categoricas usando el método one-hot-encoding.

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("encoder", OneHotEncoder(dtype=int, sparse_output=False)),  
            ("scaler", MinMaxScaler()),
        ]), categorical_features),
        ("kbest", SelectKBest(), categorical_features),  
    ],
    remainder="passthrough",
)

# - Escala las demas variables al intervalo [0, 1].
# - Selecciona las K mejores caracteristicas.

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),  # Preprocesamiento (categorías y numéricas)
        ("feature_selection", SelectKBest(score_func=chi2, k=20)),  # Selección de las K mejores características
        ("classifier", LogisticRegression(random_state=42))  # Modelo de regresión logística
    ]
)

pipeline.fit(X_train, y_train)
print("Precisión:", pipeline.score(X_test, y_test))'''


'from sklearn.compose import ColumnTransformer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.feature_selection import SelectKBest, chi2\n\ncategorical_features = ["SEX", "EDUCATION", "MARRIAGE"]\n# - Transforma las variables categoricas usando el método one-hot-encoding.\n\npreprocessor = ColumnTransformer(\n    transformers=[\n        ("cat", Pipeline([\n            ("encoder", OneHotEncoder(dtype=int, sparse_output=False)),  \n            ("scaler", MinMaxScaler()),\n        ]), categorical_features),\n        ("kbest", SelectKBest(), categorical_features),  \n    ],\n    remainder="passthrough",\n)\n\n# - Escala las demas variables al intervalo [0, 1].\n# - Selecciona las K mejores caracteristicas.\n\npipeline = Pipeline(\n    steps=[\n        ("preprocessor", preprocessor),  # Preprocesamiento (categorías y numéricas)\n        

In [456]:
'''from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

# Variables categóricas
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = ["LIMIT_BAL", "AGE", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

# Preprocesador para transformar variables categóricas y numéricas
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(dtype=int, sparse_output=False), categorical_features),  # Codificación categórica
        ("num", MinMaxScaler(), numerical_features),  # Escalado de variables numéricas
    ]
)

# Pipeline para el preprocesamiento, selección de características y clasificación
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),  # Preprocesamiento (categorías y numéricas)
        ("feature_selection", SelectKBest(score_func=chi2, k=20)),  # Selección de las K mejores características
        ("classifier", LogisticRegression(random_state=42))  # Modelo de regresión logística
    ]
)

# Ajustar el pipeline
pipeline.fit(X_train, y_train)

# Evaluar el pipeline
print("Precisión en el conjunto de prueba:", pipeline.score(X_test, y_test))'''


'from sklearn.compose import ColumnTransformer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import OneHotEncoder, MinMaxScaler\nfrom sklearn.feature_selection import SelectKBest, chi2\n\n# Variables categóricas\ncategorical_features = ["SEX", "EDUCATION", "MARRIAGE"]\nnumerical_features = ["LIMIT_BAL", "AGE", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]\n\n# Preprocesador para transformar variables categóricas y numéricas\npreprocessor = ColumnTransformer(\n    transformers=[\n        ("cat", OneHotEncoder(dtype=int, sparse_output=False), categorical_features),  # Codificación categórica\n        ("num", MinMaxScaler(), numerical_features),  # Escalado de variables numéricas\n    ]\n)\n\n# Pipeline para el preprocesamiento, selección de características y clasificación\npipeline = Pipeline(\n    steps=[\n        ("preprocessor", preprocessor),  # Preprocesamiento (categorías y numéricas

In [457]:
from sklearn.model_selection import GridSearchCV
# definamos nuevos hiperparametros para la regresion logistica


param_grid = {
    "classifier__C": [0.001, 0.1, 1, 10, 100, 1000],
    "classifier__max_iter": [100, 200, 300, 400, 500],
    "feature_selection__k": range(1, 31),
    "classifier__penalty": ["l1", "l2"],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True,
    verbose=True,
)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


9000 fits failed out of a total of 18000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9000 fits failed with the following error:
Traceback (most recent call last):
  File "d:\ever\PhD\Predictiva\20204-2-LAB-02-prediccion-del-default-usando-logreg-Ever708ch\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\ever\PhD\Predictiva\20204-2-LAB-02-prediccion-del-default-usando-logreg-Ever708ch\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\ever\PhD\Predictiva\20204-2-LAB-02-prediccion-del-default-usando-logreg-Ever708ch\.venv\lib\site-packages\sklearn\pipeline.py

In [458]:
import joblib
import os
import pickle
import gzip

os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid_search, f)

In [459]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
)
import json

y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

metrics = [
    {
        "type": "metrics",
        "dataset": "train",
        "precision": float(precision_score(y_train, y_train_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_train, y_train_pred)),
        "recall": float(recall_score(y_train, y_train_pred)),
        "f1_score": float(f1_score(y_train, y_train_pred)),
    },
    {
        "type": "metrics",
        "dataset": "test",
        "precision": float(precision_score(y_test, y_test_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_test_pred)),
        "recall": float(recall_score(y_test, y_test_pred)),
        "f1_score": float(f1_score(y_test, y_test_pred)),
    },
]

In [460]:
from sklearn.metrics import confusion_matrix
import os

train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

confusion_matrices = [
    {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {
            "predicted_0": int(train_cm[0, 0]),
            "predicted_1": int(train_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(train_cm[1, 0]),
            "predicted_1": int(train_cm[1, 1]),
        },
    },
    {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {
            "predicted_0": int(test_cm[0, 0]),
            "predicted_1": int(test_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(test_cm[1, 0]),
            "predicted_1": int(test_cm[1, 1]),
        },
    },
]

output_file = "../files/output/metrics.json"
os.makedirs("../files/output", exist_ok=True)

output_data = metrics + confusion_matrices

with open(output_file, "w") as f:
    for item in output_data:
        f.write(str(item).replace("'", '"') + "\n")
