In [1]:
import pandas as pd
import numpy as np
import pickle

df1 = pd.read_csv("../files/input/train_data.csv.zip", index_col=False, compression="zip")
df1_ = df1.copy()
df1_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          21000 non-null  int64
 1   LIMIT_BAL                   21000 non-null  int64
 2   SEX                         21000 non-null  int64
 3   EDUCATION                   21000 non-null  int64
 4   MARRIAGE                    21000 non-null  int64
 5   AGE                         21000 non-null  int64
 6   PAY_0                       21000 non-null  int64
 7   PAY_2                       21000 non-null  int64
 8   PAY_3                       21000 non-null  int64
 9   PAY_4                       21000 non-null  int64
 10  PAY_5                       21000 non-null  int64
 11  PAY_6                       21000 non-null  int64
 12  BILL_AMT1                   21000 non-null  int64
 13  BILL_AMT2                   21000 non-null  int64
 14  BILL_A

In [2]:
df2 = pd.read_csv("../files/input/test_data.csv.zip", index_col=False, compression="zip")
df2_=df2.copy()
df2_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          9000 non-null   int64
 1   LIMIT_BAL                   9000 non-null   int64
 2   SEX                         9000 non-null   int64
 3   EDUCATION                   9000 non-null   int64
 4   MARRIAGE                    9000 non-null   int64
 5   AGE                         9000 non-null   int64
 6   PAY_0                       9000 non-null   int64
 7   PAY_2                       9000 non-null   int64
 8   PAY_3                       9000 non-null   int64
 9   PAY_4                       9000 non-null   int64
 10  PAY_5                       9000 non-null   int64
 11  PAY_6                       9000 non-null   int64
 12  BILL_AMT1                   9000 non-null   int64
 13  BILL_AMT2                   9000 non-null   int64
 14  BILL_AMT

In [3]:
def preprocess_data(df):
    df.rename(columns={"default payment next month": "default"}, inplace=True)
    df.drop(columns=["ID"], inplace=True)
    df.dropna(inplace=True)
    df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4
    df = df.loc[df["EDUCATION"] !=0]
    df = df.loc[df["MARRIAGE"] !=0]
    return df
df1_ = preprocess_data(df1)
df2_ = preprocess_data(df2)
df2_['SEX'].unique()

array([2, 1])

In [4]:
X_train = df1_.drop(columns=["default"])
y_train = df1_["default"]
X_test = df2_.drop(columns=["default"])
y_test = df2_["default"]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(dtype=int), categorical_features)],
    remainder="passthrough",
)

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)
print("Precisión:", pipeline.score(X_test, y_test))

Precisión: 0.8250361955674351


In [6]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "classifier__n_estimators": [200, 250],
    "classifier__max_depth": [10, None],
    "classifier__min_samples_split": [10],
    "classifier__min_samples_leaf": [4],
    "classifier__max_features": ["auto", "sqrt"],
    "classifier__bootstrap": [True, False],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True,
    verbose=True,
)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
42 fits failed with the following error:
Traceback (most recent call last):
  File "d:\ever\PhD\Predictiva\2024-2-LAB-01-prediccion-del-default-usando-rf-Ever708ch\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\ever\PhD\Predictiva\2024-2-LAB-01-prediccion-del-default-usando-rf-Ever708ch\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\ever\PhD\Predictiva\2024-2-LAB-01-prediccion-del-default-usando-rf-Ever708ch\.venv\lib\site-packages\sklearn\pipeline.py", line 660, in fit
 

In [18]:
import joblib
import os
import pickle
import gzip

os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid_search, f)

In [19]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
)
import json

y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

metrics = [
    {
        "type": "metrics",
        "dataset": "train",
        "precision": float(precision_score(y_train, y_train_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_train, y_train_pred)),
        "recall": float(recall_score(y_train, y_train_pred)),
        "f1_score": float(f1_score(y_train, y_train_pred)),
    },
    {
        "type": "metrics",
        "dataset": "test",
        "precision": float(precision_score(y_test, y_test_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_test, y_test_pred)),
        "recall": float(recall_score(y_test, y_test_pred)),
        "f1_score": float(f1_score(y_test, y_test_pred)),
    },
]


In [29]:
from sklearn.metrics import confusion_matrix
import os

train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

confusion_matrices = [
    {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {
            "predicted_0": int(train_cm[0, 0]),
            "predicted_1": int(train_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(train_cm[1, 0]),
            "predicted_1": int(train_cm[1, 1]),
        },
    },
    {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {
            "predicted_0": int(test_cm[0, 0]),
            "predicted_1": int(test_cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(test_cm[1, 0]),
            "predicted_1": int(test_cm[1, 1]),
        },
    },
]

output_file = "../files/output/metrics.json"
os.makedirs("../files/output", exist_ok=True)

output_data = metrics + confusion_matrices

with open(output_file, "w") as f:
    for item in output_data:
        f.write(str(item).replace("'", '"') + "\n")


In [30]:
# def make_train_test_split(train_data, test_data, target_column):

#     # Separar características (X) y etiquetas (y) del conjunto de entrenamiento
#     X_train = train_data.drop(columns=[target_column])
#     y_train = train_data[target_column]
    
#     # Separar características (X) y etiquetas (y) del conjunto de prueba
#     X_test = test_data.drop(columns=[target_column])
#     y_test = test_data[target_column]
    
#     return X_train, X_test, y_train, y_test

# # Llamar a la función con tus datos
# X_train, X_test, y_train, y_test = make_train_test_split(
#     train_data=df1_,
#     test_data=df2_,
#     target_column='default'
# )

In [31]:
# def make_pipeline(estimator):

#     from sklearn.compose import ColumnTransformer
#     from sklearn.feature_selection import SelectKBest, f_classif
#     from sklearn.pipeline import Pipeline
#     from sklearn.preprocessing import OneHotEncoder, StandardScaler
    
#     categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
    
#     transformer = ColumnTransformer(
#         transformers=[
#            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
          
#         ],
#         remainder='passthrough',
#     )

#     selectkbest = SelectKBest(score_func=f_classif)

#     pipeline = Pipeline(
#         steps=[
#             ("tranformer", transformer),
#             ("selectkbest", selectkbest),
#             ("estimator", estimator),
#         ],
#         verbose=False,
#     )

#     return pipeline

In [32]:
# def make_grid_search(estimator, param_grid, cv=10):

#     from sklearn.model_selection import GridSearchCV

#     grid_search = GridSearchCV(
#         estimator=estimator,
#         param_grid=param_grid,
#         cv=cv,
#         scoring='balanced_accuracy',
#         n_jobs=-1,
#         refit=True,
#         verbose=True,
#     )

#     return grid_search

In [33]:
# def save_estimator(estimator):
#     import os
#     import gzip
#     import pickle

#     # Crear la carpeta si no existe
#     os.makedirs("../files/models", exist_ok=True)

#     # Guardar el modelo con compresión gzip
#     with gzip.open("../files/models/model.pkl.gz", "wb") as file:
#         pickle.dump(estimator, file)


In [34]:
# def load_estimator():
#     import os
#     import gzip
#     import pickle

#     # Verificar si el archivo existe
#     if not os.path.exists("../files/models/model.pkl.gz"):
#         return None

#     # Cargar el modelo desde un archivo comprimido
#     with gzip.open("../files/models/model.pkl.gz", "rb") as file:
#         estimator = pickle.load(file)

#     return estimator


In [35]:
# def train_RandomForestClassifier():

#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.metrics import mean_absolute_error

#     # data, target = load_data()

#     X_train, X_test, y_train, y_test = make_train_test_split(
#             train_data=df1_,
#             test_data=df2_,
#             target_column='default'
#     )

#     pipeline = make_pipeline(
#         estimator=RandomForestClassifier(random_state=42),
#     )

#     estimator = make_grid_search(
#         estimator=pipeline,
#         param_grid={
#                 'estimator__n_estimators': [500],
#                 'estimator__max_depth': [250],
#                 'estimator__min_samples_split': [5],
#                 'estimator__min_samples_leaf': [2],
                
#         },
#         cv=10,
#     )

#     estimator.fit(X_train, y_train)
#     print(estimator.fit(X_train, y_train))
#     best_estimator = load_estimator()
    
#     if best_estimator is not None:

#         saved_mae = mean_absolute_error(
#             y_true=y_test, y_pred=best_estimator.predict(X_test)
#         )

#         current_mae = mean_absolute_error(
#             y_true=y_test, y_pred=estimator.predict(X_test)
#         )

#         if saved_mae < current_mae:
#             estimator = best_estimator

#     save_estimator(estimator)


# train_RandomForestClassifier()

In [36]:
# # Ver todas las combinaciones evaluadas por GridSearchCV
# print("Resultados de GridSearchCV:")
# for params, mean_score in zip(estimator.cv_results_['params'], estimator.cv_results_['mean_test_score']):
#     print(f"Parámetros: {params}, Precisión balanceada: {mean_score}")


In [37]:
# import os
# import json
# from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix

# def save_metrics_and_confusion_matrices(estimator, X_train, y_train, X_test, y_test):
#     """
#     Genera las métricas y matrices de confusión y las guarda en 'files/output/metrics.json'.
#     """
#     if estimator is None:
#         raise ValueError("El modelo no se pudo cargar. Asegúrate de haber guardado el modelo correctamente.")
    
#     # Predicciones
#     y_train_pred = estimator.predict(X_train)
#     y_test_pred = estimator.predict(X_test)

# # Calcular métricas para la clase minoritaria (clase 1)
#     metrics_train = {
#     "type": "metrics",
#     "dataset": "train",
#     "precision": precision_score(y_train, y_train_pred, average='binary', pos_label=0),
#     "balanced_accuracy": balanced_accuracy_score(y_train, y_train_pred),
#     "recall": recall_score(y_train, y_train_pred, average='binary', pos_label=0),
#     "f1_score": f1_score(y_train, y_train_pred, average='binary', pos_label=0)
#     }

#     metrics_test = {
#     "type": "metrics",
#     "dataset": "test",
#     "precision": precision_score(y_test, y_test_pred, average='binary', pos_label=0),
#     "balanced_accuracy": balanced_accuracy_score(y_test, y_test_pred),
#     "recall": recall_score(y_test, y_test_pred, average='binary', pos_label=0),
#     "f1_score": f1_score(y_test, y_test_pred, average='binary', pos_label=0)
#     }

#     # Calcular matrices de confusión
#     def cm_to_dict(cm, dataset_name):
#         return {
#             "type": "cm_matrix",
#             "dataset": dataset_name,
#             "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
#             "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
#         }
    
#     cm_train = confusion_matrix(y_train, y_train_pred)
#     cm_test = confusion_matrix(y_test, y_test_pred)
    
#     cm_train_dict = {
#         "type": "cm_matrix",
#         "dataset": "train",
#         "true_0": {"predicted_0": int(cm_train[0, 0]), "predicted_1": int(cm_train[0, 1])},
#         "true_1": {"predicted_0": int(cm_train[1, 0]), "predicted_1": int(cm_train[1, 1])},
#     }
#     cm_test_dict = {
#         "type": "cm_matrix",
#         "dataset": "test",
#         "true_0": {"predicted_0": int(cm_test[0, 0]), "predicted_1": int(cm_test[0, 1])},
#         "true_1": {"predicted_0": int(cm_test[1, 0]), "predicted_1": int(cm_test[1, 1])},
#     }

#     # Guardar métricas y matrices en un archivo JSON
#     output_directory = os.path.abspath("../files/output")
#     output_file = "metrics.json"
#     output_path = os.path.join(output_directory, output_file)

#     if not os.path.exists(output_directory):
#         os.makedirs(output_directory, exist_ok=True)

#     try:
#         with open(output_path, "w", encoding="utf-8") as file:
#             file.write(json.dumps(metrics_train) + "\n")
#             file.write(json.dumps(metrics_test) + "\n")
#             file.write(json.dumps(cm_train_dict) + "\n")
#             file.write(json.dumps(cm_test_dict) + "\n")
#         print(f"Métricas y matrices de confusión guardadas correctamente en: {output_path}")
#     except Exception as e:
#         print(f"Error al guardar las métricas y matrices de confusión: {e}")
    
#     # Imprimir el reporte de clasificación para verificación
#     print("Reporte de clasificación (entrenamiento):")
#     print(classification_report(y_train, y_train_pred))
#     print("\nReporte de clasificación (prueba):")
#     print(classification_report(y_test, y_test_pred))


# # Ejemplo de uso
# estimator = load_estimator()
# save_metrics_and_confusion_matrices(estimator, X_train, y_train, X_test, y_test)