In [None]:
import pandas as pd
import numpy as np
import gzip
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
import json


def load_data():
    train_df = pd.read_csv("./files/input/train_data.csv.zip", index_col=False, compression="zip")
    test_df = pd.read_csv("./files/input/test_data.csv.zip", index_col=False, compression="zip")
    return train_df, test_df

def clean_data(df):
    df = df.rename(columns={"default payment next month": "default"})
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])
    df = df.dropna()
    # Group EDUCATION > 4 into category 4 ("others")
    if 'EDUCATION' in df.columns:
        df['EDUCATION'] = df['EDUCATION'].apply(lambda x: x if x <= 4 else 4)
    return df

def create_pipeline():
    # Define categorical columns to one-hot encode
    categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
    # Only keep columns that actually exist in the dataframe (robustness)
    # The ColumnTransformer will be built later in main where we have the dataframe columns.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', ohe, categorical_cols)
        ],
        remainder='passthrough'  # keep numeric columns as-is
    )

    pipeline = Pipeline([
        ('preproc', preprocessor),
        ('pca', PCA()),               # uses all components by default unless tuned
        ('scaler', MinMaxScaler()),   # scale to [0,1]
        ('selectkbest', SelectKBest(score_func=f_classif)),  # k will be tuned
        ('mlp', MLPClassifier(max_iter=500))
    ])
    return pipeline

def optimize_hyperparameters(pipeline, X_train, y_train):
    # Parameter grid
    param_grid = {
        'pca__n_components': [5, 10, 15],
        'selectkbest__k': [5, 10, 15],
        'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'mlp__alpha': [0.0001, 0.001, 0.01]
    }
    # If some parameter combinations are invalid (e.g., k > n_components) don't raise — mark as NaN
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1, error_score=np.nan)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

def save_model(model, file_path):
    with gzip.open(file_path, 'wb') as f:
        pickle.dump(model, f)

def calculate_metrics(model, X, y, dataset_type):
    y_pred = model.predict(X)
    precision = precision_score(y, y_pred, zero_division=0)
    balanced_accuracy = balanced_accuracy_score(y, y_pred)
    recall = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    cm = confusion_matrix(y, y_pred)
    metrics = {
        'dataset': dataset_type,
        'precision': float(precision),
        'balanced_accuracy': float(balanced_accuracy),
        'recall': float(recall),
        'f1_score': float(f1)
    }
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_type,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])}
    }
    return metrics, cm_dict

def save_metrics(metrics_list, file_path):
    with open(file_path, 'w') as f:
        for metrics in metrics_list:
            f.write(json.dumps(metrics) + '\n')

def main():
    train_df, test_df = load_data()
    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    # Prepare X/y
    X_train, y_train = train_df.drop(columns=["default"]), train_df["default"]
    X_test, y_test = test_df.drop(columns=["default"]), test_df["default"]

    # Build pipeline (ColumnTransformer in pipeline expects the categorical columns to exist;
    # if some of the categorical columns are missing the transformer will raise — ensure your dataset includes them)
    pipeline = create_pipeline()

    best_model = optimize_hyperparameters(pipeline, X_train, y_train)

    save_model(best_model, './files/models/model.pkl.gz')

    metrics_list = []
    train_metrics, train_cm = calculate_metrics(best_model, X_train, y_train, 'train')
    test_metrics, test_cm = calculate_metrics(best_model, X_test, y_test, 'test')
    metrics_list.append(train_metrics)
    metrics_list.append(test_metrics)
    metrics_list.append(train_cm)
    metrics_list.append(test_cm)
    save_metrics(metrics_list, './files/output/metrics.json')

if __name__ == "__main__":
    main()


In [1]:
import pandas as pd
import numpy as np
import gzip
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
import json
import os

In [2]:
def load_data():
    train_df = pd.read_csv("../files/input/train_data.csv.zip", index_col=False, compression="zip")
    test_df = pd.read_csv("../files/input/test_data.csv.zip", index_col=False, compression="zip")
    return train_df, test_df

load_data()

(          ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  \
 0      10748     310000    1          3         1   32      0      0      0   
 1      12574      10000    2          3         1   49     -1     -1     -2   
 2      29677      50000    1          2         1   28     -1     -1     -1   
 3       8857      80000    2          3         1   52      2      2      3   
 4      21099     270000    1          1         2   34      1      2      0   
 ...      ...        ...  ...        ...       ...  ...    ...    ...    ...   
 20995  27956     140000    2          2         1   27      2     -1     -1   
 20996  27108     130000    1          2         2   41      0      0      0   
 20997     26      50000    1          3         2   23      0      0      0   
 20998  14778      90000    2          3         2   25      0      0      0   
 20999  20634     120000    1          2         2   31      0      0      0   
 
        PAY_4  ...  BILL_AMT4  BILL_AM

In [3]:
def data_clean(data):
    df = data.copy()
    df.rename(columns={"default payment next month": "default"}, inplace=True)
    df.drop(columns="ID", inplace=True)
    df = df[(df["EDUCATION"]!=0) & (df["MARRIAGE"]!=0)]
    df["EDUCATION"] = df["EDUCATION"].apply(lambda x: 4 if x>4 else x)
    return df

# load_data() defined elsewhere returns (train_df, test_df) without arguments
train_raw, test_raw = load_data()
df_train = data_clean(train_raw)
df_test = data_clean(test_raw)

df_train, df_test

(       LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
 0         310000    1          3         1   32      0      0      0      0   
 1          10000    2          3         1   49     -1     -1     -2     -1   
 2          50000    1          2         1   28     -1     -1     -1      0   
 3          80000    2          3         1   52      2      2      3      3   
 4         270000    1          1         2   34      1      2      0      0   
 ...          ...  ...        ...       ...  ...    ...    ...    ...    ...   
 20995     140000    2          2         1   27      2     -1     -1     -1   
 20996     130000    1          2         2   41      0      0      0      0   
 20997      50000    1          3         2   23      0      0      0      0   
 20998      90000    2          3         2   25      0      0      0      0   
 20999     120000    1          2         2   31      0      0      0      0   
 
        PAY_5  ...  BILL_AMT4  BILL_AM

In [4]:
def split_data(data_train, data_test):
    x_train = data_train.drop(columns="default")
    y_train = data_train["default"]
    x_test = data_test.drop(columns="default")
    y_test = data_test["default"]
    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = split_data(df_train, df_test)

In [8]:
def create_pipeline():
    categorical_cols = ['EDUCATION', 'SEX', 'MARRIAGE']
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    preprocessor = ColumnTransformer(
        transformers=[('cat', ohe, categorical_cols)],
        remainder='passthrough'  # keep numeric columns as-is
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('scaler', MinMaxScaler()),  # scale numeric + one-hot outputs
        ('feature_selection', SelectKBest(score_func=f_classif)),  # tune k via GridSearchCV
        ('classifier', MLPClassifier(max_iter=500))
    ])
    return pipeline

pipeline = create_pipeline()

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [9]:
def make_grid_search(pipeline):

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid = {
            'feature_selection__k':range(1,11),
            'selectkbest__k': [5, 10, 15],
            'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'mlp__alpha': [0.0001, 0.001, 0.01]
        },
        cv=10,
        scoring='balanced_accuracy',
        n_jobs=-1,
        verbose=2
    )

    return grid_search

grid_search = make_grid_search(pipeline)

grid_search

NameError: name 'pipeline' is not defined

Fitting 10 folds for each of 270 candidates, totalling 2700 fits


ValueError: Invalid parameter 'mlp' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder=MinMaxScaler(),
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['EDUCATION', 'SEX',
                                                   'MARRIAGE'])])),
                ('feature_selection', SelectKBest()),
                ('classifier', 'mlp', MLPClassifier(max_iter=500))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [19]:
def save_model(estimator, path):
    os.makedirs(os.path.dirname(path), exist_ok=True) 
    with gzip.open(path, "wb") as f:
        pickle.dump(estimator, f)