In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import os
import pickle
import gzip
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.svm import SVC


In [3]:
def load_data():
    
    train_data = pd.read_csv('../files/input/train_data.csv.zip', compression='zip')
    test_data = pd.read_csv('../files/input/test_data.csv.zip', compression='zip')

    return train_data, test_data

train_data, test_data = load_data()

In [4]:
def clean_data(data):
    data = data.copy()
    data.rename(columns = {'default payment next month':'default'}, inplace = True)
    data.dropna(inplace=True)

    data["EDUCATION"] = data["EDUCATION"].apply(lambda x: 4 if x not in [1,2,3,4] else x)
    data.drop(columns=["ID"], inplace=True)

    return data

In [5]:
def make_train_test_split(train_data, test_data):
    
    x_train = train_data.drop(columns=['default'])
    y_train = train_data['default']
    x_test = test_data.drop(columns=['default'])
    y_test = test_data['default']
    return x_train, x_test, y_train, y_test

In [6]:
def make_pipeline(estimator):

    cat = ["SEX", "MARRIAGE", "EDUCATION"]
    transformer = ColumnTransformer(
        transformers=[
            ("ohe", OneHotEncoder(dtype="int"), cat),
        ],
        remainder="passthrough",
    )

    selectkbest = SelectKBest(score_func=f_classif)

    pipeline = Pipeline(
        steps=[
            ("tranformer", transformer),
            ("pca", PCA(n_components=None)),
            ("scaler", StandardScaler()),
            ("selectkbest", selectkbest),
            ("estimator", estimator),
        ],
        verbose=False,
    )

    return pipeline

In [7]:
def make_grid_search(estimator, param_grid, cv=10):

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring="balanced_accuracy",
        n_jobs=-1,
    )

    return grid_search

In [8]:
def save_estimator(estimator):

    model_dir = "../files/models/"
    model_name = os.path.join(model_dir, "model.pkl.gz")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Guardar nuevo modelo
    with gzip.open(model_name, "wb") as f:
        pickle.dump(estimator, f)

In [9]:
def load_estimator():

    if not os.path.exists("../files/models/model.pkl.gz"):
        return None
    with gzip.open("../files/models/model.pkl.gz", "rb") as file:
        estimator = pickle.load(file)

    return estimator

In [10]:
def train_estimator(estimator):

    from sklearn.metrics import balanced_accuracy_score

    train_data, test_data = load_data()
    train_data = clean_data(train_data)
    test_data = clean_data(test_data)

    x_train, x_test, y_train, y_test = make_train_test_split(train_data, test_data)

    estimator.fit(x_train, y_train)

    best_estimator = load_estimator()
    if best_estimator is not None:

        saved_bal_acc = balanced_accuracy_score(
            y_true=y_test, y_pred=best_estimator.predict(x_test)
        )

        current_bal_acc = balanced_accuracy_score(
            y_true=y_test, y_pred=estimator.predict(x_test)
        )

        if saved_bal_acc > current_bal_acc:
            estimator = best_estimator

    save_estimator(estimator)
    

In [None]:
def train_svm():
    pipeline = make_pipeline(
        estimator=SVC(),
    )

    param_grid = {
        #'selectkbest__score_func': [f_classif, mutual_info_classif],
        'selectkbest__k': ["all"],
        'estimator__kernel': ['rbf'],
        'estimator__C': [10],
        'estimator__gamma': ['scale'],  # only used for rbf
        'estimator__class_weight': [None]
        }


    estimator = make_grid_search(
        estimator=pipeline,
        param_grid=param_grid,
        cv=,
    )

    train_estimator(estimator)


train_svm()


In [22]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

train_data, test_data = load_data()
train_data = clean_data(train_data)
test_data = clean_data(test_data)

x_train, x_test, y_train, y_test = make_train_test_split(train_data, test_data)

param_grid = {
    'selectkbest__k': [15, 20, 25, 30, 'all'],
    'estimator__kernel': ['rbf','sigmoid'],
    'estimator__C': [0.1,1,10],
    'estimator__gamma': ["scale"],
    'estimator__degree': [2, 3, 4, 5]
}

pipeline = make_pipeline(
    SVC()
)

estimator = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=2
)

# ðŸ”¥ You MUST fit before accessing best_params_
estimator.fit(x_train, y_train)

print(estimator.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [12]:
def eval_metrics(
    y_train_true,
    y_test_true,
    y_train_pred,
    y_test_pred,
):

    from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix

    accuracy_train = round(accuracy_score(y_train_true, y_train_pred), 4)
    accuracy_test = round(accuracy_score(y_test_true, y_test_pred), 4)
    balanced_accuracy_train = round(balanced_accuracy_score(y_train_true, y_train_pred), 4)
    balanced_accuracy_test = round(balanced_accuracy_score(y_test_true, y_test_pred), 4)

    recall_train = round(recall_score(y_train_true, y_train_pred), 4)
    recall_test = round(recall_score(y_test_true, y_test_pred), 4)
    f1_train = round(f1_score(y_train_true, y_train_pred), 4)
    f1_test = round(f1_score(y_test_true, y_test_pred), 4)

    confusion_matrix_train = confusion_matrix(y_train_true, y_train_pred)
    confusion_matrix_test = confusion_matrix(y_test_true, y_test_pred)

    metrics_train = {
        "type": "metrics",
        "dataset": "train",
        "precision": accuracy_train,
        "balanced_accuracy": balanced_accuracy_train,
        "recall": recall_train,
        "f1_score": f1_train,
    }

    metrics_test = {
        "type": "metrics",
        "dataset": "test",
        "precision": accuracy_test,
        "balanced_accuracy": balanced_accuracy_test,
        "recall": recall_test,
        "f1_score": f1_test,
    }

    cm_matrix_train = {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0" : {
            "predicted_0": int(confusion_matrix_train[0][0]),
            "predicted_1": int(confusion_matrix_train[0][1]),
        },
        "true_1" : {
            "predicted_0": int(confusion_matrix_train[1][0]),
            "predicted_1": int(confusion_matrix_train[1][1]),
        }
    }

    cm_matrix_test = {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0" : {
            "predicted_0": int(confusion_matrix_test[0][0]),
            "predicted_1": int(confusion_matrix_test[0][1]),
        },
        "true_1" : {
            "predicted_0": int(confusion_matrix_test[1][0]),
            "predicted_1": int(confusion_matrix_test[1][1]),
        }
    }

    return metrics_train, metrics_test, cm_matrix_train, cm_matrix_test


In [13]:
def report(metrics_train, metrics_test, cm_matrix_train, cm_matrix_test):
    import json

    if not os.path.exists("../files/output/"):
        os.makedirs("../files/output/")
    # create the json file if it doesn't exist

    with open("../files/output/metrics.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(metrics_train) + "\n")
        f.write(json.dumps(metrics_test) + "\n")
        f.write(json.dumps(cm_matrix_train) + "\n")
        f.write(json.dumps(cm_matrix_test) + "\n")

train_data, test_data = load_data()
train_data = clean_data(train_data)
test_data = clean_data(test_data)

x_train, x_test, y_train_true, y_test_true = make_train_test_split(train_data, test_data)

metrics_train, metrics_test, cm_matrix_train, cm_matrix_test = eval_metrics(y_train_true, y_test_true, load_estimator().predict(x_train), load_estimator().predict(x_test))
print(metrics_train)
print(metrics_test)
print(cm_matrix_train)
print(cm_matrix_test)

report(metrics_train, metrics_test, cm_matrix_train, cm_matrix_test)

    

{'type': 'metrics', 'dataset': 'train', 'precision': 0.8497, 'balanced_accuracy': 0.7049, 'recall': 0.4417, 'f1_score': 0.5695}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.8234, 'balanced_accuracy': 0.6521, 'recall': 0.3546, 'f1_score': 0.4601}
{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 15755, 'predicted_1': 518}, 'true_1': {'predicted_0': 2639, 'predicted_1': 2088}}
{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {'predicted_0': 6734, 'predicted_1': 357}, 'true_1': {'predicted_0': 1232, 'predicted_1': 677}}


In [14]:
train_data, test_data = load_data()
train_data = clean_data(train_data)
test_data = clean_data(test_data)
x_train, x_test, y_train, y_test = make_train_test_split(train_data, test_data)


print(load_estimator().score(x_test,y_test))

0.652145213326804
