In [23]:
import pandas as pd
import gzip
import json
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

In [24]:
TEST_DATA_PATH = '../files/input/test_data.csv.zip'
TRAIN_DATA_PATH = '../files/input/train_data.csv.zip'
MODEL_PATH = '../files/models/model.pkl.gz'
METRICS_PATH = '../files/output/metrics.json'

In [25]:
test_data = pd.read_csv(
	TEST_DATA_PATH,
	index_col=False,
	compression='zip'
)

train_data = pd.read_csv(
	TRAIN_DATA_PATH,
	index_col=False,
	compression='zip'
)

In [26]:
test_data = test_data.rename(columns={"default payment next month": "default"})
train_data = train_data.rename(columns={"default payment next month": "default"})
test_data = test_data.drop(columns=["ID"])
train_data = train_data.drop(columns=["ID"])

In [27]:
train_data = train_data.loc[train_data["MARRIAGE"] != 0]
train_data = train_data.loc[train_data["EDUCATION"] != 0]

test_data = test_data.loc[test_data["MARRIAGE"] != 0]
test_data = test_data.loc[test_data["EDUCATION"] != 0]
test_data["EDUCATION"] = test_data["EDUCATION"].apply(lambda x: 4 if x > 4 else x)
train_data["EDUCATION"] = train_data["EDUCATION"].apply(lambda x: 4 if x > 4 else x)

In [28]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

x_train = train_data.drop(columns=["default"])
y_train = train_data["default"]

x_test = test_data.drop(columns=["default"])
y_test = test_data["default"]

In [29]:

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = list(set(x_train.columns) - set(categorical_features))

preprocessor = ColumnTransformer(
	transformers=[
		("num", StandardScaler(with_mean=True,with_std=True), numerical_features),
		("cat", OneHotEncoder(), categorical_features)
	],
	remainder="passthrough"
)

k_best = SelectKBest(f_classif)
pca = PCA()

model = MLPClassifier(random_state=42)

pipeline = Pipeline(
	steps=[
		("preprocessor", preprocessor),
		("pca", pca),
		("k_best", k_best),
		("model", model)
	]
)

In [30]:
pipeline

In [31]:
param_grid = {
    "k_best__k": [20],
    "model__max_iter": [1000],
    "model__random_state": [17],
    "model__batch_size": ['auto'],
    "model__hidden_layer_sizes": [(50, 30, 40, 60)],
    "model__learning_rate": ['invscaling', 'adaptive', 'constant'],
    "model__learning_rate_init": [0.001],
    "model__alpha": [0.26],
}



grid_search = GridSearchCV(
	pipeline,
	param_grid=param_grid,
	cv=10,
	scoring="balanced_accuracy",
	n_jobs=-1,
	refit=True,
	verbose=1
)

grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


In [32]:
with gzip.open(MODEL_PATH, 'wb') as f:
	pickle.dump(grid_search, f)

FileNotFoundError: [Errno 2] No such file or directory: '../files/models/model.pkl.gz'

In [33]:
metrics = {}

y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)

In [34]:
y_train_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(20953,))

In [35]:
metrics['train'] = {
    'type': 'metrics',
    'dataset': 'train',
	'precision': precision_score(y_train, y_train_pred, zero_division=0),
	'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
	'recall': recall_score(y_train, y_train_pred, zero_division=0),
	'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
}

metrics['test'] = {
    'type': 'metrics',
    'dataset': 'test',
	'precision': precision_score(y_test, y_test_pred, zero_division=0),
	'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
	'recall': recall_score(y_test, y_test_pred, zero_division=0),
	'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
}


# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}

cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

metrics['train_cm'] = {
    'type': 'cm_matrix',
	'dataset': 'train',
	'true_0': {"predicted_0": int(cm_train[0][0]), "predicted_1": int(cm_train[0][1])},
	'true_1': {"predicted_0": int(cm_train[1][0]), "predicted_1": int(cm_train[1][1])}
}

metrics['test_cm'] = {
    'type': 'cm_matrix',
	'dataset': 'test',
	'true_0': {"predicted_0": int(cm_test[0][0]), "predicted_1": int(cm_test[0][1])},
	'true_1': {"predicted_0": int(cm_test[1][0]), "predicted_1": int(cm_test[1][1])}
}


with open(METRICS_PATH, 'w') as f:
	f.write(json.dumps(metrics['train'])+'\n')
	f.write(json.dumps(metrics['test'])+'\n')
	f.write(json.dumps(metrics['train_cm'])+'\n')
	f.write(json.dumps(metrics['test_cm'])+'\n')