# Avaliação - Regressão usando MLP
## Inferir custos médicos do plano de saúde a partir do arquivo insurance.csv

### Carregamento dos dados

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [None]:
# Read the insurance dataset from the working directory and preview its first rows.
csv_path = Path('insurance.csv')
insurance = pd.read_csv(csv_path)
insurance.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw dataset.
insurance.info()

### Preparação dos dados

In [None]:
# Helper that reverses pandas.get_dummies().
def undummify(df, prefix_sep="_"):
    """Collapse one-hot encoded columns back into one categorical column each.

    Every column whose name contains ``prefix_sep`` is treated as a dummy
    column named ``<feature><prefix_sep><category>``. Columns without the
    separator are copied through unchanged.

    Args:
        df: DataFrame with string column names, typically the output of
            ``pandas.get_dummies``.
        prefix_sep: Separator between the feature name and the category.

    Returns:
        A DataFrame with one column per original feature, ordered by the
        first appearance of each feature in ``df``. For collapsed features
        the value of each row is the category whose dummy column holds the
        row maximum.
    """
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            prefix = col + prefix_sep
            # Match on the exact "<feature><sep>" prefix. A substring match
            # (DataFrame.filter(like=col)) would also pick up unrelated dummy
            # groups whose names merely contain `col`.
            dummy_columns = [name for name in df.columns if name.startswith(prefix)]
            undummified = (
                df[dummy_columns]
                .idxmax(axis=1)
                .apply(lambda name: name[len(prefix):])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [None]:
# One-hot encode the three categorical columns and preview the result.
categorical_columns = ['sex', 'region', 'smoker']
one_hot_encoding = pd.get_dummies(insurance[categorical_columns])
one_hot_encoding.head()

In [None]:
# Remove the original categorical columns; their one-hot encoded version is kept in `one_hot_encoding`.
insurance = insurance.drop(['sex', 'region', 'smoker'], axis = 1)

In [None]:
# Append the one-hot encoded columns to the remaining columns, then list the resulting columns and dtypes.
insurance = pd.concat([insurance, one_hot_encoding], axis=1)
insurance.info()

In [None]:
# Correlation of every column with the target `charges`, sorted in ascending order.
insurance.corr()['charges'].sort_values()

In [None]:
# Hold out 20% of the rows as the test set. The split is seeded (same seed as
# the models below) so that the reported metrics are reproducible across runs.
train_set, test_set = train_test_split(insurance, test_size=0.2, random_state=42)
print(len(train_set), "train +", len(test_set), "test")

In [None]:
# Target vectors: the `charges` column of each split.
y_train, y_test = train_set['charges'], test_set['charges']

In [None]:
# Feature matrices: every column except the target `charges`.
x_train = train_set.drop(columns=['charges'])
x_test = test_set.drop(columns=['charges'])
x_train.head()

### Treinamento e avaliação

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def plot_learning_curves_random_forest(X, y, max_depth):
    """Plot train/validation RMSE of a random forest as tree depth grows.

    The data is split 80/20 into training and validation parts, and one
    500-tree forest is fitted for each depth in ``range(1, max_depth)``.
    Note that ``max_depth`` itself is excluded.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        max_depth: Exclusive upper bound of the tree depths to evaluate.

    Returns:
        None. The curves are drawn on the current matplotlib axes.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    depths = list(range(1, max_depth))
    train_errors, val_errors = [], []
    for depth in depths:  # `depth`, not `max`, to avoid shadowing the builtin
        model = RandomForestRegressor(n_estimators=500, random_state=42, max_depth=depth, n_jobs=-1)
        model.fit(X_train, y_train)
        # scikit-learn metrics take (y_true, y_pred) in this order.
        train_errors.append(mean_squared_error(y_train, model.predict(X_train)))
        val_errors.append(mean_squared_error(y_val, model.predict(X_val)))
    plt.xlabel("Profundidade máxima das árvore")
    plt.ylabel("RMSE")
    # Plot against the actual depths; without explicit x values matplotlib
    # would use a 0-based index and shift every point one depth to the left.
    plt.plot(depths, np.sqrt(train_errors), "r-", linewidth=2, label="Conjunto de treinamento")
    plt.plot(depths, np.sqrt(val_errors), "b-", linewidth=2, label="Conjunto de validação")
    plt.legend()

# Draw the depth curves over the whole dataset (train and test parts re-joined), with 10 as the depth bound.
plot_learning_curves_random_forest(pd.concat([x_train, x_test]), pd.concat([y_train, y_test]), 10)

In [None]:
# Fit a depth-4, 500-tree random forest on the training split and keep its
# predictions for both splits.
regressor = RandomForestRegressor(
    n_estimators=500,
    max_depth=4,
    random_state=42,
    n_jobs=-1,
)
regressor.fit(x_train, y_train)
resultado_random_forest_train, resultado_random_forest_test = (
    regressor.predict(features) for features in (x_train, x_test)
)

In [None]:
# K-fold cross validation (3 folds) of the random forest over the full dataset.
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
resultados_mse_train = []
resultados_mae_train = []
resultados_r2_train = []
resultados_mse_test = []
resultados_mae_test = []
resultados_r2_test = []
x_full = pd.concat([x_train, x_test])
y_full = pd.concat([y_train, y_test])

for train_index, test_index in kf.split(x_full):
    x_train_fold, x_test_fold = x_full.iloc[train_index], x_full.iloc[test_index]
    y_train_fold, y_test_fold = y_full.iloc[train_index], y_full.iloc[test_index]

    # Use a per-fold estimator with the same hyperparameters as `regressor`.
    # Refitting `regressor` itself would leave the global model trained on the
    # last fold only.
    fold_regressor = RandomForestRegressor(n_estimators=500, max_depth=4, random_state=42, n_jobs=-1)
    fold_regressor.fit(x_train_fold, y_train_fold)
    resultado_random_forest_train_k = fold_regressor.predict(x_train_fold)
    resultado_random_forest_test_k = fold_regressor.predict(x_test_fold)

    # Per-fold metrics on the fold's training part ...
    resultados_mse_train.append(mean_squared_error(y_train_fold, resultado_random_forest_train_k))
    resultados_mae_train.append(mean_absolute_error(y_train_fold, resultado_random_forest_train_k))
    resultados_r2_train.append(r2_score(y_train_fold, resultado_random_forest_train_k))
    # ... and on its held-out part.
    resultados_mse_test.append(mean_squared_error(y_test_fold, resultado_random_forest_test_k))
    resultados_mae_test.append(mean_absolute_error(y_test_fold, resultado_random_forest_test_k))
    resultados_r2_test.append(r2_score(y_test_fold, resultado_random_forest_test_k))

    

In [None]:
from statistics import mean

# Metrics are always printed in the order MSE, MAE, R2.
report_metrics = (mean_squared_error, mean_absolute_error, r2_score)

# Single train/test split.
print('MSE, MAE E R2 NO TREINO - RANDOM FOREST')
for metric in report_metrics:
    print(metric(y_train, resultado_random_forest_train))

print('MSE, MAE E R2 NO TESTE - RANDOM FOREST')
for metric in report_metrics:
    print(metric(y_test, resultado_random_forest_test))

# Averages over the K-fold results.
print('MSE, MAE E R2 COM K_FOLD_CROSS_VALIDATION NO TREINO - RANDOM FOREST')
for fold_scores in (resultados_mse_train, resultados_mae_train, resultados_r2_train):
    print(mean(fold_scores))

print('MSE, MAE E R2 COM K_FOLD_CROSS_VALIDATION NO TESTE - RANDOM FOREST')
for fold_scores in (resultados_mse_test, resultados_mae_test, resultados_r2_test):
    print(mean(fold_scores))

### Rede Neural com TensorFlow e Keras

In [None]:
import tensorflow as tf
from tensorflow import keras

# MLP regressor: 12 hidden Dense(12, ELU, He-normal) layers, each followed by
# dropout, and a single linear output unit. Batch normalization sits between
# Dense and Dropout for every hidden layer except the last one.
N_HIDDEN_LAYERS = 12
N_UNITS = 12
DROPOUT_RATE = 0.10
BASE_SEED = 42

model = keras.models.Sequential()
for layer_idx in range(N_HIDDEN_LAYERS):
    # Only the first layer declares the input width, taken from the feature
    # matrix instead of being hard-coded.
    first_layer_kwargs = {"input_shape": [x_train.shape[1]]} if layer_idx == 0 else {}
    model.add(keras.layers.Dense(
        N_UNITS,
        activation="elu",
        # Distinct seed per layer: initializers built with the same seed yield
        # the same values, which would give every same-shaped layer identical
        # starting weights.
        kernel_initializer=keras.initializers.HeNormal(seed=BASE_SEED + layer_idx),
        **first_layer_kwargs,
    ))
    if layer_idx < N_HIDDEN_LAYERS - 1:
        model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(rate=DROPOUT_RATE))
model.add(keras.layers.Dense(1))

In [None]:
# Render the network architecture to the image file "model.png".
keras.utils.plot_model(model, to_file = "model.png")

In [None]:
# Once built, the model has to be compiled. Compilation is where, for example,
# the loss function, the optimizer and the metrics computed during training
# and validation are defined.
model.compile(optimizer="adam", loss="mae")

In [None]:
#learning scheduler
def exponential_decay(lr0, s):
    """Build an exponential learning-rate schedule.

    Args:
        lr0: Learning rate at epoch 0.
        s: Number of epochs over which the rate shrinks by a factor of 10.

    Returns:
        A function mapping an epoch number to ``lr0 * 0.1 ** (epoch / s)``.
    """
    def exponential_decay_fn(epoch):
        decades_elapsed = epoch / s
        return lr0 * 0.1**decades_elapsed
    return exponential_decay_fn

def power_decay(lr0, s, c=1):
    """Build a power-law learning-rate schedule.

    Args:
        lr0: Learning rate at epoch 0.
        s: Epoch scale; at ``epoch == s`` the base of the power equals 2.
        c: Exponent applied to ``1 + epoch / s``.

    Returns:
        A function mapping an epoch number to ``lr0 / (1 + epoch / s) ** c``.
    """
    def power_decay_fn(epoch):
        denominator = (1 + (epoch / s))**c
        return lr0 / denominator
    return power_decay_fn

# Instantiate both schedules (initial rate 0.1, scale 20 epochs). They are bound
# to new names so the factory functions `exponential_decay` / `power_decay`
# are not overwritten by their own return values and stay callable.
exponential_decay_fn = exponential_decay(0.1, 20)
power_decay_fn = power_decay(0.1, 20)
# The callback uses the power-law schedule; verbose=1 logs each rate update.
learning_rate_scheduler = keras.callbacks.LearningRateScheduler(power_decay_fn, verbose=1)

In [None]:
# K-fold cross validation (3 folds) of the MLP over the full dataset.
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
resultados_mse_train = []
resultados_mae_train = []
resultados_r2_train = []
resultados_mse_test = []
resultados_mae_test = []
resultados_r2_test = []
resultados_history = []
x_full = pd.concat([x_train, x_test])
y_full = pd.concat([y_train, y_test])

for train_index, test_index in kf.split(x_full):
    x_train_fold, x_test_fold = x_full.iloc[train_index], x_full.iloc[test_index]
    y_train_fold, y_test_fold = y_full.iloc[train_index], y_full.iloc[test_index]

    # Train a fresh copy of the architecture for every fold. Reusing `model`
    # would carry the weights over from one fold to the next, so later folds
    # would be validated on rows the network had already been trained on.
    fold_model = keras.models.clone_model(model)
    # clone_model does not copy the compile state; same settings as `model`.
    fold_model.compile(loss="mae", optimizer="adam")

    resultados_history.append(fold_model.fit(
        x_train_fold, y_train_fold, epochs=500,
        validation_data=(x_test_fold, y_test_fold),
        callbacks=[keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)]))
    resultado_mlp_train = fold_model.predict(x_train_fold)
    resultado_mlp_test = fold_model.predict(x_test_fold)

    # Per-fold metrics on the fold's training part ...
    resultados_mse_train.append(mean_squared_error(y_train_fold, resultado_mlp_train))
    resultados_mae_train.append(mean_absolute_error(y_train_fold, resultado_mlp_train))
    resultados_r2_train.append(r2_score(y_train_fold, resultado_mlp_train))
    # ... and on its held-out part.
    resultados_mse_test.append(mean_squared_error(y_test_fold, resultado_mlp_test))
    resultados_mae_test.append(mean_absolute_error(y_test_fold, resultado_mlp_test))
    resultados_r2_test.append(r2_score(y_test_fold, resultado_mlp_test))

    

In [None]:
# Early stopping (with restore_best_weights) selects the epoch by validation
# loss, so the validation data must not be the test set: otherwise the test
# metrics reported afterwards are optimistically biased. A validation part is
# carved out of the training split instead.
x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
history = model.fit(x_fit, y_fit, epochs=500, validation_data=(x_val, y_val),
callbacks=[keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)])
# Alternative: append `learning_rate_scheduler` to `callbacks` to train with a decaying learning rate.

In [None]:
from statistics import mean

# Predictions of the trained network on both splits.
resultado_mlp = model.predict(x_train)
resultado_mlp2 = model.predict(x_test)

# Metrics are always printed in the order MSE, MAE, R2.
report_metrics = (mean_squared_error, mean_absolute_error, r2_score)

print('MSE, MAE E R2 NO TREINO - MLP')
for metric in report_metrics:
    print(metric(y_train, resultado_mlp))

print('MSE, MAE E R2 NO TESTE - MLP')
for metric in report_metrics:
    print(metric(y_test, resultado_mlp2))

# Averages over the K-fold results.
print('MSE, MAE E R2 COM MLP COM K_FOLD_CROSS_VALIDATION NO TREINO - MLP')
for fold_scores in (resultados_mse_train, resultados_mae_train, resultados_r2_train):
    print(mean(fold_scores))

print('MSE, MAE E R2 COM MLP COM K_FOLD_CROSS_VALIDATION NO TESTE - MLP')
for fold_scores in (resultados_mse_test, resultados_mae_test, resultados_r2_test):
    print(mean(fold_scores))

In [None]:
# Learning curves can be drawn from the training history of the network.
# (If a learning-rate scheduler was used, drop its column first:
#  history.history.pop('lr'))
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(10, 6))
plt.title("Sem K-Fold")
plt.xlabel("Épocas")
plt.ylabel("MAE")
plt.grid(True)
plt.gca()
plt.show()

In [None]:
# Plot one learning-curve figure per cross-validation fold.
# Iterating over `resultados_history` directly supports any number of folds
# and avoids creating (and immediately closing) an unused subplot grid.
for split_idx, fold_history in enumerate(resultados_history):
    pd.DataFrame(fold_history.history).plot(figsize=(10, 6))
    plt.grid(True)
    plt.title(f"Com K-Fold, Split {split_idx}")
    plt.xlabel("Épocas")
    plt.ylabel("MAE")
    plt.show()

In [None]:
def build_model(n_hidden=12, n_neurons=12, learning_rate=3e-3, input_shape=(11,), optimizer="nadam"):
    """Build and compile a fully connected regression network.

    Args:
        n_hidden: Number of hidden Dense layers (ELU activation).
        n_neurons: Units per hidden layer.
        learning_rate: Learning rate applied to the optimizer.
        input_shape: Shape of one input sample.
        optimizer: Keras optimizer identifier (e.g. ``"nadam"``) or optimizer
            instance. An instance has its learning rate overwritten.

    Returns:
        A compiled ``keras.models.Sequential`` model with a single linear
        output unit and MSE loss.
    """
    model = keras.models.Sequential()
    # `input_shape` goes to the first layer added, which is the output layer
    # when n_hidden == 0.
    options = {"input_shape": input_shape}
    for _ in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="elu", **options))
        options = {}
    model.add(keras.layers.Dense(1, **options))
    # Resolve the identifier to an optimizer object so that `learning_rate`
    # is actually applied; compiling with the bare string would ignore it.
    resolved_optimizer = keras.optimizers.get(optimizer)
    resolved_optimizer.learning_rate = learning_rate
    model.compile(loss="mse", optimizer=resolved_optimizer)
    return model


In [None]:
from scikeras.wrappers import KerasRegressor

# Wrap `build_model` in a scikit-learn compatible regressor for the hyperparameter search.
# NOTE(review): the SciKeras `KerasRegressor` imported above is not used by the
# active line below, which takes the wrapper from `keras.wrappers.scikit_learn`.
# That legacy wrapper is presumably missing from recent TensorFlow releases --
# confirm against the installed version before switching to either one.
#keras_reg = KerasRegressor(build_model, n_hidden=12, n_neurons=12, learning_rate=3e-3, input_shape=[11])
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
#history = keras_reg.fit(x_train, y_train, epochs=300, validation_data=(x_test, y_test))

In [None]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Hyperparameter grid passed to `build_model`: 14 depths x 14 widths x 5 optimizers.
param_distribs = {
"n_hidden": np.arange(1, 15),
"n_neurons": np.arange(1, 15),
"optimizer": ["Adagrad", "RMSprop", "Adam", "Nadam", "Adamax"],
#"learning_rate": reciprocal(3e-4, 3e-2),
#"learning_rate": reciprocal(3e-4, 3e-2).rvs(1000).tolist(),
#"learning_rate": [0.001, 0.01, 0.1],
}
# NOTE(review): despite the `rnd_` name this is an exhaustive GridSearchCV
# (980 combinations x 3 folds); `reciprocal` and `RandomizedSearchCV` are
# imported but unused by the active code.
rnd_search_cv = GridSearchCV(keras_reg, param_distribs, cv=3, n_jobs=-1, scoring="r2")
# NOTE(review): the test split is passed as validation data for early stopping,
# so test rows influence when training stops -- consider a validation
# set carved from the training data instead.
rnd_search_cv.fit(x_train, y_train, epochs=500,
validation_data=(x_test, y_test),
callbacks=[keras.callbacks.EarlyStopping(patience=20, verbose=1, restore_best_weights=True)], verbose=0)

In [None]:
# Report the winning hyperparameters and their mean cross-validated score (R2, per `scoring` above).
print(f"Melhores parâmetros: {rnd_search_cv.best_params_}")
print(f"Melhor score: {rnd_search_cv.best_score_}")
# Replace `model` with the `.model` attribute of the best estimator.
# NOTE(review): which object `.model` holds depends on the wrapper class in use -- confirm it is the fitted network.
model = rnd_search_cv.best_estimator_.model

In [None]:
# Predictions of the best grid-search model on both splits. Each split is
# predicted exactly once; a second predict on x_test would only overwrite the
# training predictions without being used.
resultado_mlp = model.predict(x_train)
resultado_mlp2 = model.predict(x_test)

print('MSE, MAE E R2 NO TREINO - MLP GRID SEARCH')
print(mean_squared_error(y_train, resultado_mlp))
print(mean_absolute_error(y_train, resultado_mlp))
print(r2_score(y_train, resultado_mlp))

print('MSE, MAE E R2 NO TESTE - MLP GRID SEARCH')
print(mean_squared_error(y_test, resultado_mlp2))
print(mean_absolute_error(y_test, resultado_mlp2))
print(r2_score(y_test, resultado_mlp2))