In [112]:
import math
import pandas as pd
import numpy as np
from collections import Counter
from random import seed
import collections
import imblearn

# Machine learning models 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Model evaluation and hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from scipy import stats

# Constant features
from fast_ml.utilities import display_all
from fast_ml.feature_selection import get_constant_features

# Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Feature selection methods
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler


In [113]:
np.random.seed(21)
path_save = './Output/'
df = pd.read_csv(path_save + 'finaldataset_for_ML.csv', encoding='latin-1')
df2 = pd.read_csv(path_save + 'finaldataset_for_ML2.csv', encoding='latin-1')

In [114]:
"""
Target "0": projetos encerrados / terminados
Target "1: projetos anulados
"""
def create_target_column(row):
    if(row['Terminado'] == 1.0):
        return 0
    elif(row['Anulado'] == 1.0):
        return 1
    else:
        print("Existe um registo sem target")
        return None

In [115]:
df['Target'] = df.apply(lambda x: create_target_column(x), axis=1)
df2['Target'] = df2.apply(lambda x: create_target_column(x), axis=1)

In [116]:
clean_df = df.replace(np.nan, 0.0) #Replace nulls for 0
clean_df.replace([np.inf, -np.inf], 0.0, inplace=True) #Replace infinites for 0

clean_df2 = df2.replace(np.nan, 0.0) #Replace nulls for 0
clean_df2.replace([np.inf, -np.inf], 0.0, inplace=True) #Replace infinites for 0

In [117]:
constant_features = get_constant_features(clean_df)
#constant_features

constant_features['Var'] #Shows features that have constant values

0     resultado_das_atividades_descontinuadas
1                          IAE_CMVMC_ACT_BIOL
2                                        rank
3                        Uploads/Aplicavel_12
4                      3_IMG_EC_RS_SERSOCIAIS
                       ...                   
66                     Incentivo/Cap_Proprios
67                      Incentivo/Dispensa_Ic
68                          Paramproj/Param_1
69                             Resumo/Icep_75
70                         Impactoemp/Impacto
Name: Var, Length: 71, dtype: object

In [118]:
constant_features_list = constant_features['Var'].tolist()
constant_features_list.append('N_Proj_anon')
constant_features_list.append('CAE_SUBCLASSE')
constant_features_list.append('DATA_RECEPCAO')
constant_features_list.append('NIF_anon')
constant_features_list.append('Terminado')
constant_features_list.append('Anulado')
constant_features_list.append('Nproj_anon_x')
constant_features_list.append('Nproj_anon_y')
constant_features_list.append('ANO_EXERCICIO')
constant_features_list.append('ANO_EXERCICIO_VALIDOS')
constant_features_list.append('Parametros/Ano_Cand')
print('Existem %i colunas constantes' % len(constant_features_list))

Existem 82 colunas constantes


In [119]:
clean_df = clean_df.drop(constant_features_list, axis=1)
clean_df2 = clean_df2.drop(constant_features_list, axis=1)

In [120]:
object_columns = clean_df.select_dtypes(include='object').columns
object_columns_list = object_columns.tolist()
object_columns_list.remove("Motivo")

In [121]:
clean_df = clean_df.drop(columns=object_columns_list)
clean_df2 = clean_df2.drop(columns=object_columns_list)

In [122]:
projetos_encerrados = clean_df.loc[clean_df['Target'] == 0]
projetos_anulados_motivos2 = clean_df.loc[(clean_df['Target'] == 1) & (clean_df['Motivo'] != "Desistência do promotor")]
clean_df4 = pd.concat([projetos_encerrados, projetos_anulados_motivos2])

In [123]:
X = clean_df.drop(columns=['Target', 'Motivo'])
y = clean_df['Target']
scaler = MinMaxScaler() #MinMaxScaler
X_scaled = pd.DataFrame(scaler.fit_transform(X.values), columns= X.columns, index=X.index)
X3 = clean_df2.drop(columns=['Target', 'Motivo'])
y3 = clean_df2['Target']
X4 = clean_df4.drop(columns=['Target', 'Motivo'])
y4 = clean_df4['Target']

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, train_size=0.7, test_size=0.3)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, random_state=21, train_size=0.7, test_size=0.3)
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, random_state=21, train_size=0.7, test_size=0.3)

In [125]:
df_features_list = pd.read_csv(path_save + "table_features_experience_oficial.csv", encoding="latin-1")
display(df_features_list)

Unnamed: 0,Experience,Count,Columns
0,feat1,30,"RES_ANTES_DEPRECIACAO_GASTOS, GASTOS_DEPRECIAC..."
1,feat2,30,"GASTOS_PESSOAL, OUTROS_REDIMENTOS_GANHOS, RES_..."
2,feat3,50,"GASTOS_PESSOAL, RES_ANTES_DEPRECIACAO_GASTOS, ..."
3,feat4,50,"VENDAS_SERVICOS_PRESTADOS, GASTOS_PESSOAL, OUT..."
4,feat5,50,"GASTOS_DEPRECIACAO_AMORTIZA, Incentivo/Tx_Limi..."
5,feat6,50,"RES_ANTES_DEPRECIACAO_GASTOS, GASTOS_DEPRECIAC..."
6,feat7,50,"Txtfinanc/Fonte, Promotor/Nat_Jur, 3_IMG_EC_PR..."
7,allFeat,267,"VENDAS_SERVICOS_PRESTADOS, SUBSIDIOS_EXPLORACA..."
8,manualFeat,22,"EBITDA, EBIT, Total_Assets, Total_Liabilities,..."


In [126]:
len(df_features_list.loc[df_features_list['Experience'] == "manualFeat", ["Columns"]].values.tolist()[0][0].split(","))

22

In [127]:
feat5 = df_features_list.loc[df_features_list['Experience'] == "feat5", ["Columns"]].values.tolist()[0][0].split(",")
feat5 = [s.strip() for s in feat5]
feat7 = df_features_list.loc[df_features_list['Experience'] == "feat7", ["Columns"]].values.tolist()[0][0].split(",")
feat7 = [s.strip() for s in feat7]
manualFeat = df_features_list.loc[df_features_list['Experience'] == "manualFeat", ["Columns"]].values.tolist()[0][0].split(",")
manualFeat = [s.strip() for s in manualFeat]
feat5_new = feat5 + manualFeat
feat7_new = feat7 + manualFeat
feat5_new = list(set(feat5))
feat7_new = list(set(feat7))

In [128]:
feat5

['GASTOS_DEPRECIACAO_AMORTIZA',
 'Incentivo/Tx_Limite',
 'Operating cash flow current liabilities',
 'Dadosprojecto/N_Meses',
 'SUBSIDIOS_EXPLORACAO',
 'ATIVO_COR_ESTADO_OUT_ENTES_PUB',
 'PASSIVO_COR_OUT_CONTAS_A_PAGAR',
 'GP_REMUN_ORGAOS_SOCIAIS',
 'IAE_VAR_INVENT_PROD',
 'ATIVO_COR_DIFERIMENTOS',
 'Resumo/Cae',
 'IAE_VENDAS_MERCADORIAS',
 '2_IMG_COM_VENDAS',
 'total debt / total assets',
 'PASSIVO_NC_FINANCIAMENTOS_OBTD',
 'ATIVO_NCOR_INV_FINANC_PQ_ENTID',
 'Resumo/Nute_Norte',
 '3_IMG_EC_COMPRAS',
 'Growth_Rate_Net_Sales_T3',
 'NIF_Prom_anon',
 'GP_SEG_ACID_TRAB_DOEN_PROF',
 'IAE_AFT_QUANT_ESCR_LIQ_FIN',
 'earnings before tax and interest / total asset',
 'ATIVO_NCOR_PART_FINAN_EQV_PAT',
 'CP_OUTRAS_VARIACAOES_CAP_PRO',
 'Resumo/Investimento',
 'IAE_AFT_TOTAL_AQUIS_EDIF',
 'CP_RESERVAS_LEGAIS',
 'Growth_Rate_Net_Sales_T2',
 '1_IMG_INT_FORN_SEREXTERN',
 'PASSIVO_COR_ESTADO_OUT_ENT_PUB',
 'IAE_PREST_SERV',
 'IMPOSTO_RENDIMENTO_PERIODO',
 'ATIVO_COR_ACCIONISTAS_SOCIOS',
 'Inventory_Tur

In [129]:
feat7

['Txtfinanc/Fonte',
 'Promotor/Nat_Jur',
 '3_IMG_EC_PREST_SERV',
 'GASTOS_DEPRECIACAO_AMORTIZA',
 'RES_ANTES_DEPRECIACAO_GASTOS',
 'ATIVO_NCOR_PART_FINAN_EQV_PAT',
 'Growth_Rate_Net_Sales_T3',
 'ATIVO_NCOR_FIXOS_TANGIVEIS',
 'ATIVO_NCOR_INV_FINANC_PQ_ENTID',
 'Resumo/Lst_Po',
 '1_IMG_INT_AQUIS_ACT_INTANG',
 'PROVISOES',
 'CP_AJUST_EM_ACT_FINANCEIROS',
 'PASSIVO_NC_FINANCIAMENTOS_OBTD',
 'ATIVO_COR_INVENTARIOS',
 'ATIVO_COR_ACCIONISTAS_SOCIOS',
 '3_IMG_EC_AQUIS_ACT_FIX_TANG',
 '2_N_PESSOAL_NHT_PSETP_REMUNERADAS',
 'Resumo/Nute_Norte',
 '2_IMG_COM_AQUIS_ACT_FIX_TANG',
 '2_PESSOAL_NHT_PSE_TEMPO_PARCIAL',
 'Critselb1/N_Mercados',
 'Growth_Rate_Total_Assets_T3',
 'CP_TOTAL',
 'GP_REMUN_ORGAOS_SOCIAIS',
 '3_IMG_EC_COMPRAS',
 'Growth_Rate_Net_Sales_T2',
 'Growth_Rate_Net_Sales_T1',
 'Operating cash flow current liabilities',
 'IAE_CMVMC_MATER_PRIMAS',
 'Resumo/Investimento',
 '1_PESSOAL_NMP_PSE_MULHERES',
 'ATIVO_NCOR_PROPRI_INVESTIMENTO',
 'ATIVO_COR_CLIENTES',
 'Analisemercados/Direcao',
 '

In [130]:
manualFeat

['EBITDA',
 'EBIT',
 'Total_Assets',
 'Total_Liabilities',
 'total_assets_to_total_liabilities',
 'Working capital divided by total assets',
 'Gross income divided by sales',
 'total debt / total assets',
 'earnings before tax and interest / total asset',
 'Operating cash flow current liabilities',
 'Accounts_Receivables_Turnover',
 'Creditors_Turnover',
 'Inventory_Turnover',
 'Average_Collection_Period_For_Receivables',
 'Average_Payment_Period_To_Creditors',
 'Average_Turnover_Period_For_Inventories',
 'Growth_Rate_Net_Sales_T1',
 'Growth_Rate_Net_Sales_T2',
 'Growth_Rate_Net_Sales_T3',
 'Growth_Rate_Total_Assets_T1',
 'Growth_Rate_Total_Assets_T2',
 'Growth_Rate_Total_Assets_T3']

## Cenario 1

In [131]:
pipe = imblearnPipeline([('StandardScaler', StandardScaler()), ('over', SMOTE(random_state=21)), ("SupportVectorClassification", SVC(random_state=21))])
X_train_balanced, y_train_balanced = pipe['over'].fit_resample(X_train[feat5_new], y_train)
pipe.fit(X_train_balanced, y_train_balanced)
# Predict train dataset
print("#### TRAIN ####")
y_pred = pipe.predict(X_train_balanced)
print("Accuracy: %.2f" % (accuracy_score(y_train_balanced, y_pred)))
print("F1: %.2f" %(f1_score(y_train_balanced, y_pred)))
print("Precision: %.2f" %(precision_score(y_train_balanced, y_pred)))
print("Recall: %.2f" %(recall_score(y_train_balanced, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_train_balanced, y_pred)))

# Predict test dataset
print("#### TEST ####")
y_pred = pipe.predict(X_test[feat5_new])
print("Accuracy: %.2f" %(accuracy_score(y_test, y_pred)))
print("F1: %.2f" %(f1_score(y_test, y_pred)))
print("Precision: %.2f" %(precision_score(y_test, y_pred)))
print("Recall: %.2f" %(recall_score(y_test, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_test, y_pred)))


#### TRAIN ####
Accuracy: 0.79
F1: 0.79
Precision: 0.77
Recall: 0.82
ROC_AUC: 0.79
#### TEST ####
Accuracy: 0.72
F1: 0.66
Precision: 0.63
Recall: 0.70
ROC_AUC: 0.72


In [132]:
pipe = imblearnPipeline([('StandardScaler', StandardScaler()), ('under', RandomUnderSampler(random_state=21)), ('over', SMOTE(random_state=21)), ("SupportVectorClassification", SVC(random_state=21))])
X_train_balanced, y_train_balanced = pipe['over'].fit_resample(X_train[feat5_new], y_train)
pipe.fit(X_train_balanced, y_train_balanced)
# Predict train dataset
print("#### TRAIN ####")
y_pred = pipe.predict(X_train_balanced)
print("Accuracy: %.2f" % (accuracy_score(y_train_balanced, y_pred)))
print("F1: %.2f" %(f1_score(y_train_balanced, y_pred)))
print("Precision: %.2f" %(precision_score(y_train_balanced, y_pred)))
print("Recall: %.2f" %(recall_score(y_train_balanced, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_train_balanced, y_pred)))

# Predict test dataset
print("#### TEST ####")
y_pred = pipe.predict(X_test[feat5_new])
print("Accuracy: %.2f" %(accuracy_score(y_test, y_pred)))
print("F1: %.2f" %(f1_score(y_test, y_pred)))
print("Precision: %.2f" %(precision_score(y_test, y_pred)))
print("Recall: %.2f" %(recall_score(y_test, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_test, y_pred)))


#### TRAIN ####
Accuracy: 0.79
F1: 0.79
Precision: 0.77
Recall: 0.82
ROC_AUC: 0.79
#### TEST ####
Accuracy: 0.72
F1: 0.66
Precision: 0.63
Recall: 0.70
ROC_AUC: 0.72


In [133]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score)
}

In [134]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score

pipeline = imblearnPipeline([('StandardScaler', StandardScaler()), ('resampler', SMOTE(random_state=21)), ("classifier", SVC(random_state=21))])

# Define hyperparameters for resampler and classifier
param_grid = {
    'resampler__sampling_strategy': ['auto', 0.5, 0.75],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# Create custom scorers for F1 score and accuracy
#f1_scorer = make_scorer(f1_score, average='weighted')
#accuracy_scorer = make_scorer(accuracy_score)

# Grid Search with multiple scoring metrics
#scoring = {'F1': f1_scorer, 'Accuracy': accuracy_scorer}

# Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='f1_score')
grid_search.fit(X_train[feat5], y_train)

# Get the best estimator and best hyperparameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Access the results including all metrics
results = grid_search.cv_results_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Print the metrics for each combination of hyperparameters
for metric in ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']:
    mean_metric_values = np.nanmean(results[f'mean_test_{metric}']) #results[f'mean_test_{metric}']
    print(f"Mean {metric.capitalize()}: {mean_metric_values.mean():.4f}")

y_pred = best_estimator.predict(X_test[feat5])
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
precision = precision_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
recall = recall_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC is for binary classification
print("Best estimator metrics")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)

In [None]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score

pipeline = imblearnPipeline([('StandardScaler', StandardScaler()), ('undersampler', RandomUnderSampler(random_state=21)), ('resampler', SMOTE(random_state=21)), ("classifier", SVC(random_state=21))])

# Define hyperparameters for resampler and classifier
param_grid = {
    'resampler__sampling_strategy': ['auto', 0.5, 0.75],
    'undersampler__sampling_strategy': ['auto', 0.5, 0.75],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# Create custom scorers for F1 score and accuracy
#f1_scorer = make_scorer(f1_score, average='weighted')
#accuracy_scorer = make_scorer(accuracy_score)

# Grid Search with multiple scoring metrics
#scoring = {'F1': f1_scorer, 'Accuracy': accuracy_scorer}

# Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='f1_score')
grid_search.fit(X_train[feat5], y_train)

# Get the best estimator and best hyperparameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Access the results including all metrics
results = grid_search.cv_results_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Print the metrics for each combination of hyperparameters
for metric in ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']:
    mean_metric_values = np.nanmean(results[f'mean_test_{metric}']) #results[f'mean_test_{metric}']
    print(f"Mean {metric.capitalize()}: {mean_metric_values.mean():.4f}")

y_pred = best_estimator.predict(X_test[feat5])
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
precision = precision_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
recall = recall_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC is for binary classification
print("Best estimator metrics")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)

Best Hyperparameters: {'classifier__C': 10, 'classifier__kernel': 'rbf', 'resampler__sampling_strategy': 'auto', 'undersampler__sampling_strategy': 'auto'}
Mean Accuracy: 0.6414
Mean F1_score: 0.5542
Mean Precision: 0.4964
Mean Recall: 0.6356
Mean Roc_auc: 0.6402
Best estimator metrics
Accuracy: 0.6722222222222223
F1 Score: 0.6760398824909428
Precision: 0.6998944658944659
Recall: 0.6722222222222223
ROC AUC: 0.6838628193918197


210 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample

## Cenario 2.b

In [None]:
pipe = Pipeline([('StandardScaler', StandardScaler()), ('pca', PCA()), ("MLP", MLPClassifier(random_state=21))])
#X_train_balanced, y_train_balanced = pipe['over'].fit_resample(X_train[feat5_new], y_train)
pipe.fit(X_train4[feat5_new], y_train4)
# Predict train dataset
print("#### TRAIN ####")
y_pred = pipe.predict(y_train4)
print("Accuracy: %.2f" % (accuracy_score(y_train4, y_pred)))
print("F1: %.2f" %(f1_score(y_train4, y_pred)))
print("Precision: %.2f" %(precision_score(y_train4, y_pred)))
print("Recall: %.2f" %(recall_score(y_train4, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_train4, y_pred)))

# Predict test dataset
print("#### TEST ####")
y_pred = pipe.predict(X_test4[feat5_new])
print("Accuracy: %.2f" %(accuracy_score(y_test4, y_pred)))
print("F1: %.2f" %(f1_score(y_test4, y_pred)))
print("Precision: %.2f" %(precision_score(y_test4, y_pred)))
print("Recall: %.2f" %(recall_score(y_test4, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_test4, y_pred)))


In [None]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score

pipe = Pipeline([('StandardScaler', StandardScaler()), ('pca', PCA()), ("MLP", MLPClassifier(random_state=21))])

# Define hyperparameters for resampler and classifier
param_grid = {
    'hidden_layer_sizes': [(50,), (100, 50), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [100, 200, 300],
    'early_stopping': [True, False],
    'batch_size': [32, 64, 128],
    'tol': [1e-4, 1e-3, 1e-2]
}

# Create custom scorers for F1 score and accuracy
#f1_scorer = make_scorer(f1_score, average='weighted')
#accuracy_scorer = make_scorer(accuracy_score)

# Grid Search with multiple scoring metrics
#scoring = {'F1': f1_scorer, 'Accuracy': accuracy_scorer}

# Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='f1_score')
grid_search.fit(X_train[feat5], y_train)

# Get the best estimator and best hyperparameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Access the results including all metrics
results = grid_search.cv_results_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Print the metrics for each combination of hyperparameters
for metric in ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']:
    mean_metric_values = np.nanmean(results[f'mean_test_{metric}']) #results[f'mean_test_{metric}']
    print(f"Mean {metric.capitalize()}: {mean_metric_values.mean():.4f}")

y_pred = best_estimator.predict(X_test[feat5])
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
precision = precision_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
recall = recall_score(y_test, y_pred, average='weighted')  # Adjust 'average' as needed
roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC is for binary classification
print("Best estimator metrics")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)

ValueError: Invalid parameter 'activation' for estimator Pipeline(steps=[('StandardScaler', StandardScaler()),
                ('undersampler', RandomUnderSampler(random_state=21)),
                ('resampler', SMOTE(random_state=21)),
                ('classifier', RandomForestClassifier(random_state=21))]). Valid parameters are: ['memory', 'steps', 'verbose'].

## Cenario 3

In [None]:
pipe = imblearnPipeline([('StandardScaler', StandardScaler()), ('under', RandomUnderSampler(random_state=21)), ('over', SMOTE(random_state=21)), ("RandomForest", RandomForestClassifier(random_state=21))])
X_train_balanced, y_train_balanced = pipe['over'].fit_resample(X_train3[feat7], y_train3)
pipe.fit(X_train_balanced, y_train_balanced)
# Predict train dataset
print("#### TRAIN ####")
y_pred = pipe.predict(X_train_balanced)
print("Accuracy: %.2f" % (accuracy_score(y_train_balanced, y_pred)))
print("F1: %.2f" %(f1_score(y_train_balanced, y_pred)))
print("Precision: %.2f" %(precision_score(y_train_balanced, y_pred)))
print("Recall: %.2f" %(recall_score(y_train_balanced, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_train_balanced, y_pred)))

# Predict test dataset
print("#### TEST ####")
y_pred = pipe.predict(X_test3[feat7])
print("Accuracy: %.2f" %(accuracy_score(y_test3, y_pred)))
print("F1: %.2f" %(f1_score(y_test3, y_pred)))
print("Precision: %.2f" %(precision_score(y_test3, y_pred)))
print("Recall: %.2f" %(recall_score(y_test3, y_pred)))
print("ROC_AUC: %.2f" %(roc_auc_score(y_test3, y_pred)))


#### TRAIN ####
Accuracy: 0.98
F1: 0.98
Precision: 0.96
Recall: 1.00
ROC_AUC: 0.98
#### TEST ####
Accuracy: 0.72
F1: 0.64
Precision: 0.73
Recall: 0.58
ROC_AUC: 0.70


In [None]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score

pipeline = imblearnPipeline([('StandardScaler', StandardScaler()), ('undersampler', RandomUnderSampler(random_state=21)), ('resampler', SMOTE(random_state=21)), ("classifier", RandomForestClassifier(random_state=21))])

# Define hyperparameters for resampler and classifier
param_grid = {
    'resampler__sampling_strategy': ['auto', 0.5, 0.75],
    'undersampler__sampling_strategy': ['auto', 0.5, 0.75],
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

# Create custom scorers for F1 score and accuracy
#f1_scorer = make_scorer(f1_score, average='weighted')
#accuracy_scorer = make_scorer(accuracy_score)

# Grid Search with multiple scoring metrics
#scoring = {'F1': f1_scorer, 'Accuracy': accuracy_scorer}

# Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='f1_score')
grid_search.fit(X_train3[feat7], y_train3)

# Get the best estimator and best hyperparameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Access the results including all metrics
results = grid_search.cv_results_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Print the metrics for each combination of hyperparameters
for metric in ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']:
    mean_metric_values = np.nanmean(results[f'mean_test_{metric}']) #results[f'mean_test_{metric}']
    print(f"Mean {metric.capitalize()}: {mean_metric_values.mean():.4f}")

y_pred = best_estimator.predict(X_test3[feat7])
accuracy = accuracy_score(y_test3, y_pred)
f1 = f1_score(y_test3, y_pred, average='weighted')  # Adjust 'average' as needed
precision = precision_score(y_test3, y_pred, average='weighted')  # Adjust 'average' as needed
recall = recall_score(y_test3, y_pred, average='weighted')  # Adjust 'average' as needed
roc_auc = roc_auc_score(y_test3, y_pred)  # ROC AUC is for binary classification
print("Best estimator metrics")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC:", roc_auc)

945 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\afons\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resamp

Best Hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100, 'resampler__sampling_strategy': 'auto', 'undersampler__sampling_strategy': 'auto'}
Mean Accuracy: 0.7059
Mean F1_score: 0.6302
Mean Precision: 0.6228
Mean Recall: 0.6421
Mean Roc_auc: 0.6946
Best estimator metrics
Accuracy: 0.7248157248157249
F1 Score: 0.7238498686774549
Precision: 0.7237686478192807
Recall: 0.7248157248157249
ROC AUC: 0.718220753793441
