# Global and Multiobjective Optimization project 


In [1]:
import pandas as pd 
import numpy as np

# Load the raw training table (semicolon-separated CSV).
df = pd.read_csv("data/training_df_final.csv", sep=";", low_memory=False)

# Half of the column count, floored to an integer. Integer arithmetic keeps
# `thresh` an int (as pandas documents it) and stays correct when the number
# of columns is odd, where `n * 0.5 + 1` would demand one non-NaN value more
# than "more than 50%" actually requires.
min_non_nan = len(df.columns) // 2

# Keep rows that have MORE than 50% non-NaN values.
# Reassigning (instead of inplace=True) avoids mutating the frame in place.
df = df.dropna(thresh=min_non_nan + 1)
print(df.shape)
df.sample(5)

(10433, 126)


Unnamed: 0,ACID_TEST_X,CASH_SECU_M,CFL_M,CREDITOR_DY_Q,DEBTOR_DY_Q,DEPRA_M,SHRT_TRM_LIABI_M,STOCK_M,STOCK_TO_Q,SUM_ASSETS_M,...,TRD_CRED_M_mean,TRD_CRED_M_median,TURNOVER_M_mean,TURNOVER_M_median,WRK_CAP_M_mean,WRK_CAP_M_median,WRK_CAP_REQ_M_mean,WRK_CAP_REQ_M_median,target,year
5134,5.635,2550868.0,341768.0,64.0,142.0,248220.0,217343.0,567116.0,29.0,7506985.0,...,3264329.0,543539.0,10773810.0,2137356.0,100864.6,104551.5,-365913.0,67854.0,1,2015
7154,0.625,757981.0,951975.0,,52.0,67122.0,2712148.0,676223.0,39.0,5235190.0,...,2083662.0,494560.0,8154246.0,1529835.0,2002685.0,171312.0,1392524.0,91108.0,0,2017
9427,,,,,,,,,,,...,3760137.0,655697.0,17975170.0,2490692.0,1204295.0,96529.5,1476919.0,48821.0,1,2019
10766,1.281,10608.0,105697.0,,177.0,88422.0,807607.0,152333.0,49.0,1549968.0,...,3687741.0,668489.0,16825900.0,2520467.0,3641166.0,554650.0,1987091.0,246587.0,1,2022
9920,1.766,840458.0,140586.0,163.0,50.0,128570.0,254730.0,237153.0,34.0,3073528.0,...,1822735.0,327789.0,6705116.0,1214379.0,1534599.0,239984.0,971868.8,73892.5,1,2020


In [None]:
import sys 
import os

sys.path.append(os.path.abspath("src"))
from xgboost_model import (fit_model, oot_train_test_split, 
                           get_dummies_cols)

from helper_functions import evaluate_model

# Train/test split. No features are dropped and no top-feature subset is
# requested; `oot=2022` is forwarded to the project helper
# (NOTE(review): presumably the out-of-time year held out for testing - confirm
# against xgboost_model.oot_train_test_split).
split_result = oot_train_test_split(
    df,
    features_to_drop=None,
    top_features=None,
    oot=2022,
)
X_train, X_test, y_train, y_test = split_result

# Independent copy of the training features, used later as the reference
# frame passed to the discretization helper.
source = X_train.copy()

Training vs Validation size: 9266 - 1167
Training set Class distribution:
TARGET
1    80.4
0    19.6
Name: count, dtype: float64
Test set Class distribution:
TARGET
1    77.6
0    22.4
Name: count, dtype: float64


## Preprocessing: Discretization
Discretize the continuous features based on their training-set distribution.

In [None]:
from helper_functions import discretize_multiple_features

# Columns that must be left untouched by the discretization step.
to_not_discretize = [
    'VAL_GRD_C',
    'Sector',
    'EX_POLICY_HOLDER',
    'BELONG_GROUP',
    'is_company_italian',
    'REGION_GROUP',
    'year',
]

# Every other column of `df` is a discretization candidate.
# NOTE(review): the candidates are taken from `df`, not from `X_train`, so any
# column of `df` that is absent from the feature frames (e.g. the label) is
# listed too - confirm discretize_multiple_features tolerates that.
column_list = list(df.columns)
to_discretize = list(filter(lambda name: name not in to_not_discretize, column_list))

In [4]:
# Discretize train and test with the same reference frame (`source`) and the
# same number of bins; the training frame is processed first, then the test frame.
X_train_discretized, X_test_discretized = (
    discretize_multiple_features(source, frame, to_discretize, n_bins=10)
    for frame in (X_train, X_test)
)

# Retain only the columns whose last underscore-separated token is neither
# 'median' nor 'mean'; the same column list is applied to both frames.
# NOTE(review): this filter runs after discretization - confirm the helper
# does not rename columns in a way that hides the mean/median suffix.
cols = list(X_train_discretized.columns)
cols_to_retain = [
    name for name in cols
    if name.rsplit('_', 1)[-1] not in ('median', 'mean')
]
X_train_discretized = X_train_discretized[cols_to_retain]
X_test_discretized = X_test_discretized[cols_to_retain]

Prepared discretization for ACID_TEST_X
Prepared discretization for CASH_SECU_M
Prepared discretization for CFL_M
Prepared discretization for CREDITOR_DY_Q
Prepared discretization for DEBTOR_DY_Q
Prepared discretization for DEPRA_M
Prepared discretization for SHRT_TRM_LIABI_M
Prepared discretization for STOCK_M
Prepared discretization for STOCK_TO_Q
Prepared discretization for SUM_ASSETS_M
Prepared discretization for SUM_FIX_ASSETS_M
Prepared discretization for SUM_REM_WAG_SAL_M
Prepared discretization for EXPO_TURNOVER_X
Prepared discretization for FINANCE_ASSETS_M
Prepared discretization for GEARING_RATIO_M
Prepared discretization for GR_PROF_M
Prepared discretization for INTA_ASSETS_M
Prepared discretization for INT_COVER_X
Prepared discretization for INT_PAID_M
Prepared discretization for LONG_TRM_LIABI_M
Prepared discretization for NON_TRADE_INCO_M
Prepared discretization for OPE_PROF_M
Prepared discretization for PRE_TAX_PRO_M
Prepared discretization for PROFIT_MARGIN_X
Prepared 

In [5]:
# Encode the two categorical columns through the project helper, which
# returns the updated train and test frames.
X_train_discretized, X_test_discretized = get_dummies_cols(
    X_train_discretized,
    X_test_discretized,
    cols=['Sector', "REGION_GROUP"],
)

# Remove the 'year' column from both feature frames (in place).
for frame in (X_train_discretized, X_test_discretized):
    frame.drop(columns=['year'], inplace=True)

# Features plus label in one table; the label series is appended as the last column.
training_df = pd.concat([X_train_discretized, y_train], axis=1)
print(training_df.shape)

# casting boolean columns to int
bool_cols = [name for name, dtype in training_df.dtypes.items() if dtype == 'bool']
training_df[bool_cols] = training_df[bool_cols].astype(int)

training_df.sample(5)

(9266, 130)


Unnamed: 0,VAL_GRD_C,EX_POLICY_HOLDER,BELONG_GROUP,is_company_italian,ACID_TEST_X_PERCENTILE,CASH_SECU_M_PERCENTILE,CFL_M_PERCENTILE,CREDITOR_DY_Q_PERCENTILE,DEBTOR_DY_Q_PERCENTILE,DEPRA_M_PERCENTILE,...,Sector_Paper,Sector_Paper-X,Sector_Retail,Sector_Services,Sector_Textiles,Sector_Transport,Sector_Unknown,REGION_GROUP_NORD,REGION_GROUP_SUD,TARGET
4561,6.0,1,0,1,,,,,,,...,0,1,0,0,0,0,0,0,0,1
9082,7.0,0,0,1,2.0,6.0,,,,0.0,...,0,0,0,0,0,0,0,1,0,1
7835,6.0,0,0,1,,,,,,,...,0,0,0,0,1,0,0,0,0,1
5630,4.0,0,1,1,0.0,1.0,9.0,5.0,2.0,9.0,...,1,0,0,0,0,0,0,0,0,1
4861,7.0,0,0,1,,,,,,,...,1,0,0,0,0,0,0,1,0,1


In [6]:
# Shapes of the train and test feature matrices, one per line.
print(X_train_discretized.shape, X_test_discretized.shape, sep="\n")

# Baseline: fit on the full discretized feature set, then evaluate on the
# held-out test set with the project helpers.
baseline_model, best_params = fit_model(X_train_discretized, y_train)
print("")
baseline_metrics = evaluate_model(baseline_model, X_test_discretized, y_test)

(9266, 129)
(1167, 129)

	Training Performance		Validation Performance
	--------------------		----------------------
[0]	validation_0-aucpr:0.85638	validation_1-aucpr:0.85661
[50]	validation_0-aucpr:0.90819	validation_1-aucpr:0.87884
[100]	validation_0-aucpr:0.91601	validation_1-aucpr:0.88240
[150]	validation_0-aucpr:0.92070	validation_1-aucpr:0.88209
[156]	validation_0-aucpr:0.92139	validation_1-aucpr:0.88262
Working with parameters: {'eta': 0.01, 'gamma': 0.1, 'max_depth': 6, 'subsample': 0.2, 'colsample_bytree': 1, 'objective': 'binary:logistic', 'base_score': 0.5, 'eval_metric': 'aucpr', 'seed': 42, 'min_child_weight': 2, 'reg_alpha': 2, 'importance_type': 'gain', 'n_estimators': 106}
Done !

Accuracy: 0.7284
Precision: 0.8156
Recall: 0.8400
AUC: 0.6560


# Filter Approach

In [None]:
from Filter_GA import Filter_Genetic_Algorithm

# Configure the filter-based genetic algorithm.
filter_ga = Filter_Genetic_Algorithm(
    population_size=50,
    generations=100,
    mutation_rate=0.1,
    crossover_rate=0.9,
)

# Run the search on the full training table. The result is unpacked into the
# selected-feature indicator and the time consumption of the run.
filter_ga_result = filter_ga.evolve(
    training_df,
    penalty=0.1,
    no_improvement_threshold=25,
)
filter_ga_features, filter_ga_time_consumption = filter_ga_result

Generation 0: New best new individual with fitness = 0.0395
Generation 0: Best=0.0395, 
 Difference with full feature set=78
Generation 1: New best new individual with fitness = 0.0388
Generation 2: New best new individual with fitness = 0.0349
Generation 3: New best new individual with fitness = 0.0333
Generation 4: New best new individual with fitness = 0.0302
Generation 5: New best new individual with fitness = 0.0279
Generation 6: No improvement
Generation 7: No improvement
Generation 8: No improvement
Generation 9: No improvement
Generation 10: No improvement
Generation 10: Best=0.0279, 
 Difference with full feature set=93
Generation 11: No improvement
Generation 12: No improvement
Generation 13: No improvement
Generation 14: No improvement
Generation 15: No improvement
Generation 16: No improvement
Generation 17: No improvement
Generation 18: No improvement
Generation 19: No improvement
Generation 20: New best new individual with fitness = 0.0264
Generation 20: Best=0.0264, 
 Di

In [None]:
# Report the filter GA run time in minutes, rounded to two decimals
# (the value is divided by 60, so it is presumably recorded in seconds).
filter_ga_minutes = round(filter_ga_time_consumption / 60, 2)
print(filter_ga_minutes, "minutes")

12.38 minutes


In [None]:
y_train = training_df['TARGET']

# Names of the features flagged with 1 by the filter GA. The last column of
# training_df is skipped when mapping indicator positions to column names.
filter_ga_selected = training_df.columns[:-1][np.where(filter_ga_features == 1)[0]]
filter_ga_X_train = training_df[filter_ga_selected]
filter_ga_X_test = X_test_discretized[filter_ga_selected]

print(filter_ga_X_train.shape, y_train.shape, sep="\n")

# Refit and evaluate using only the selected features.
filter_ga_model, best_params = fit_model(filter_ga_X_train, y_train)
print("")
filter_ga_metrics = evaluate_model(filter_ga_model, filter_ga_X_test, y_test)

(9266, 30)
(9266,)

	Training Performance		Validation Performance
	--------------------		----------------------
[0]	validation_0-aucpr:0.84939	validation_1-aucpr:0.83794
[50]	validation_0-aucpr:0.89816	validation_1-aucpr:0.87431
[100]	validation_0-aucpr:0.90296	validation_1-aucpr:0.87483
[123]	validation_0-aucpr:0.90462	validation_1-aucpr:0.87482
Working with parameters: {'eta': 0.01, 'gamma': 0.1, 'max_depth': 6, 'subsample': 0.2, 'colsample_bytree': 1, 'objective': 'binary:logistic', 'base_score': 0.5, 'eval_metric': 'aucpr', 'seed': 42, 'min_child_weight': 2, 'reg_alpha': 2, 'importance_type': 'gain', 'n_estimators': 73}
Done !

Accuracy: 0.7266
Precision: 0.7986
Recall: 0.8664
AUC: 0.6135


# Wrapper Method

In [13]:
from xgboost import XGBClassifier

# Fixed hyper-parameters for the classifier evaluated inside the wrapper GA.
params = {
    'eta': 0.01,
    'gamma': 0.1,
    'max_depth': 6,
    'subsample': 0.2,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'base_score': 0.5,
    'eval_metric': 'aucpr',
    'seed': 42,
    'min_child_weight': 2,
    'reg_alpha': 2,
    'importance_type': 'gain',
    'n_estimators': 221,
}

# Class frequencies of the training labels, ordered by label value.
class_counts = y_train.value_counts().sort_index()

# scale_pos_weight is the ratio of label-0 rows to label-1 rows.
model = XGBClassifier(
    **params,
    scale_pos_weight=(class_counts[0] / class_counts[1]),
)


In [37]:
from Wrapper_GA import Wrapper_Genetic_Algorithm

# Configure the wrapper-based genetic algorithm.
wrapper_ga = Wrapper_Genetic_Algorithm(
    population_size=50,
    generations=100,
    mutation_rate=0.1,
    crossover_rate=0.9,
)

# Run the search with the pre-configured classifier. The second argument is
# passed positionally.
# NOTE(review): 0.1 is presumably the same penalty used for the filter GA -
# confirm against Wrapper_Genetic_Algorithm.evolve.
wrapper_ga_result = wrapper_ga.evolve(
    training_df,
    0.1,
    model,
    no_improvement_threshold=25,
)
wrapper_ga_features, wrapper_ga_time_consumption = wrapper_ga_result

Generation 0: New best new individual with fitness = 0.7951
Generation 0: Best=0.7951, 
 Difference with full feature set=76
Generation 1: New best new individual with fitness = 0.7967
Generation 2: No improvement
Generation 3: New best new individual with fitness = 0.7999
Generation 4: New best new individual with fitness = 0.8009
Generation 5: New best new individual with fitness = 0.8025
Generation 6: New best new individual with fitness = 0.8067
Generation 7: New best new individual with fitness = 0.8070
Generation 8: No improvement
Generation 9: No improvement
Generation 10: No improvement
Generation 10: Best=0.8070, 
 Difference with full feature set=93
Generation 11: No improvement
Generation 12: No improvement
Generation 13: New best new individual with fitness = 0.8085
Generation 14: New best new individual with fitness = 0.8086
Generation 15: No improvement
Generation 16: No improvement
Generation 17: No improvement
Generation 18: New best new individual with fitness = 0.8098

In [38]:
# Report the wrapper GA run time in minutes, rounded to two decimals
# (the value is divided by 60, so it is presumably recorded in seconds).
wrapper_ga_minutes = round(wrapper_ga_time_consumption / 60, 2)
print(wrapper_ga_minutes, "minutes")

50.38 minutes


In [39]:
y_train = training_df['TARGET']

# Names of the features flagged with 1 by the wrapper GA. The last column of
# training_df is skipped when mapping indicator positions to column names.
wrapper_ga_selected = training_df.columns[:-1][np.where(wrapper_ga_features == 1)[0]]
wrapper_ga_X_train = training_df[wrapper_ga_selected]
wrapper_ga_X_test = X_test_discretized[wrapper_ga_selected]

print(wrapper_ga_X_train.shape, y_train.shape, sep="\n")

# Refit and evaluate using only the selected features.
wrapper_ga_model, best_params = fit_model(wrapper_ga_X_train, y_train)
print("")
wrapper_ga_metrics = evaluate_model(wrapper_ga_model, wrapper_ga_X_test, y_test)

(9266, 36)
(9266,)

	Training Performance		Validation Performance
	--------------------		----------------------
[0]	validation_0-aucpr:0.85487	validation_1-aucpr:0.84829
[50]	validation_0-aucpr:0.90056	validation_1-aucpr:0.87337
[100]	validation_0-aucpr:0.90727	validation_1-aucpr:0.87799
[150]	validation_0-aucpr:0.91047	validation_1-aucpr:0.87811
[200]	validation_0-aucpr:0.91319	validation_1-aucpr:0.87956
[220]	validation_0-aucpr:0.91437	validation_1-aucpr:0.87997
Working with parameters: {'eta': 0.01, 'gamma': 0.1, 'max_depth': 6, 'subsample': 0.2, 'colsample_bytree': 1, 'objective': 'binary:logistic', 'base_score': 0.5, 'eval_metric': 'aucpr', 'seed': 42, 'min_child_weight': 2, 'reg_alpha': 2, 'importance_type': 'gain', 'n_estimators': 217}
Done !

Accuracy: 0.7249
Precision: 0.8176
Recall: 0.8311
AUC: 0.6533


# Benchmarks

## Accuracy

In [40]:
# Pull the accuracy of each model out of its metrics dictionary.
baseline_accuracy, filter_ga_accuracy, wrapper_ga_accuracy = (
    metrics['accuracy']
    for metrics in (baseline_metrics, filter_ga_metrics, wrapper_ga_metrics)
)

# One line per model, four decimals.
print(
    f"Baseline Accuracy: {baseline_accuracy:.4f}",
    f"Filter GA Accuracy: {filter_ga_accuracy:.4f}",
    f"Wrapper GA Accuracy: {wrapper_ga_accuracy:.4f}",
    sep="\n",
)

Baseline Accuracy: 0.7284
Filter GA Accuracy: 0.7266
Wrapper GA Accuracy: 0.7249


## Precision

In [41]:
# Pull the precision of each model out of its metrics dictionary.
baseline_precision, filter_ga_precision, wrapper_ga_precision = (
    metrics['precision']
    for metrics in (baseline_metrics, filter_ga_metrics, wrapper_ga_metrics)
)

# One line per model, four decimals.
print(
    f"Baseline Precision: {baseline_precision:.4f}",
    f"Filter GA Precision: {filter_ga_precision:.4f}",
    f"Wrapper GA Precision: {wrapper_ga_precision:.4f}",
    sep="\n",
)

Baseline Precision: 0.8156
Filter GA Precision: 0.7986
Wrapper GA Precision: 0.8176


## Recall

In [42]:
# Pull the recall of each model out of its metrics dictionary.
baseline_recall, filter_ga_recall, wrapper_ga_recall = (
    metrics['recall']
    for metrics in (baseline_metrics, filter_ga_metrics, wrapper_ga_metrics)
)

# One line per model, four decimals.
print(
    f"Baseline Recall: {baseline_recall:.4f}",
    f"Filter GA Recall: {filter_ga_recall:.4f}",
    f"Wrapper GA Recall: {wrapper_ga_recall:.4f}",
    sep="\n",
)

Baseline Recall: 0.8400
Filter GA Recall: 0.8664
Wrapper GA Recall: 0.8311


## AUC-ROC

In [43]:
# Pull the AUC of each model out of its metrics dictionary.
baseline_auc, filter_ga_auc, wrapper_ga_auc = (
    metrics['auc']
    for metrics in (baseline_metrics, filter_ga_metrics, wrapper_ga_metrics)
)

# One line per model, four decimals.
print(
    f"Baseline AUC: {baseline_auc:.4f}",
    f"Filter GA AUC: {filter_ga_auc:.4f}",
    f"Wrapper GA AUC: {wrapper_ga_auc:.4f}",
    sep="\n",
)

Baseline AUC: 0.6560
Filter GA AUC: 0.6135
Wrapper GA AUC: 0.6533


# Fitness evolution 

In [1]:
import json

# Read the filter GA evolution log (a JSON list with one record per generation).
with open('results/filter_evolution_log.json', 'r') as f:
    filter_data = json.load(f)

# Split the records into parallel lists for plotting; a generation is
# coloured blue when its "improvement" flag is truthy and red otherwise.
generations, fitness, colors = [], [], []
for record in filter_data:
    generations.append(record["generation"])
    fitness.append(record["fitness"])
    colors.append('blue' if record["improvement"] else 'red')

In [2]:
import plotly.graph_objects as go

# Marker appearance: per-generation colour, fixed size, dark outline.
marker_style = dict(
    color=colors,
    size=10,
    line=dict(width=2, color='DarkSlateGrey'),
)

# Fitness per generation, drawn as connected markers.
fitness_trace = go.Scatter(
    x=generations,
    y=fitness,
    mode='markers+lines',
    marker=marker_style,
)
fig = go.Figure(data=fitness_trace)

fig.update_layout(
    title="Fitness Evolution Over Generations (Filter GA)",
    xaxis_title="Generation",
    yaxis_title="Fitness",
    template="plotly_white",
)

fig

In [3]:
# Read the wrapper GA evolution log (a JSON list with one record per generation).
with open('results/wrapper_evolution_log.json', 'r') as f:
    wrapper_data = json.load(f)

# Split the records into parallel lists for plotting; a generation is
# coloured blue when its "improvement" flag is truthy and red otherwise.
generations, fitness, colors = [], [], []
for record in wrapper_data:
    generations.append(record["generation"])
    fitness.append(record["fitness_(precision)"])
    colors.append('blue' if record["improvement"] else 'red')

In [4]:
# Create scatter plot of fitness per generation. At this point `generations`,
# `fitness` and `colors` hold the values extracted from the wrapper GA log.
fig = go.Figure(data=go.Scatter(
    x=generations,
    y=fitness,
    mode='markers+lines',
    marker=dict(
        color=colors,
        size=10,
        line=dict(width=2, color='DarkSlateGrey')
    )
))

# The title names the wrapper GA: the original text was copy-pasted from the
# filter GA plot and mislabelled this figure.
fig.update_layout(
    title="Fitness Evolution Over Generations (Wrapper GA)",
    xaxis_title="Generation",
    yaxis_title="Fitness",
    template="plotly_white"
)

fig