# 1. LIBRARIES

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import warnings
import pickle
import time
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, roc_curve, auc, roc_auc_score,
    matthews_corrcoef, balanced_accuracy_score, cohen_kappa_score, 
    log_loss, brier_score_loss
)
from sklearn.utils.class_weight import compute_class_weight

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
import xgboost as xgb
# import lightgbm as lgb
# from catboost import CatBoostClassifier

# 2. GPU CONFIGURATION

In [2]:
# GPU detection is informational only: XGBoost is configured for CPU training
# with the histogram tree method in every case, so its parameters are defined
# once here instead of being repeated in each branch.
GPU_AVAILABLE = False
XGBOOST_GPU_PARAMS = {
    'tree_method': 'hist',
    'n_jobs': -1
}

try:
    # TensorFlow is used only to enumerate GPUs; it is an optional dependency.
    import tensorflow as tf
    gpus = tf.config.list_physical_devices('GPU')
except Exception:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit are not swallowed while still tolerating any failure to
    # import or query TensorFlow.
    print("\n TensorFlow not available - using CPU for all models")
else:
    if gpus:
        print(f"\n✓ GPU DETECTED:")
        for i, gpu in enumerate(gpus):
            print(f"  • GPU {i}: {gpu.name}")

        GPU_AVAILABLE = True
        print("  • XGBoost: Using CPU (more efficient for large datasets)")
        print("  • LightGBM: Using CPU")
        print("  • CatBoost: Using CPU")
    else:
        print("\n No GPU detected - using CPU")


 TensorFlow not available - using CPU for all models


# 3. CREATE FOLDER STRUCTURE

In [3]:
# Output directory tree for this notebook, written relative to a single root
# so the individual entries are easier to scan.
_output_root = 'db/02a_classical_models'
_subfolders = (
    'saved_models',
    'predictions',
    'metrics',
    'metrics/by_architecture',
    'model_data/feature_importance',
    'model_data/confusion_matrices',
    'model_data/confusion_matrices/by_architecture',
    'model_data/roc_data',
    'model_data/roc_data/by_architecture',
    'model_data/hyperparameters',
    'model_data/architecture_comparisons',
    'comparative_tables',
)
folders = [f'{_output_root}/{subfolder}' for subfolder in _subfolders]

# exist_ok=True keeps this cell re-runnable when the tree already exists.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# 4. LOAD CLEAN DATA

In [4]:
# Path of the CSV read into `df`.
output_file = 'db/01_cleaned_data/displacement_vs_others_final.csv'
df = pd.read_csv(output_file)

# Short summary of what was read: row count and column names.
n_loaded = len(df)
print(f"\nData loaded: {n_loaded:,} records")
print(f"Columns: {list(df.columns)}")


Data loaded: 5,696,308 records
Columns: ['ESTADO_DEPTO', 'VIGENCIA', 'HECHO', 'SEXO', 'ETNIA', 'DISCAPACIDAD', 'CICLO_VITAL', 'EVENTOS', 'Desplazamiento_forzado_binaria', 'km_norte_sur', 'km_este_oeste', 'distancia_total']


# 5. DATA PREPARATION (FULL PIPELINE)

In [5]:
# TargetEncoder lives in the optional `category_encoders` package. Record
# whether it could be imported; the flag is flipped to False on any failure.
HAS_TARGET_ENCODER = True
try:
    from category_encoders.target_encoder import TargetEncoder
except Exception:
    HAS_TARGET_ENCODER = False
    print("Category_encoders not available, will use frequency encoding")

# Model inputs, grouped by kind; the combined order is categorical first,
# then numeric.
_categorical_predictors = [
    'ESTADO_DEPTO',
    'SEXO',
    'ETNIA',
    'DISCAPACIDAD',
    'CICLO_VITAL',
]
_numeric_predictors = [
    'VIGENCIA',
    'EVENTOS',
    'km_norte_sur',
    'km_este_oeste',
    'distancia_total',
]
predictor_vars = _categorical_predictors + _numeric_predictors

# Binary target column.
target_var = 'Desplazamiento_forzado_binaria'

In [6]:
# Coerce the numeric predictors to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
for _numeric_col in ('EVENTOS', 'VIGENCIA', 'km_norte_sur', 'km_este_oeste', 'distancia_total'):
    df[_numeric_col] = pd.to_numeric(df[_numeric_col], errors='coerce')

In [7]:
# Count nulls per modelling column (predictors + target) and drop incomplete
# rows only when at least one null exists.
_model_columns = predictor_vars + [target_var]
missing = df[_model_columns].isnull().sum()
if missing.sum() == 0:
    print("\nNo missing values")
else:
    print(f"\nMissing values detected:")
    print(missing[missing > 0])
    df = df.dropna(subset=_model_columns)
    print(f"Records after removing missing: {len(df):,}")


No missing values


In [8]:
# Feature frame (copied so later edits do not touch df) and target array.
X = df[predictor_vars].copy()
y = df[target_var].values

# Column groups used by the encoding and scaling steps.
categorical_cols = ['SEXO', 'ETNIA', 'CICLO_VITAL', 'DISCAPACIDAD', 'ESTADO_DEPTO']
numeric_cols = ['EVENTOS', 'VIGENCIA', 'km_norte_sur', 'km_este_oeste', 'distancia_total']

_n_samples, _n_features = X.shape
print(f"\nDataset prepared:")
print(f"Features: {_n_features}")
print(f"Samples: {_n_samples:,}")
print(f"Categorical: {categorical_cols}")
print(f"Numeric: {numeric_cols}")


Dataset prepared:
Features: 10
Samples: 5,696,308
Categorical: ['SEXO', 'ETNIA', 'CICLO_VITAL', 'DISCAPACIDAD', 'ESTADO_DEPTO']
Numeric: ['EVENTOS', 'VIGENCIA', 'km_norte_sur', 'km_este_oeste', 'distancia_total']


In [9]:
# Report how many samples fall in each target class and the 0:1 ratio.
class_counts = pd.Series(y).value_counts()
_n_total = len(y)
_n_class0 = class_counts.get(0,0)
_n_class1 = class_counts.get(1,0)

print(f"\nClass distribution:")
print(f"Class 0 (Others): {_n_class0:,} ({_n_class0/_n_total*100:.2f}%)")
print(f"Class 1 (Displacement): {_n_class1:,} ({_n_class1/_n_total*100:.2f}%)")
# The ratio is undefined when class 1 is absent, so it is skipped then.
if _n_class1 > 0:
    print(f"Imbalance ratio: {_n_class0/_n_class1:.2f}:1")


Class distribution:
Class 0 (Others): 3,565,834 (62.60%)
Class 1 (Displacement): 2,130,474 (37.40%)
Imbalance ratio: 1.67:1


## 5.1. CATEGORICAL CARDINALITY & ENCODING

In [10]:
# Profile every categorical predictor and recommend an encoding from its
# absolute and relative cardinality. The report is written to Excel and read
# back (as cat_report_df) by the encoding step.
n_rows = len(df)
cat_report = []

for col in categorical_cols:
    # Label missing values explicitly. The replacement is driven by the
    # original column's null mask: calling fillna after astype(str) would be a
    # no-op, because astype(str) has already turned NaN into the string 'nan'.
    vals = X[col].astype(str).where(X[col].notna(), "<<NA>>")
    n_unique = vals.nunique()
    # Guard the empty-frame case instead of dividing by zero.
    rel_card = n_unique / n_rows if n_rows else 0.0

    # Few levels -> one-hot; moderate -> ordinal; everything else is treated
    # as high-cardinality.
    if n_unique <= 30 and rel_card < 0.001:
        method = 'onehot'
    elif n_unique <= 200 and rel_card < 0.01:
        method = 'ordinal'
    else:
        method = 'high_cardinality'

    cat_report.append({
        'variable': col,
        'n_unique': n_unique,
        'relative_cardinality': rel_card,
        'recommended': method
    })

cat_report_df = pd.DataFrame(cat_report)
cat_report_df.to_excel('db/02a_classical_models/model_data/categorical_cardinality_report.xlsx', 
                       index=False, engine='openpyxl')
print("Categorical analysis saved")

Categorical analysis saved


## 5.2. APPLYING CATEGORICAL TRANSFORMATIONS

In [11]:
# Encode each categorical column with the method recommended in
# cat_report_df, collecting the encoded frames (encoded_parts) and the fitted
# encoders (encoders_saved, persisted with joblib at the end of the cell).
#
# NOTE(review): every encoder below is fit on the full X (and, for
# TargetEncoder, the full y) available at this point. If a train/test split
# is made afterwards, information from test rows leaks into the features --
# confirm, and consider fitting on the training rows only.
encoded_parts = []
encoders_saved = {}


def _columns_recommended_as(method):
    """Return the variables whose recommended encoding in cat_report_df is `method`."""
    return cat_report_df[cat_report_df['recommended']==method]['variable'].tolist()


onehot_cols = _columns_recommended_as('onehot')
if onehot_cols:
    print(f"OneHot for: {onehot_cols}")
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe_arr = ohe.fit_transform(X[onehot_cols].astype(str))
    ohe_cols = ohe.get_feature_names_out(onehot_cols)
    df_ohe = pd.DataFrame(ohe_arr, columns=ohe_cols, index=X.index)
    encoded_parts.append(df_ohe)
    encoders_saved['onehot'] = ohe

ordinal_cols = _columns_recommended_as('ordinal')
if ordinal_cols:
    print(f"Ordinal for: {ordinal_cols}")
    # Categories unseen at fit time are mapped to -1 at transform time.
    ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    ord_arr = ord_enc.fit_transform(X[ordinal_cols].astype(str))
    df_ord = pd.DataFrame(ord_arr, columns=ordinal_cols, index=X.index)
    encoded_parts.append(df_ord)
    encoders_saved['ordinal'] = ord_enc

high_card_cols = _columns_recommended_as('high_cardinality')
if high_card_cols:
    print(f"High-cardinality for: {high_card_cols}")
    if HAS_TARGET_ENCODER:
        print("Using TargetEncoder")
        te = TargetEncoder(cols=high_card_cols)
        df_te = te.fit_transform(X[high_card_cols].astype(str), y)
        encoded_parts.append(df_te)
        encoders_saved['target'] = te
    else:
        print("Using frequency encoding")
        df_freq_list = []
        frequency_maps = {}
        for col in high_card_cols:
            freq = X[col].astype(str).value_counts(normalize=True)
            frequency_maps[col] = freq.to_dict()
            df_freq_list.append(X[col].astype(str).map(freq).rename(col + "_freq"))
        df_freq = pd.concat(df_freq_list, axis=1)
        encoded_parts.append(df_freq)
        encoders_saved['frequency'] = True
        # The boolean flag alone cannot re-apply this encoding to new data,
        # so the learned category -> relative-frequency maps are saved too.
        encoders_saved['frequency_maps'] = frequency_maps

# Any categorical column without a recommendation falls back to ordinal codes.
covered = set(onehot_cols + ordinal_cols + high_card_cols)
remaining = [c for c in categorical_cols if c not in covered]
if remaining:
    print(f"Remaining (fallback ordinal): {remaining}")
    ord_enc2 = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df_rem = pd.DataFrame(ord_enc2.fit_transform(X[remaining].astype(str)), 
                         columns=remaining, index=X.index)
    encoded_parts.append(df_rem)
    encoders_saved['ordinal_fallback'] = ord_enc2

X_categorical = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame(index=X.index)
print(f"Encoded categorical shape: {X_categorical.shape}")
joblib.dump(encoders_saved, 'db/02a_classical_models/saved_models/categorical_encoders.pkl')

OneHot for: ['SEXO', 'ETNIA', 'CICLO_VITAL', 'DISCAPACIDAD']
Ordinal for: ['ESTADO_DEPTO']
Encoded categorical shape: (5696308, 24)


['db/02a_classical_models/saved_models/categorical_encoders.pkl']

## 5.3. NUMERIC ANALYSIS & SCALING

In [12]:
def _select_scaler(outlier_ratio, minimum, maximum):
    """Return (scaler name, unfitted scaler) chosen from outlier share and value range."""
    if outlier_ratio > 0.05:
        return 'RobustScaler', RobustScaler()
    if (minimum >= 0) and (maximum <= 1e6):
        return 'MinMaxScaler', MinMaxScaler()
    return 'StandardScaler', StandardScaler()


# For every numeric column: describe it, decide on an optional log1p
# transform and a scaler, then overwrite the column in X with scaled values.
numeric_report = []
scalers = {}

for col in numeric_cols:
    observed = X[col].dropna().astype(float)
    n_obs = len(observed)
    n_unique = observed.nunique()
    mean = observed.mean()
    std = observed.std()
    minimum = observed.min()
    maximum = observed.max()
    skew = observed.skew()

    # Outliers are the values outside the 1.5*IQR fences.
    q1 = observed.quantile(0.25)
    q3 = observed.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5*iqr
    upper_fence = q3 + 1.5*iqr
    n_outliers = (observed.lt(lower_fence) | observed.gt(upper_fence)).sum()
    outlier_ratio = n_outliers / n_obs if n_obs>0 else 0.0

    # log1p is applied only to strongly skewed, non-negative columns.
    log_transform = (abs(skew) > 2) and (minimum >= 0)

    scaler_choice, scaler = _select_scaler(outlier_ratio, minimum, maximum)

    col_data = X[[col]].astype(float).copy()
    if log_transform:
        col_data = np.log1p(col_data.clip(lower=0))
        col_data = col_data.replace([np.inf, -np.inf], np.nan).fillna(0)

    scaled = scaler.fit_transform(col_data.fillna(0))
    X[col] = pd.Series(scaled[:, 0], index=X.index)

    # Statistics reported here describe the column BEFORE transform/scaling.
    numeric_report.append({
        'variable': col,
        'n_obs': int(n_obs),
        'n_unique': int(n_unique),
        'mean': float(mean),
        'std': float(std),
        'min': float(minimum),
        'max': float(maximum),
        'skew': float(skew),
        'outlier_ratio': float(outlier_ratio),
        'log_transform_applied': bool(log_transform),
        'scaler_chosen': scaler_choice
    })
    scalers[col] = scaler

numeric_report_df = pd.DataFrame(numeric_report)
print(numeric_report_df)
numeric_report_df.to_excel('db/02a_classical_models/model_data/numeric_analysis.xlsx', 
                           index=False, engine='openpyxl')
print("Numeric analysis saved")
joblib.dump(scalers, 'db/02a_classical_models/saved_models/numeric_scalers.pkl')

          variable    n_obs  n_unique         mean         std          min  \
0          EVENTOS  5696308      7289    31.632610  292.888315     0.000000   
1         VIGENCIA  5696308        41  2007.242717   10.540738  1985.000000   
2     km_norte_sur  5696308        32   101.850859  352.537701  -974.554705   
3    km_este_oeste  5696308        32   -78.119007  213.316777  -846.071239   
4  distancia_total  5696308        32   367.378485  226.414383     0.000000   

            max       skew  outlier_ratio  log_transform_applied  \
0  35298.000000  36.828932       0.173847                   True   
1   2025.000000  -0.182433       0.000000                  False   
2    883.324167   0.241279       0.007131                  False   
3    730.382024   1.171035       0.019222                  False   
4   1217.988358   0.372751       0.008723                  False   

    scaler_chosen  
0    RobustScaler  
1    MinMaxScaler  
2  StandardScaler  
3  StandardScaler  
4    MinMaxScaler  

['db/02a_classical_models/saved_models/numeric_scalers.pkl']

## 5.4. COMBINING FEATURES

In [13]:
# Place the encoded categorical block and the scaled numeric block side by
# side; both indexes are reset first so rows are paired by position.
X_final = pd.concat(
    [part.reset_index(drop=True) for part in (X_categorical, X[numeric_cols])],
    axis=1,
)
print(f"  • Final feature matrix: {X_final.shape}")

# Persist the final column order of the feature matrix.
feature_names = list(X_final.columns)
_feature_names_df = pd.DataFrame({'Feature_Name': feature_names})
_feature_names_df.to_excel(
    'db/02a_classical_models/model_data/feature_names.xlsx',
    index=False,
    engine='openpyxl',
)

  • Final feature matrix: (5696308, 29)


## 5.5. TRAIN/TEST SPLIT (70/30)

In [14]:
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)
print(f"Train: {len(X_train):,}")
print(f"Test: {len(X_test):,}")

# One summary row per split: size and per-class counts.
_split_rows = [
    {
        'Set': 'Train',
        'Size': len(X_train),
        'Class_0': (y_train==0).sum(),
        'Class_1': (y_train==1).sum(),
    },
    {
        'Set': 'Test',
        'Size': len(X_test),
        'Class_0': (y_test==0).sum(),
        'Class_1': (y_test==1).sum(),
    },
]
split_info = pd.DataFrame(_split_rows)
split_info.to_csv('db/02a_classical_models/model_data/train_test_split_info.csv', 
                  index=False)

print("Data preparation completed")

Train: 3,987,415
Test: 1,708,893
Data preparation completed


# 6. CLASS IMBALANCE HANDLING

In [15]:
# 'balanced' class weights computed from the training labels, exposed as a
# {label: weight} dict for classes 0 and 1.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

print(f"Class weights: \n")
for _label in (0, 1):
    print(f"Class {_label}: {class_weight_dict[_label]:.3f}")


Class weights: 

Class 0: 0.799
Class 1: 1.337


# 7. HELPER FUNCTIONS FOR COMPREHENSIVE METRICS

In [16]:
def calculate_comprehensive_metrics(model, model_name, arch_name, X_train, X_test, 
                                   y_train, y_test, training_time, feature_names):
    """Compute train/test classification metrics for a fitted binary classifier.

    Labels are assumed to be encoded as 0 (negative) and 1 (positive). The
    confusion matrices are built with ``labels=[0, 1]`` so they are always
    2x2, even when a class is absent from a split or from the predictions.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        model_name: Stored under the ``'Model'`` key.
        arch_name: Stored under the ``'Architecture'`` key.
        X_train: Training features passed to ``model.predict``.
        X_test: Test features passed to ``model.predict``.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.
        training_time: Stored unchanged under ``'Training_Time_Minutes'``.
        feature_names: Not used by this function; kept so existing callers
            keep working.

    Returns:
        tuple: ``(metrics, y_pred_test, y_prob_test)``. ``metrics`` is a flat
        dict of scalar metrics, ``y_pred_test`` holds the predicted test
        labels and ``y_prob_test`` the predicted probability of class 1 on
        the test set (``None`` when the model has no ``predict_proba``).
    """

    y_pred_train = model.predict(X_train)

    # Time the test prediction that is actually used for the metrics rather
    # than running an extra, discarded pass over X_test.
    inference_start = time.time()
    y_pred_test = model.predict(X_test)
    inference_time = (time.time() - inference_start) / len(X_test) * 1000

    # Only test-set probabilities feed a metric below, so the train-set
    # probabilities are not computed.
    if hasattr(model, 'predict_proba'):
        y_prob_test = model.predict_proba(X_test)[:, 1]
    else:
        y_prob_test = None

    # Fixing the label order guarantees a 2x2 matrix for the unpack below.
    cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
    cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
    tn, fp, fn, tp = cm_test.ravel()

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_precision = precision_score(y_train, y_pred_train, zero_division=0)
    test_precision = precision_score(y_test, y_pred_test, zero_division=0)
    train_recall = recall_score(y_train, y_pred_train, zero_division=0)
    test_recall = recall_score(y_test, y_pred_test, zero_division=0)
    train_f1 = f1_score(y_train, y_pred_train, zero_division=0)
    test_f1 = f1_score(y_test, y_pred_test, zero_division=0)

    # Specificity = TN / (TN + FP); defined as 0 when there are no negatives.
    train_negatives = cm_train[0, 0] + cm_train[0, 1]
    train_specificity = cm_train[0, 0] / train_negatives if train_negatives > 0 else 0
    test_specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # G-mean: geometric mean of recall (sensitivity) and specificity.
    train_gmean = np.sqrt(train_recall * train_specificity)
    test_gmean = np.sqrt(test_recall * test_specificity)

    train_mcc = matthews_corrcoef(y_train, y_pred_train)
    test_mcc = matthews_corrcoef(y_test, y_pred_test)

    train_balanced_acc = balanced_accuracy_score(y_train, y_pred_train)
    test_balanced_acc = balanced_accuracy_score(y_test, y_pred_test)

    train_kappa = cohen_kappa_score(y_train, y_pred_train)
    test_kappa = cohen_kappa_score(y_test, y_pred_test)

    # Probability-based metrics exist only for models with predict_proba.
    if y_prob_test is not None:
        test_roc_auc = roc_auc_score(y_test, y_prob_test)
        test_log_loss = log_loss(y_test, y_prob_test)
        test_brier_score = brier_score_loss(y_test, y_prob_test)
    else:
        test_roc_auc = None
        test_log_loss = None
        test_brier_score = None

    # Model size is measured as the length of its pickle, in MiB.
    model_size_mb = len(pickle.dumps(model)) / (1024 * 1024)
    n_features = X_train.shape[1]

    # Train-minus-test gaps; larger positive values mean a bigger drop on test.
    accuracy_gap = train_accuracy - test_accuracy
    f1_gap = train_f1 - test_f1
    precision_gap = train_precision - test_precision
    recall_gap = train_recall - test_recall

    n_test = len(y_test)
    metrics = {
        'Model': model_name,
        'Architecture': arch_name,

        'Train_Accuracy': train_accuracy,
        'Train_Precision': train_precision,
        'Train_Recall': train_recall,
        'Train_F1': train_f1,
        'Train_Specificity': train_specificity,
        'Train_G_Mean': train_gmean,
        'Train_Balanced_Accuracy': train_balanced_acc,
        'Train_MCC': train_mcc,
        'Train_Kappa': train_kappa,

        'Test_Accuracy': test_accuracy,
        'Test_Precision': test_precision,
        'Test_Recall': test_recall,
        'Test_F1': test_f1,
        'Test_Specificity': test_specificity,
        'Test_G_Mean': test_gmean,
        'Test_Balanced_Accuracy': test_balanced_acc,
        'Test_MCC': test_mcc,
        'Test_Kappa': test_kappa,

        'Test_ROC_AUC': test_roc_auc,
        'Test_Log_Loss': test_log_loss,
        'Test_Brier_Score': test_brier_score,

        'Test_True_Negatives': int(tn),
        'Test_False_Positives': int(fp),
        'Test_False_Negatives': int(fn),
        'Test_True_Positives': int(tp),
        'Test_TN_Percentage': (tn / n_test) * 100,
        'Test_FP_Percentage': (fp / n_test) * 100,
        'Test_FN_Percentage': (fn / n_test) * 100,
        'Test_TP_Percentage': (tp / n_test) * 100,

        'Accuracy_Gap_Train_Test': accuracy_gap,
        'F1_Gap_Train_Test': f1_gap,
        'Precision_Gap_Train_Test': precision_gap,
        'Recall_Gap_Train_Test': recall_gap,

        'Training_Time_Minutes': training_time,
        'Inference_Time_ms_per_sample': inference_time,
        'Model_Size_MB': model_size_mb,
        'Number_of_Features': n_features,

        'Total_Train_Samples': len(y_train),
        'Total_Test_Samples': n_test
    }

    return metrics, y_pred_test, y_prob_test

# 8. DEFINING MODELS AND ARCHITECTURES

Because the machine used for training cannot train every model in a single run, a comment/uncomment strategy is used: exactly one model block (with its three architectures) is left uncommented in `models_config`, and all other model blocks stay commented out. The cell raises an error if zero or more than one model is active. All results for a trained model are saved to disk, so they are not lost when a different model is trained later.

In [None]:
# Parameters shared by every XGBoost architecture. They are merged AFTER the
# tuned parameters so the resulting key order matches a hand-written dict.
_xgb_shared_params = {
    'random_state': 42,
    'eval_metric': 'logloss',
    'scale_pos_weight': class_weight_dict[1]/class_weight_dict[0],
    **XGBOOST_GPU_PARAMS
}


def _xgb_architecture(number, **tuned_params):
    """Build one XGBoost architecture entry: tuned params first, shared params last."""
    return {
        'name': f'XGBoost_Architecture_{number}',
        'params': {**tuned_params, **_xgb_shared_params}
    }


# Exactly one model block may be active (uncommented) at a time; the check
# after the dict enforces this.
models_config = {
    
    # ============================================
    # MODEL 1: LOGISTIC REGRESSION
    # ============================================
    # 'Logistic_Regression': {
    #     'architectures': [
    #         {
    #             'name': 'Logistic_Regression_Architecture_1',
    #             'params': {
    #                 'C': 0.1,
    #                 'penalty': 'l2',
    #                 'solver': 'lbfgs',
    #                 'max_iter': 1000,
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         },
    #         {
    #             'name': 'Logistic_Regression_Architecture_2',
    #             'params': {
    #                 'C': 1.0,
    #                 'penalty': 'l2',
    #                 'solver': 'saga',
    #                 'max_iter': 1000,
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         },
    #         {
    #             'name': 'Logistic_Regression_Architecture_3',
    #             'params': {
    #                 'C': 10.0,
    #                 'penalty': 'l2',
    #                 'solver': 'saga',
    #                 'max_iter': 1000,
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         }
        
    #     ],
    #     'model_class': LogisticRegression
    # },
    
    # ============================================
    # MODEL 2: RANDOM FOREST
    # ============================================
    # 'Random_Forest': {
    #     'architectures': [
    #         {
    #             'name': 'Random_Forest_Architecture_1',
    #             'params': {
    #                 'n_estimators': 100,  
    #                 'max_depth': 15,     
    #                 'min_samples_split': 5,
    #                 'min_samples_leaf': 2,
    #                 'max_features': 'sqrt',
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         },
    #         {
    #             'name': 'Random_Forest_Architecture_2',  
    #             'params': {
    #                 'n_estimators': 150,  
    #                 'max_depth': 20,      
    #                 'min_samples_split': 2,
    #                 'min_samples_leaf': 1,
    #                 'max_features': 'sqrt',  
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         },
    #         {
    #             'name': 'Random_Forest_Architecture_3',
    #             'params': {
    #                 'n_estimators': 220,       
    #                 'max_depth': 25,            
    #                 'min_samples_split': 2,
    #                 'min_samples_leaf': 1,
    #                 'max_features': 'sqrt',
    #                 'random_state': 42,
    #                 'class_weight': 'balanced',
    #                 'n_jobs': -1
    #             }
    #         },
    #     ],
    #     'model_class': RandomForestClassifier
    # },
    
    # ============================================
    # MODEL 3: XGBOOST
    # ============================================
    'XGBoost': {
        'architectures': [
            _xgb_architecture(
                1,
                n_estimators=1800,
                max_depth=14,
                learning_rate=0.015,
                subsample=0.65,
                colsample_bytree=0.55,
                colsample_bylevel=0.7,
                colsample_bynode=0.8,
                gamma=0.5,
                min_child_weight=10,
                max_delta_step=1,
                reg_alpha=0.7,
                reg_lambda=2.5,
            ),
            _xgb_architecture(
                2,
                n_estimators=2500,
                max_depth=13,
                learning_rate=0.01,
                subsample=0.6,
                colsample_bytree=0.5,
                colsample_bylevel=0.65,
                colsample_bynode=0.75,
                gamma=0.6,
                min_child_weight=12,
                max_delta_step=2,
                reg_alpha=1.0,
                reg_lambda=3.0,
            ),
            _xgb_architecture(
                3,
                n_estimators=3000,
                max_depth=15,
                learning_rate=0.008,
                subsample=0.6,
                colsample_bytree=0.5,
                colsample_bylevel=0.6,
                colsample_bynode=0.7,
                gamma=0.7,
                min_child_weight=15,
                max_delta_step=3,
                reg_alpha=1.5,
                reg_lambda=4.0,
            ),
        ],
        'model_class': xgb.XGBClassifier
    },
    
    
}

# Enforce the one-active-model rule before any training starts.
active_models = list(models_config.keys())
_n_active = len(active_models)
if _n_active == 0:
    raise ValueError("ERROR: No active models")
if _n_active > 1:
    raise ValueError(f"ERROR: There are {len(active_models)} active models: {active_models}. Only ONE at a time.")

print(f"Active model: {active_models[0]}")

Active model: XGBoost_0


# 9. TRAIN ARCHITECTURES WITH ACCUMULATION

In [18]:
# For each active model: train every architecture, save its metrics and
# diagnostic tables, rank the architectures by test F1, then save the best
# architecture's model, predictions and diagnostics.
for model_name, config in models_config.items():
    print(f"\n{'='*80}")
    print(f"MODEL: {model_name}")
    print(f"{'='*80}")
    
    architectures = config['architectures']
    n_architectures = len(architectures)
    # Parallel lists: position i in both refers to the i-th architecture.
    architecture_results = []
    trained_models = []
    
    for arch_idx, architecture in enumerate(architectures, 1):
        arch_name = architecture['name']
        arch_params = architecture['params']
        
        print(f"\n{'-'*80}")
        # The total comes from the config instead of a hard-coded 3.
        print(f"[{arch_idx}/{n_architectures}] Training: {arch_name}")
        print(f"{'-'*80}")
        
        start_time = datetime.now()
        model = config['model_class'](**arch_params)
        
        print(f"Training...")
        model.fit(X_train, y_train)
        
        # Wall-clock training time in minutes (includes model construction).
        training_time = (datetime.now() - start_time).total_seconds() / 60
        
        print(f"Calculating metrics...")
        metrics, y_pred_test, y_prob_test = calculate_comprehensive_metrics(
            model, model_name, arch_name, X_train, X_test, 
            y_train, y_test, training_time, feature_names
        )
        
        print(f"\nRESULTS:")
        print(f"Training time: {metrics['Training_Time_Minutes']:.2f} min \n")

        print(f"Train Precision: {metrics['Train_Precision']:.4f}")
        print(f"Train Recall: {metrics['Train_Recall']:.4f}")
        print(f"Train Accuracy: {metrics['Train_Accuracy']:.4f}")
        print(f"Train F1: {metrics['Train_F1']:.4f}")
        print(f"Train MCC: {metrics['Train_MCC']:.4f}\n")

        print(f"Test Precision: {metrics['Test_Precision']:.4f}")
        print(f"Test Recall: {metrics['Test_Recall']:.4f}")
        print(f"Test Accuracy: {metrics['Test_Accuracy']:.4f}")
        print(f"Test F1: {metrics['Test_F1']:.4f}")
        print(f"Test MCC: {metrics['Test_MCC']:.4f}")
        # Test_ROC_AUC is None for models without predict_proba; formatting
        # None with ':.4f' would raise a TypeError.
        test_roc_auc = metrics['Test_ROC_AUC']
        test_roc_auc_text = f"{test_roc_auc:.4f}" if test_roc_auc is not None else "N/A"
        print(f"Test ROC AUC: {test_roc_auc_text}\n")
        
        architecture_results.append(metrics)
        trained_models.append({
            'architecture': arch_name,
            'model': model,
            'metrics': metrics,
            'y_pred_test': y_pred_test,
            'y_prob_test': y_prob_test
        })
        
        # Save individual architecture
        metrics_df = pd.DataFrame([metrics])
        metrics_df.to_excel(f'db/02a_classical_models/metrics/by_architecture/{model_name}_{arch_name}_comprehensive_metrics.xlsx', 
                           index=False, engine='openpyxl')
        
        cm = confusion_matrix(y_test, y_pred_test)
        cm_df = pd.DataFrame(cm, index=['True_Others', 'True_Displacement'],
                             columns=['Pred_Others', 'Pred_Displacement'])
        cm_df.to_excel(f'db/02a_classical_models/model_data/confusion_matrices/by_architecture/{model_name}_{arch_name}_confusion_matrix.xlsx', 
                       engine='openpyxl')
        
        if y_prob_test is not None:
            fpr, tpr, thresholds = roc_curve(y_test, y_prob_test)
            roc_auc = auc(fpr, tpr)
            roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds, 'AUC': roc_auc})
            roc_df.to_excel(f'db/02a_classical_models/model_data/roc_data/by_architecture/{model_name}_{arch_name}_roc_curve.xlsx', 
                           index=False, engine='openpyxl')
        
        report = classification_report(y_test, y_pred_test, target_names=['Others', 'Displacement'], output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_excel(f'db/02a_classical_models/metrics/by_architecture/{model_name}_{arch_name}_classification_report.xlsx', 
                          engine='openpyxl')
        
        params_df = pd.DataFrame([arch_params])
        params_df.to_excel(f'db/02a_classical_models/model_data/hyperparameters/{model_name}_{arch_name}_hyperparameters.xlsx', 
                          index=False, engine='openpyxl')
        
        print(f"Architecture data saved")
    
    # Compare architectures (done once; this section was previously
    # duplicated, which wrote and printed the comparison twice).
    print(f"\n{'='*80}")
    print(f"COMPARING ARCHITECTURES")
    print(f"{'='*80}")

    comparison_df = pd.DataFrame(architecture_results).sort_values('Test_F1', ascending=False)
    comparison_df.to_excel(f'db/02a_classical_models/model_data/architecture_comparisons/{model_name}_architectures_comparison.xlsx', 
                          index=False, engine='openpyxl')
    
    print(f"Architectures ranked:")
    # Number rows by their rank after sorting. The DataFrame index label is
    # the architecture's original position, not its rank.
    for rank, (_, row) in enumerate(comparison_df.iterrows(), 1):
        print(f"  {rank}. {row['Architecture']}: F1={row['Test_F1']:.4f}")
    
    best_arch_name = comparison_df.iloc[0]['Architecture']
    print(f"\nBEST: {best_arch_name}")
    
    # The index label of the top row is the position of the best architecture
    # in trained_models, because both were built in the same order.
    best_model_data = trained_models[comparison_df.index[0]]
    
    # Save best model
    print(f"\n{'='*80}")
    print(f"SAVING BEST MODEL")
    print(f"{'='*80}")
    
    joblib.dump(best_model_data['model'], f'db/02a_classical_models/saved_models/{model_name}_best_model.pkl')
    
    predictions_df = pd.DataFrame({
        'Sample_Index': range(len(y_test)),
        'True_Label': y_test,
        'Predicted_Label': best_model_data['y_pred_test'],
        'Correct': y_test == best_model_data['y_pred_test']
    })
    if best_model_data['y_prob_test'] is not None:
        predictions_df['Probability_Class_1'] = best_model_data['y_prob_test']
    predictions_df.to_csv(f'db/02a_classical_models/predictions/{model_name}_best_predictions.csv', index=False)
    
    cm = confusion_matrix(y_test, best_model_data['y_pred_test'])
    cm_df = pd.DataFrame(cm, index=['True_Others', 'True_Displacement'], 
                         columns=['Pred_Others', 'Pred_Displacement'])
    cm_df.to_excel(f'db/02a_classical_models/model_data/confusion_matrices/{model_name}_best_confusion_matrix.xlsx', 
                   engine='openpyxl')
    
    if best_model_data['y_prob_test'] is not None:
        fpr, tpr, thresholds = roc_curve(y_test, best_model_data['y_prob_test'])
        roc_auc = auc(fpr, tpr)
        roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds, 'AUC': roc_auc})
        roc_df.to_excel(f'db/02a_classical_models/model_data/roc_data/{model_name}_best_roc_curve.xlsx', 
                       index=False, engine='openpyxl')
    
    # Tree ensembles expose feature_importances_; linear models expose coef_.
    if hasattr(best_model_data['model'], 'feature_importances_'):
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': best_model_data['model'].feature_importances_
        }).sort_values('Importance', ascending=False)
        importance_df.to_excel(f'db/02a_classical_models/model_data/feature_importance/{model_name}_best_feature_importance.xlsx', 
                              index=False, engine='openpyxl')
    elif hasattr(best_model_data['model'], 'coef_'):
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': best_model_data['model'].coef_[0]
        })
        importance_df['Abs_Coefficient'] = importance_df['Coefficient'].abs()
        importance_df = importance_df.sort_values('Abs_Coefficient', ascending=False)
        importance_df.to_excel(f'db/02a_classical_models/model_data/feature_importance/{model_name}_best_coefficients.xlsx', 
                              index=False, engine='openpyxl')
    
    pd.DataFrame([best_model_data['metrics']]).to_excel(
        f'db/02a_classical_models/metrics/{model_name}_best_comprehensive_metrics.xlsx', 
        index=False, engine='openpyxl')
    
    report = classification_report(y_test, best_model_data['y_pred_test'], 
                                   target_names=['Others', 'Displacement'], output_dict=True)
    pd.DataFrame(report).transpose().to_excel(
        f'db/02a_classical_models/metrics/{model_name}_best_classification_report.xlsx', 
        engine='openpyxl')
    
    best_params = None
    for arch in config['architectures']:
        if arch['name'] == best_arch_name:
            best_params = arch['params']
            break
    if best_params:
        pd.DataFrame([best_params]).to_excel(
            f'db/02a_classical_models/model_data/hyperparameters/{model_name}_best_hyperparameters.xlsx', 
            index=False, engine='openpyxl')
    
    print(f"Best model saved")
    
    # ACCUMULATION
    print(f"\n{'='*80}")
    print(f"UPDATING ACCUMULATIVE SUMMARIES")
    print(f"{'='*80}")
    
    # All architectures
    all_arch_path = 'db/02a_classical_models/metrics/all_architectures_tested_comprehensive.xlsx'
    new_arch_results = pd.DataFrame(architecture_results)
    
    if os.path.exists(all_arch_path):
        print(f"Loading previous architectures...")
        previous_arch = pd.read_excel(all_arch_path)
        previous_arch = previous_arch[previous_arch['Model'] != model_name]
        all_arch_combined = pd.concat([previous_arch, new_arch_results], ignore_index=True)
        print(f"    • Previous: {len(previous_arch)} | New: {len(new_arch_results)} | Total: {len(all_arch_combined)}")
    else:
        print(f"Creating new file...")
        all_arch_combined = new_arch_results
        print(f"Total: {len(all_arch_combined)}")
    
    all_arch_combined.to_excel(all_arch_path, index=False, engine='openpyxl')
    print(f"  ✓ Saved: {all_arch_path}")
    
    # Best models
    best_models_path = 'db/02a_classical_models/comparative_tables/best_models_comparison_complete.xlsx'
    new_best_model = pd.DataFrame([best_model_data['metrics']])
    
    if os.path.exists(best_models_path):
        print(f"  Loading previous best models...")
        previous_best = pd.read_excel(best_models_path)
        previous_best = previous_best[previous_best['Model'] != model_name]
        best_combined = pd.concat([previous_best, new_best_model], ignore_index=True)
        best_combined = best_combined.sort_values('Test_F1', ascending=False).reset_index(drop=True)
        print(f"    • Previous: {len(previous_best)} | New: 1 | Total: {len(best_combined)}")
    else:
        print(f"Creating new file...")
        best_combined = new_best_model
        print(f"Total: {len(best_combined)}")
    
    best_combined.to_excel(best_models_path, index=False, engine='openpyxl')
    print(f"Saved: {best_models_path}")
    
    # Publication-ready
    pub_cols = [
        'Model', 'Architecture', 'Test_Accuracy', 'Test_Precision', 'Test_Recall', 'Test_F1',
        'Test_Specificity', 'Test_G_Mean', 'Test_MCC', 'Test_Balanced_Accuracy',
        'Test_ROC_AUC', 'Test_Kappa', 'Test_Log_Loss',
        'Training_Time_Minutes', 'Inference_Time_ms_per_sample',
        'Model_Size_MB', 'F1_Gap_Train_Test', 'Number_of_Features'
    ]
    pub_table = best_combined[pub_cols].copy()
    numeric_cols = pub_table.select_dtypes(include=[np.number]).columns
    pub_table[numeric_cols] = pub_table[numeric_cols].round(4)
    pub_table.to_excel('db/02a_classical_models/comparative_tables/best_models_comparison_publication_ready.xlsx', 
                       index=False, engine='openpyxl')
    
    print(f"\nAccumulation completed")
    print(f"Model added: {model_name}")
    print(f"Total models: {len(best_combined)}")


MODEL: XGBoost_0

--------------------------------------------------------------------------------
[1/3] Training: XGBoost_Architecture_1
--------------------------------------------------------------------------------
Training...
Calculating metrics...

RESULTS:
Training time: 0.29 min 

Train Precision: 0.6386
Train Recall: 0.7581
Train Accuracy: 0.7491
Train F1: 0.6932
Train MCC: 0.4886

Test Precision: 0.6385
Test Recall: 0.7579
Test Accuracy: 0.7490
Test F1: 0.6931
Test MCC: 0.4884
Test ROC AUC: 0.8352

Architecture data saved

--------------------------------------------------------------------------------
[2/3] Training: XGBoost_Architecture_2
--------------------------------------------------------------------------------
Training...
Calculating metrics...

RESULTS:
Training time: 0.71 min 

Train Precision: 0.7091
Train Recall: 0.8188
Train Accuracy: 0.8066
Train F1: 0.7600
Train MCC: 0.6038

Test Precision: 0.7089
Test Recall: 0.8182
Test Accuracy: 0.8064
Test F1: 0.7596
Tes

# 10. TRAINING COMPLETED

In [19]:
# Report the current state of the accumulated best-model table: how many models
# it holds, the three highest test-F1 entries, and every model name in it.
best_models_path = 'db/02a_classical_models/comparative_tables/best_models_comparison_complete.xlsx'
if os.path.exists(best_models_path):
    # Sort on load rather than trusting the order the workbook was saved in.
    current_best = pd.read_excel(best_models_path).sort_values('Test_F1', ascending=False)

    print("\nCURRENT STATE:")
    print(f"Total models: {len(current_best)}")
    print("\nTop 3:")
    # sort_values keeps the original row labels, so the label yielded by
    # iterrows() is NOT the rank. Number the rows by position instead so the
    # listing is always 1, 2, 3 regardless of how the file was stored.
    for rank, (_, row) in enumerate(current_best.head(3).iterrows(), start=1):
        print(f"    {rank}. {row['Model']}: F1={row['Test_F1']:.4f}")

    print("\nAll trained:")
    for model in current_best['Model']:
        print(model)
else:
    print("\nNo models found")

# Manual workflow reminder: models are trained one at a time by toggling comments.
print("NEXT: Comment current model, uncomment next, run again")


CURRENT STATE:
Total models: 4

Top 3:
    1. Random_Forest: F1=0.8943
    2. XGBoost: F1=0.8274
    3. XGBoost_0: F1=0.7973

All trained:
Random_Forest
XGBoost
XGBoost_0
Logistic_Regression
NEXT: Comment current model, uncomment next, run again
