In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, f1_score, confusion_matrix, classification_report,  roc_auc_score
import os
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from scipy import stats
import seaborn as sns
from itertools import product
import time

from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from functions import *

## Loading and preparing data

In [2]:
base_path = '/Users/leilapaolini/Documents/data'
full_path = os.path.join(base_path, 'data_y.npy')
data = np.load(full_path, allow_pickle=True)

print(f"Data type: {type(data)}")
print(f"Data shape: {data.shape if hasattr(data, 'shape') else 'N/A'}")

Data type: <class 'numpy.ndarray'>
Data shape: (9618522, 35)


In [3]:
df = pd.DataFrame(data, columns=['commune_1', 'commune_2', 'distance', 'year', 'pop_1', 'pop_2',
       'T_Mann', 'T_Frau', 'Etr_Total', 'Accidents dégâts matériels',
       'Accidents avec dommages corporels', 'Morts', 'GEM_FLAECH',
       'EINWOHNERZ', '0-25', '25-65', '65+', 'canton_code', 'unemployment',
       'gdp', 'T_Mann_2', 'T_Frau_2', 'Etr_Total_2',
       'Accidents dégâts matériels_2', 'Accidents avec dommages corporels_2',
       'Morts_2', 'GEM_FLAECH_2', 'EINWOHNERZ_2', '0-25_2', '25-65_2', '65+_2',
       'canton_code_2', 'unemployment_2', 'gdp_2', 'flow'])

df.head()



Unnamed: 0,commune_1,commune_2,distance,year,pop_1,pop_2,T_Mann,T_Frau,Etr_Total,Accidents dégâts matériels,...,Morts_2,GEM_FLAECH_2,EINWOHNERZ_2,0-25_2,25-65_2,65+_2,canton_code_2,unemployment_2,gdp_2,flow
0,1.0,2.0,3062.019741,2018.0,1982.0,12229.0,982.0,1000.0,135.0,3.0,...,0.0,1059.0,11900.0,3062.0,6981.0,2472.0,1.0,21793.25,156883.07738,122.0
1,1.0,2.0,3062.019741,2020.0,1982.0,12229.0,994.0,1020.0,143.0,7.0,...,0.0,1059.0,12229.0,2951.0,7052.0,2551.0,1.0,26155.333333,149208.52863,79.0
2,1.0,3.0,4916.726674,2018.0,1982.0,5548.0,982.0,1000.0,135.0,3.0,...,0.0,743.0,5435.0,1532.0,3151.0,997.0,1.0,21793.25,156883.07738,10.0
3,1.0,3.0,4916.726674,2020.0,1982.0,5548.0,994.0,1020.0,143.0,7.0,...,1.0,743.0,5548.0,1551.0,3107.0,1050.0,1.0,26155.333333,149208.52863,29.0
4,1.0,4.0,4951.312604,2018.0,1982.0,3701.0,982.0,1000.0,135.0,3.0,...,1.0,1360.0,3571.0,971.0,2072.0,739.0,1.0,21793.25,156883.07738,17.0


In [4]:
not_log_transform = ['commune_1', 'commune_2', 'Morts','Morts_2', 'year', 'canton_code', 'unemployment','gdp','canton_code_2', 'unemployment_2', 'gdp_2', 'flow']
to_log = [col for col in df.columns if col not in not_log_transform]

df = features_change(df, to_log, "log")


In [5]:
df = features_engineering(df)

Column for multiplied population added
Column for population ratio added
Column for difference between gdp of the communes added
Column for distance between the communes squared added
Column for gravitation added


In [6]:
drop_features = ['commune_1', 'commune_2', 'flow']
X_train_clf, y_train_clf, X_val_clf, y_val_clf, X_test_clf, y_test_clf = build_train_test_val(df, 
                                                                      test_canton_ids=[3], 
                                                                      val_canton_ids=[19], 
                                                                      zero_drop_ratio=0, 
                                                                      random_state=37, 
                                                                      features=drop_features,
                                                                      classify=True)

Splitting with canton [3] as test set:
Total flows: 9,618,522
Train size: 8,912,666 rows (92.7%)
Test size: 705,856 rows (7.3%)
Splitting with canton [19] as test set:
Total flows: 8,912,666
Train size: 7,227,626 rows (81.1%)
Test size: 1,685,040 rows (18.9%)
Dropping 0% of zeros in training data
Flow reclassified as 1: flow present, 0: no flow
Flow reclassified as 1: flow present, 0: no flow
Flow reclassified as 1: flow present, 0: no flow


In [7]:
X_train_reg, y_train_reg, X_val_reg, y_val_reg, X_test_reg, y_test_reg = build_train_test_val(df, 
                                                                      test_canton_ids=[3], 
                                                                      val_canton_ids=[19], 
                                                                      zero_drop_ratio=0, 
                                                                      random_state=37, 
                                                                      features=drop_features,
                                                                      classify=True)

Splitting with canton [3] as test set:
Total flows: 9,618,522
Train size: 8,912,666 rows (92.7%)
Test size: 705,856 rows (7.3%)
Splitting with canton [19] as test set:
Total flows: 8,912,666
Train size: 7,227,626 rows (81.1%)
Test size: 1,685,040 rows (18.9%)
Dropping 0% of zeros in training data
Flow reclassified as 1: flow present, 0: no flow
Flow reclassified as 1: flow present, 0: no flow
Flow reclassified as 1: flow present, 0: no flow


In [8]:
X_train_clf, X_test_clf, X_val_clf = binary_exp_cantons(X_train_clf, X_test_clf, X_val_clf)
X_train_clf.head()

Unnamed: 0,distance,pop_1,pop_2,T_Mann,T_Frau,Etr_Total,Accidents dégâts matériels,Accidents avec dommages corporels,Morts,GEM_FLAECH,...,canton_2_16.0,canton_2_17.0,canton_2_18.0,canton_2_20.0,canton_2_21.0,canton_2_22.0,canton_2_23.0,canton_2_24.0,canton_2_25.0,canton_2_26.0
0,137.799059,8.319474,6.728629,7.601402,7.650645,5.950643,2.639058,2.079443,0.0,6.381816,...,False,False,False,False,False,False,False,False,False,False
1,121.479057,9.008469,6.542472,8.341649,8.314342,7.058758,4.804021,3.496508,0.0,7.123673,...,False,False,False,True,False,False,False,False,False,False
2,105.489526,7.271009,6.917706,6.57368,6.598509,4.820282,1e-05,-11.512925,0.0,5.587249,...,False,False,False,False,False,False,False,False,False,True
3,108.055025,5.587249,6.359574,5.023881,5.010635,2.890372,0.693152,-11.512925,0.0,5.937536,...,False,False,False,False,False,False,False,False,False,False
4,116.318767,5.384495,6.196444,4.634729,4.75359,2.639058,-11.512925,1e-05,0.0,6.666957,...,False,False,False,False,False,False,False,False,False,False


In [9]:
X_train_reg, X_test_reg, X_val_reg = binary_exp_cantons(X_train_reg, X_test_reg, X_val_reg)
X_train_reg.head()

Unnamed: 0,distance,pop_1,pop_2,T_Mann,T_Frau,Etr_Total,Accidents dégâts matériels,Accidents avec dommages corporels,Morts,GEM_FLAECH,...,canton_2_16.0,canton_2_17.0,canton_2_18.0,canton_2_20.0,canton_2_21.0,canton_2_22.0,canton_2_23.0,canton_2_24.0,canton_2_25.0,canton_2_26.0
0,137.799059,8.319474,6.728629,7.601402,7.650645,5.950643,2.639058,2.079443,0.0,6.381816,...,False,False,False,False,False,False,False,False,False,False
1,121.479057,9.008469,6.542472,8.341649,8.314342,7.058758,4.804021,3.496508,0.0,7.123673,...,False,False,False,True,False,False,False,False,False,False
2,105.489526,7.271009,6.917706,6.57368,6.598509,4.820282,1e-05,-11.512925,0.0,5.587249,...,False,False,False,False,False,False,False,False,False,True
3,108.055025,5.587249,6.359574,5.023881,5.010635,2.890372,0.693152,-11.512925,0.0,5.937536,...,False,False,False,False,False,False,False,False,False,False
4,116.318767,5.384495,6.196444,4.634729,4.75359,2.639058,-11.512925,1e-05,0.0,6.666957,...,False,False,False,False,False,False,False,False,False,False


In [10]:
print(X_train_clf.shape)
print(X_test_clf.shape)
print(X_val_clf.shape)

(7227626, 83)
(705856, 87)
(1685040, 85)


In [11]:
X_val_clf  = X_val_clf.reindex(columns=X_train_clf.columns, fill_value=0)
X_test_clf = X_test_clf.reindex(columns=X_train_clf.columns, fill_value=0)

print(X_train_clf.shape, X_val_clf.shape, X_test_clf.shape)

assert X_train_clf.shape[1] == X_val_clf.shape[1] == X_test_clf.shape[1]
assert X_train_clf.columns.equals(X_val_clf.columns)
assert X_train_clf.columns.equals(X_test_clf.columns)

(7227626, 83) (1685040, 83) (705856, 83)


## Running models 

In [12]:
model_clf = xgb.XGBClassifier()
model_clf.load_model("xgboost_best_clf_model.json")

In [13]:
model_reg = xgb.XGBRegressor()
model_reg.load_model("xgboost_best_reg_model.json")

In [14]:
params = model_reg.get_params()
print(params)

{'objective': 'reg:squarederror', 'base_score': [2.8660066], 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': ['float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'int', 'int', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i'], 'feature_weights': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': 

In [19]:
# as we had saved right after the grid search before the final retrain we do a final retrain for both models 
X_full_clf = pd.concat([X_train_clf, X_val_clf])
y_full_clf = pd.concat([y_train_clf, y_val_clf])


In [20]:
model_clf.fit(X_full_clf, y_full_clf)

0,1,2
,objective,'binary:logistic'
,base_score,[0.5]
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# best threshold found before
best_threshold = 0.9500

y_pred_clf = model_clf.predict(X_test_clf)
y_proba_clf = model_clf.predict_proba(X_test_clf)
y_proba_1_clf = y_proba[:, 1]
y_pred_custom_clf = (y_proba_1 >= best_threshold).astype(int)


NameError: name 'y_proba' is not defined

In [None]:
y_pred_log_reg = model_reg.predict(X_test)
y_pred_reg = np.expm1(y_pred_log_reg)

## Results 

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

acc = accuracy_score(y_test_clf, y_pred_custom)
prec = precision_score(y_test_clf, y_pred_custom, zero_division=0)
rec = recall_score(y_test_clf, y_pred_custom)
f1 = f1_score(y_test_clf, y_pred_custom)
auc = roc_auc_score(y_test_clf, y_proba_1)

print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1-score : {f1:.3f}")
print(f"ROC AUC  : {auc:.3f}")


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# indices where classifier predicts positive
idx_pred_1 = np.where(y_pred_custom == 1)[0]

# subset
X_test_pos = X_test.iloc[idx_pred_1]
y_test_reg_pos = y_test_reg.iloc[idx_pred_1]

# regression prediction
y_pred_log_pos = model_reg.predict(X_test_pos)
y_pred_pos = np.expm1(y_pred_log_pos)

# regression metrics
mae = mean_absolute_error(y_test_reg_pos, y_pred_pos)
rmse = mean_squared_error(y_test_reg_pos, y_pred_pos, squared=False)
r2 = r2_score(y_test_reg_pos, y_pred_pos)

print(f"MAE  (pos only): {mae:.3f}")
print(f"RMSE (pos only): {rmse:.3f}")
print(f"R²   (pos only): {r2:.3f}")
