In [2]:
# !pip install scikit-optimize

# Mount Google Drive (for Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# imports for model selection and grid search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import hamming_loss, accuracy_score, f1_score, classification_report

In [4]:
# Update file path to match Google Drive directory structure
#file_path = '/content/drive/MyDrive/learning and projects/DS/project/data_to_ML_model.csv'
#df = pd.read_csv(file_path, delimiter=",", low_memory=False)

In [5]:
# Read the CSV file.
# The location can be overridden with the DATA_TO_ML_MODEL_CSV environment
# variable so the notebook can run on another machine without editing this cell.
# When the variable is not set, the original local Google Drive path is used.
default_file_path = r'C:\Users\ghiat\My Drive\learning and projects\DS\project\data_to_ML_model.csv'
file_path = os.environ.get("DATA_TO_ML_MODEL_CSV", default_file_path)
df = pd.read_csv(file_path, delimiter=",", low_memory=False)

In [6]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Returnees reported in the community,Three most common IDP shelter types - Solid finished apartment,Three most common IDP shelter types - Unfinished or abandoned residential building,Three most common IDP shelter types - Damaged residential building,Three most common IDP shelter types - Non residential structure with one family,Three most common IDP shelter types - Collective center with more than one family,Three most common IDP shelter types - Tent,Proportion of IDPs living in overcrowded shelters,IDP shelter inadequacies - Lack of lighting inside shelter,IDP shelter inadequacies - Lack of lighting around shelter,...,Livelihood support,Education,WASH,Winterisation,Legal services,GBV services,CP services,Explosive hazard risk awareness or removal of explosive contamination,Mental health psychological support,Cash assistance vouchers or cash in hand
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Names of the assistance-type columns used as prediction targets.
# The order matters: it is reused as `target_names` in the classification reports.
target_columns = [
    "Shelter", "Health", "NFIs", "Electricity assistance",
    "Food, nutrition", "Agricultural supplies", "Livelihood support", "Education",
    "WASH", "Winterisation", "Legal services", "GBV services", "CP services",
    "Explosive hazard risk awareness or removal of explosive contamination",
    "Mental health psychological support",
    "Cash assistance vouchers or cash in hand",
]

In [8]:
# Extract the target DataFrame (one column per assistance type).
target = df[target_columns]

# Features are every remaining column except "Unnamed: 0". That column shows up
# in df.head() holding 0, 1, 2, ...: it is the row index that was written into
# the CSV, not a survey variable, so it must not be fed to the models.
# errors="ignore" keeps this cell working if the CSV is later re-exported
# without that index column.
df_feature = (
    df.drop(columns=target_columns)
      .drop(columns=["Unnamed: 0"], errors="ignore")
)

In [9]:
# Print the share of 0s and 1s in every target column to expose class imbalance.
for column_name in target.columns:
    proportions = target[column_name].value_counts(normalize=True).round(2)
    print (column_name, proportions)

Shelter Shelter
0.0    0.99
1.0    0.01
Name: proportion, dtype: float64
Health Health
0.0    0.76
1.0    0.24
Name: proportion, dtype: float64
NFIs NFIs
0.0    0.99
1.0    0.01
Name: proportion, dtype: float64
Electricity assistance Electricity assistance
0.0    0.99
1.0    0.01
Name: proportion, dtype: float64
Food, nutrition Food, nutrition
0.0    0.57
1.0    0.43
Name: proportion, dtype: float64
Agricultural supplies Agricultural supplies
0.0    0.99
1.0    0.01
Name: proportion, dtype: float64
Livelihood support Livelihood support
0.0    0.98
1.0    0.02
Name: proportion, dtype: float64
Education Education
0.0    0.97
1.0    0.03
Name: proportion, dtype: float64
WASH WASH
0.0    0.85
1.0    0.15
Name: proportion, dtype: float64
Winterisation Winterisation
0.0    0.98
1.0    0.02
Name: proportion, dtype: float64
Legal services Legal services
0.0    0.99
1.0    0.01
Name: proportion, dtype: float64
GBV services GBV services
0.0    1.0
1.0    0.0
Name: proportion, dtype: float64
CP s

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_feature,
    target,
    test_size=0.2,
    random_state=1234,
)

# Support Vector Machine

In [11]:
from skopt import BayesSearchCV
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report, hamming_loss
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC

# Assemble the nested ensemble from the inside out:
# LinearSVC (SVM) -> Bagging (bagging) -> AdaBoost (boosting),
# then wrap it in MultiOutputClassifier to predict all target columns at once.
svm_base = LinearSVC(class_weight='balanced', random_state=1234)
svm_bagging = BaggingClassifier(estimator=svm_base, random_state=1234)
svm_boosting = AdaBoostClassifier(
    estimator=svm_bagging,
    algorithm='SAMME',
    random_state=1234,
)
nested_model = MultiOutputClassifier(svm_boosting)

# Search space for all nested parameters. Each extra "estimator__" prefix
# steps one level deeper into the nesting:
#   estimator__...                        -> AdaBoost  (n_estimators, learning_rate)
#   estimator__estimator__...             -> Bagging   (n_estimators, max_samples)
#   estimator__estimator__estimator__...  -> LinearSVC (C, max_iter)
search_spaces = {
    # AdaBoost
    "estimator__n_estimators": (10, 100),
    "estimator__learning_rate": (0.01, 2.0, 'log-uniform'),
    # Bagging
    "estimator__estimator__n_estimators": (10, 100),
    "estimator__estimator__max_samples": (0.5, 1.0, 'uniform'),
    # LinearSVC
    "estimator__estimator__estimator__C": (0.1, 10.0, 'log-uniform'),
    "estimator__estimator__estimator__max_iter": [1000, 3000, 5000, 7000],
}

# Bayesian hyperparameter search across the nested ensemble:
# 4 sampled candidates, each scored by accuracy with 3-fold cross-validation.
# NOTE(review): at the upper bounds a single candidate can mean up to
# 100 boosting rounds x 100 bagged SVMs per target column - expect a long run.
search_settings = dict(
    n_iter=4,
    cv=3,
    scoring='accuracy',
    random_state=1234,
    n_jobs=-1,
)
bayes_search = BayesSearchCV(
    estimator=nested_model,
    search_spaces=search_spaces,
    **search_settings,
)

# Fit on the training split, then predict the held-out test split.
bayes_search.fit(X_train, Y_train)
Y_pred = bayes_search.predict(X_test)

# Report the evaluation metrics on the test set.
print("Boosting of Bagging SVM - Evaluation Metrics:")
print(f"Hamming Loss: {hamming_loss(Y_test, Y_pred)}")
print(f"Exact Match Accuracy: {accuracy_score(Y_test, Y_pred)}")
print("Classification Report:")
print(
    classification_report(
        Y_test,
        Y_pred,
        target_names=target_columns,
        zero_division=0,
    )
)

Boosting of Bagging SVM - Evaluation Metrics:
Hamming Loss: 0.06362030988910831
Exact Match Accuracy: 0.537293027495063
Classification Report:
                                                                       precision    recall  f1-score   support

                                                              Shelter       0.08      0.61      0.15        31
                                                               Health       0.76      0.88      0.82      1638
                                                                 NFIs       0.06      0.70      0.12        67
                                               Electricity assistance       0.22      0.76      0.35        45
                                                      Food, nutrition       0.84      0.91      0.87      2862
                                                Agricultural supplies       0.06      0.57      0.10        47
                                                   Livelihood support       0.1

In [12]:
# Count the test rows whose label vector is all zeros, first in the ground
# truth and then in the predictions stored in Y_pred.
true_labels_per_row = Y_test.sum(axis=1)
predicted_labels_per_row = Y_pred.sum(axis=1)
print("Samples with no true labels:", (true_labels_per_row == 0).sum())
print("Samples with no predicted labels:", (predicted_labels_per_row == 0).sum())

Samples with no true labels: 2739
Samples with no predicted labels: 2763


# Random Forest

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, hamming_loss
from sklearn.multioutput import MultiOutputClassifier

# Assemble the nested ensemble from the inside out:
# RandomForestClassifier (random forest) -> Bagging (bagging) -> AdaBoost (boosting),
# then wrap it in MultiOutputClassifier to predict all target columns at once.
rf_base = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
    random_state=1234,
)
rf_bagging = BaggingClassifier(estimator=rf_base, random_state=1234)
rf_boosting = AdaBoostClassifier(
    estimator=rf_bagging,
    algorithm='SAMME',
    random_state=1234,
)
nested_model_rf = MultiOutputClassifier(rf_boosting)

# Search space for each nested layer. Each extra "estimator__" prefix steps
# one level deeper into the nesting:
#   estimator__...                        -> AdaBoost     (n_estimators, learning_rate)
#   estimator__estimator__...             -> Bagging      (n_estimators, max_samples)
#   estimator__estimator__estimator__...  -> RandomForest (n_estimators, max_depth)
search_spaces_rf = {
    # AdaBoost
    "estimator__n_estimators": (10, 100),
    "estimator__learning_rate": (0.01, 2.0, 'log-uniform'),
    # Bagging
    "estimator__estimator__n_estimators": (10, 100),
    "estimator__estimator__max_samples": (0.5, 1.0, 'uniform'),
    # RandomForest
    "estimator__estimator__estimator__n_estimators": (50, 200),
    "estimator__estimator__estimator__max_depth": (5, 20),
}

# Bayesian hyperparameter search for the nested ensemble:
# 4 sampled candidates, each scored by accuracy with 3-fold cross-validation.
# NOTE(review): at the upper bounds a single candidate can mean up to
# 100 boosting rounds x 100 bagged forests x 200 trees per target column.
# This cell has no execution count, so it may never have run to completion -
# consider narrowing the ranges before a full run.
search_settings_rf = dict(
    n_iter=4,
    cv=3,
    scoring='accuracy',
    random_state=1234,
    n_jobs=-1,
)
bayes_search_rf = BayesSearchCV(
    estimator=nested_model_rf,
    search_spaces=search_spaces_rf,
    **search_settings_rf,
)

# Fit on the training split, then predict the held-out test split.
bayes_search_rf.fit(X_train, Y_train)
Y_pred_rf = bayes_search_rf.predict(X_test)

# Report the evaluation metrics on the test set.
print("Boosting of Bagging Random Forest - Evaluation Metrics:")
print(f"Hamming Loss: {hamming_loss(Y_test, Y_pred_rf)}")
print(f"Exact Match Accuracy: {accuracy_score(Y_test, Y_pred_rf)}")
print("Classification Report:")
print(
    classification_report(
        Y_test,
        Y_pred_rf,
        target_names=target_columns,
        zero_division=0,
    )
)


In [None]:
# Count the test rows whose label vector is all zeros, first in the ground
# truth and then in the predictions stored in Y_pred_rf.
true_labels_per_row = Y_test.sum(axis=1)
predicted_labels_per_row_rf = Y_pred_rf.sum(axis=1)
print("Samples with no true labels:", (true_labels_per_row == 0).sum())
print("Samples with no predicted labels:", (predicted_labels_per_row_rf == 0).sum())