### a) data exploration

In [1]:
import pandas as pd
from pathlib import Path


# Paths are resolved relative to the kernel's current working directory, so the
# notebook must be started from the project root for "data/" to be found.
project_path = Path.cwd()
data_path = project_path / "data"

# Read the four parquet files in one pass; the order of the file names matches
# the order of the variables on the left-hand side.
X_train, y_train, X_test, y_test = (
    pd.read_parquet(data_path / parquet_name)
    for parquet_name in (
        "X_train.parquet",
        "y_train.parquet",
        "X_test.parquet",
        "y_test.parquet",
    )
)

# Sanity check: features and targets of each split should have the same row count.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4250, 500) (4250, 1)
(750, 500) (750, 1)


In [2]:
# Upper bound on the number of features worth paying for: the largest feature
# count whose total cost (200 per feature) does not exceed the 10 * 1000 reward cap.
Gain_Max_Households = 10_000
Max_Variables_Possibles = Gain_Max_Households // 200

print("Max Variables Possible: ", Max_Variables_Possibles)

Max Variables Possible:  50


In [3]:
import numpy as np

def compute_scaled_gain(
    y_true,
    y_pred_proba,
    num_features,
    gain_per_tp=10,
    selection_ratio=0.2,
    max_reward=10000,
    feature_cost=200,
):
    """
    Compute the scaled gain of a model: reward for the top-K targets minus feature cost.

    The ``selection_ratio`` fraction of instances with the highest predicted
    probability is selected. The reward is the fraction of those selected
    instances that are true positives, scaled to ``max_reward``; the cost is
    ``num_features * feature_cost``.

    Parameters:
    -----------
    y_true : array-like
        Ground truth binary labels (0 or 1). Lists, NumPy arrays, pandas Series
        and single-column DataFrames are accepted; the input is flattened to 1-D.
    y_pred_proba : array-like
        Predicted probabilities for the positive class (flattened to 1-D).
    num_features : int
        Number of selected features used by the model.
    gain_per_tp : int, default=10
        Gain earned for each true positive selected. It appears in both the
        numerator and the denominator of the reward, so it cancels out.
    selection_ratio : float, default=0.2
        Fraction of instances to select (top K by predicted probability).
    max_reward : int, default=10000
        Reward obtained when every selected instance is a true positive.
    feature_cost : int, default=200
        Cost per selected feature.

    Returns:
    --------
    scaled_gain : float
        ``reward - cost``. When the selection is empty
        (``int(selection_ratio * n) == 0``) the reward is 0 and ``-cost`` is returned.
    """
    # Work on flat positional arrays. Indexing a pandas object directly with the
    # argsort positions would be label-based (columns for a DataFrame, index
    # labels for a Series) and select the wrong data or raise.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba).ravel()

    cost = num_features * feature_cost

    n_selected = int(selection_ratio * len(labels))
    if n_selected <= 0:
        # Nothing is selected: `[-0:]` would silently select *everything* and the
        # reward would divide by zero, so report "no reward" explicitly.
        return float(-cost)

    # argsort is ascending, so the last n_selected positions are the top scores.
    top_k_indices = np.argsort(scores)[-n_selected:]
    true_positives = labels[top_k_indices].sum()

    reward = (true_positives * gain_per_tp * max_reward) / (n_selected * gain_per_tp)

    return reward - cost

## First algorithm: L1 logistic regression feature selection + SVM

In [4]:

from sklearn.linear_model import LogisticRegression

import numpy as np



def select_features_L1(X_train, y_train, C_feat):
    """
    Select features with an L1-penalised logistic regression.

    A ``LogisticRegression`` (liblinear solver, ``penalty="l1"``, ``C=C_feat``,
    ``random_state=0``) is fitted on ``(X_train, y_train)``.

    Returns the positional indices (NumPy array) of the columns whose fitted
    coefficient is non-zero.
    """
    l1_logreg = LogisticRegression(
        C=C_feat, penalty="l1", solver="liblinear", random_state=0
    )
    l1_logreg.fit(X_train, y_train)
    # flatnonzero flattens coef_ and returns the positions of its non-zero entries.
    return np.flatnonzero(l1_logreg.coef_)

### Grid-search implementation

In [6]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def compute_scaled_gain(
    y_true,
    y_pred_proba,
    num_features,
    gain_per_tp=10,
    selection_ratio=0.2,
    max_reward=10000,
    feature_cost=200,
):
    """
    Compute the scaled gain: reward for the top-K predicted instances minus feature cost.

    NOTE: this notebook defines ``compute_scaled_gain`` in several cells; the
    definition executed last is the one in effect.

    Parameters
    ----------
    y_true : array-like
        Ground truth binary labels (0 or 1); flattened to 1-D, so a
        single-column DataFrame is accepted.
    y_pred_proba : array-like
        Predicted probabilities for the positive class; flattened to 1-D.
    num_features : int
        Number of features used by the model.
    gain_per_tp : int, default=10
        Gain per true positive. It cancels out of the reward formula.
    selection_ratio : float, default=0.2
        Fraction of instances selected (top K by predicted probability).
    max_reward : int, default=10000
        Reward when every selected instance is a true positive.
    feature_cost : int, default=200
        Cost per feature.

    Returns
    -------
    float
        ``reward - cost``; ``-cost`` when the selection is empty.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba).ravel()
    cost = num_features * feature_cost

    n_selected = int(selection_ratio * len(labels))
    if n_selected <= 0:
        # `[-0:]` would select every row and the reward would divide by zero.
        return float(-cost)

    # argsort is ascending: the tail holds the highest predicted probabilities.
    top_k_indices = np.argsort(scores)[-n_selected:]
    true_positives = labels[top_k_indices].sum()
    reward = (true_positives * gain_per_tp * max_reward) / (n_selected * gain_per_tp)
    return reward - cost



# Search space: C_feat is the `C` of the L1 logistic regression used for
# feature selection; C_svm and kernel are passed to the SVC.
C_feat_grid = [0.0005, 0.0007, 0.0009]
C_svm_grid = [0.01, 0.1, 1.0, 10.0]
kernel_options = ["rbf", "poly"]


best_config = None
best_gain = -np.inf

results = []

# The splitter is seeded, so every configuration is evaluated on the same five
# folds. Materialise them once instead of re-splitting for each configuration.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_folds = list(skf.split(X_train, y_train))
n_folds = skf.get_n_splits()

# The L1 selection depends only on (C_feat, fold), not on the SVM settings, and
# is deterministic (random_state=0). Cache it so it is fitted once per
# (C_feat, fold) instead of once per (kernel, C_feat, C_svm, fold).
selection_cache = {}

for kernel in kernel_options:
    for C_feat in C_feat_grid:
        for C_svm in C_svm_grid:
            print(f"\nTrying kernel={kernel}, C_feat = {C_feat}, C_svm = {C_svm}")
            total_gain = 0

            for index, (train_idx, val_idx) in enumerate(cv_folds, 1):
                X_inner, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_inner, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                # Selection is fitted on the inner-training part only, so the
                # validation fold never influences which features are kept.
                cache_key = (C_feat, index)
                if cache_key not in selection_cache:
                    selection_cache[cache_key] = select_features_L1(
                        X_inner.values, y_inner.values.ravel(), C_feat
                    )
                selected_features = selection_cache[cache_key]
                num_selected = len(selected_features)

                if num_selected == 0:
                    # A skipped fold adds nothing to total_gain but still counts
                    # in the average's denominator below.
                    print(f"  Fold {index}: no features selected, skipping")
                    continue

                model = make_pipeline(
                    StandardScaler(),
                    svm.SVC(kernel=kernel, C=C_svm, probability=True, random_state=42),
                )
                model.fit(X_inner.iloc[:, selected_features], y_inner.values.ravel())
                # Column 1 of predict_proba is the probability of the positive class.
                y_proba_val = model.predict_proba(X_val.iloc[:, selected_features])[
                    :, 1
                ]

                gain = compute_scaled_gain(
                    y_val, y_proba_val, num_features=num_selected
                )
                print(f"  Fold {index}: features = {num_selected}, gain = {gain:.2f}")
                total_gain += gain

            avg_gain = total_gain / n_folds
            print(
                f"Average gain for kernel={kernel}, C_feat={C_feat}, C_svm={C_svm}: {avg_gain:.2f}"
            )

            results.append((kernel, C_feat, C_svm, avg_gain))

            if avg_gain > best_gain:
                best_gain = avg_gain
                best_config = (kernel, C_feat, C_svm)

best_kernel, best_C_feat, best_C_svm = best_config
print(
    f"\nBest configuration: kernel={best_kernel}, C_feat={best_C_feat}, C_svm={best_C_svm} with gain: {best_gain:.2f}"
)


Trying kernel=rbf, C_feat = 0.0005, C_svm = 0.01
  Fold 1: features = 1, gain = 6917.65
  Fold 2: features = 1, gain = 7211.76
  Fold 3: features = 1, gain = 6858.82
  Fold 4: features = 1, gain = 7094.12
  Fold 5: features = 1, gain = 7564.71
Average gain for kernel=rbf, C_feat=0.0005, C_svm=0.01: 7129.41

Trying kernel=rbf, C_feat = 0.0005, C_svm = 0.1
  Fold 1: features = 1, gain = 7329.41
  Fold 2: features = 1, gain = 7211.76
  Fold 3: features = 1, gain = 6800.00
  Fold 4: features = 1, gain = 6917.65
  Fold 5: features = 1, gain = 7094.12
Average gain for kernel=rbf, C_feat=0.0005, C_svm=0.1: 7070.59

Trying kernel=rbf, C_feat = 0.0005, C_svm = 1.0
  Fold 1: features = 1, gain = 7329.41
  Fold 2: features = 1, gain = 7682.35
  Fold 3: features = 1, gain = 6858.82
  Fold 4: features = 1, gain = 6917.65
  Fold 5: features = 1, gain = 7447.06
Average gain for kernel=rbf, C_feat=0.0005, C_svm=1.0: 7247.06

Trying kernel=rbf, C_feat = 0.0005, C_svm = 10.0
  Fold 1: features = 1, gai

### Evaluation on the test set (X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score


# Hyper-parameters chosen by the grid search.
best_kernel, best_C_feat, best_C_svm = best_config

# Redo the L1 feature selection on the whole training set with the chosen C_feat.
y_train_flat = y_train.values.ravel()
final_selected = select_features_L1(X_train.values, y_train_flat, best_C_feat)

# Restrict both splits to the selected columns once, then reuse the views.
X_train_selected = X_train.iloc[:, final_selected]
X_test_selected = X_test.iloc[:, final_selected]

final_svc = svm.SVC(
    kernel=best_kernel, C=best_C_svm, probability=True, random_state=42
)
final_model = make_pipeline(StandardScaler(), final_svc)
final_model.fit(X_train_selected, y_train_flat)

# Positive-class probabilities drive the gain; hard predictions drive the
# classification metrics.
y_proba_test = final_model.predict_proba(X_test_selected)[:, 1]
y_pred_test = final_model.predict(X_test_selected)

scaled_gain_test = compute_scaled_gain(
    y_test, y_proba_test, num_features=len(final_selected)
)
acc, prec, f1 = (
    metric(y_test, y_pred_test)
    for metric in (accuracy_score, precision_score, f1_score)
)

print(f"Final Scaled Gain on Test Set: {scaled_gain_test:.2f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")

Final Scaled Gain on Test Set: 7066.67
Accuracy: 0.7213
Precision: 0.7105
F1 Score: 0.7172


## Second algorithm: Mutual information + SVM

In [21]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np


def select_features_MI(X_train, y_train, threshold):
    """
    Select features by mutual information with the target.

    Scores every column with ``mutual_info_classif`` (all features treated as
    continuous, ``random_state=0``) and returns, as a list of ints, the
    positional indices of the columns whose score is strictly greater than
    ``threshold``.
    """
    scores = mutual_info_classif(
        X_train, y_train, discrete_features=False, random_state=0
    )
    # Boolean mask -> positions of the columns that pass the threshold.
    return np.flatnonzero(np.asarray(scores) > threshold).tolist()

### Grid-search implementation

In [22]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif


def compute_scaled_gain(
    y_true,
    y_pred_proba,
    num_features,
    gain_per_tp=10,
    selection_ratio=0.2,
    max_reward=10000,
    feature_cost=200,
):
    """
    Compute the scaled gain: reward for the top-K predicted instances minus feature cost.

    NOTE: this notebook defines ``compute_scaled_gain`` in several cells; the
    definition executed last is the one in effect.

    Parameters
    ----------
    y_true : array-like
        Ground truth binary labels (0 or 1); flattened to 1-D, so a
        single-column DataFrame is accepted.
    y_pred_proba : array-like
        Predicted probabilities for the positive class; flattened to 1-D.
    num_features : int
        Number of features used by the model.
    gain_per_tp : int, default=10
        Gain per true positive. It cancels out of the reward formula.
    selection_ratio : float, default=0.2
        Fraction of instances selected (top K by predicted probability).
    max_reward : int, default=10000
        Reward when every selected instance is a true positive.
    feature_cost : int, default=200
        Cost per feature.

    Returns
    -------
    float
        ``reward - cost``; ``-cost`` when the selection is empty.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba).ravel()
    cost = num_features * feature_cost

    n_selected = int(selection_ratio * len(labels))
    if n_selected <= 0:
        # `[-0:]` would select every row and the reward would divide by zero.
        return float(-cost)

    # argsort is ascending: the tail holds the highest predicted probabilities.
    top_k_indices = np.argsort(scores)[-n_selected:]
    true_positives = labels[top_k_indices].sum()
    reward = (true_positives * gain_per_tp * max_reward) / (n_selected * gain_per_tp)
    return reward - cost


# Search space: mutual-information threshold for feature selection, plus the
# SVC's C and kernel.
mi_thresholds = [0.06, 0.07, 0.08]
C_svm_grid = [0.01, 0.1, 1.0, 10.0]
kernel_options = ["rbf", "poly"]


best_config = None
best_gain = -np.inf
results = []

# The splitter is seeded, so every configuration is evaluated on the same five
# folds. Materialise them once instead of re-splitting for each configuration.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_folds = list(skf.split(X_train, y_train))
n_folds = skf.get_n_splits()

# The mutual-information selection depends only on (threshold, fold), not on the
# SVM settings, and is seeded (random_state=0). Cache it so it is computed once
# per (threshold, fold) instead of once per (kernel, threshold, C_svm, fold).
selection_cache = {}

for kernel in kernel_options:
    # NOTE: `C_feat` holds the MI threshold in this cell (name kept from the
    # L1 grid search).
    for C_feat in mi_thresholds:
        for C_svm in C_svm_grid:
            print(f"\nTrying kernel={kernel}, MI threshold = {C_feat}, C_svm = {C_svm}")
            total_gain = 0

            for index, (train_idx, val_idx) in enumerate(cv_folds, 1):
                X_inner, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_inner, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                # Selection is computed on the inner-training part only, so the
                # validation fold never influences which features are kept.
                cache_key = (C_feat, index)
                if cache_key not in selection_cache:
                    selection_cache[cache_key] = select_features_MI(
                        X_inner.values, y_inner.values.ravel(), C_feat
                    )
                selected_features = selection_cache[cache_key]
                num_selected = len(selected_features)

                if num_selected == 0:
                    # A skipped fold adds nothing to total_gain but still counts
                    # in the average's denominator below.
                    print(f"  Fold {index}: no features selected, skipping")
                    continue

                model = make_pipeline(
                    StandardScaler(),
                    svm.SVC(kernel=kernel, C=C_svm, probability=True, random_state=42),
                )
                model.fit(X_inner.iloc[:, selected_features], y_inner.values.ravel())
                # Column 1 of predict_proba is the probability of the positive class.
                y_proba_val = model.predict_proba(X_val.iloc[:, selected_features])[
                    :, 1
                ]

                gain = compute_scaled_gain(
                    y_val, y_proba_val, num_features=num_selected
                )
                print(f"  Fold {index}: features = {num_selected}, gain = {gain:.2f}")
                total_gain += gain

            avg_gain = total_gain / n_folds
            print(
                f"Average gain for kernel={kernel}, MI threshold={C_feat}, C_svm={C_svm}: {avg_gain:.2f}"
            )

            results.append((kernel, C_feat, C_svm, avg_gain))

            if avg_gain > best_gain:
                best_gain = avg_gain
                best_config = (kernel, C_feat, C_svm)

best_kernel, best_C_feat, best_C_svm = best_config
print(
    f"\nBest configuration: kernel={best_kernel}, MI threshold={best_C_feat}, C_svm={best_C_svm} with gain: {best_gain:.2f}"
)


Trying kernel=rbf, MI threshold = 0.06, C_svm = 0.01
  Fold 1: features = 4, gain = 6435.29
  Fold 2: features = 3, gain = 7223.53
  Fold 3: features = 4, gain = 6435.29
  Fold 4: features = 4, gain = 6494.12
  Fold 5: features = 4, gain = 6670.59
Average gain for kernel=rbf, MI threshold=0.06, C_svm=0.01: 6651.76

Trying kernel=rbf, MI threshold = 0.06, C_svm = 0.1
  Fold 1: features = 4, gain = 6435.29
  Fold 2: features = 3, gain = 6870.59
  Fold 3: features = 4, gain = 6376.47
  Fold 4: features = 4, gain = 6611.76
  Fold 5: features = 4, gain = 6788.24
Average gain for kernel=rbf, MI threshold=0.06, C_svm=0.1: 6616.47

Trying kernel=rbf, MI threshold = 0.06, C_svm = 1.0
  Fold 1: features = 4, gain = 6670.59
  Fold 2: features = 3, gain = 6870.59
  Fold 3: features = 4, gain = 6317.65
  Fold 4: features = 4, gain = 6552.94
  Fold 5: features = 4, gain = 6552.94
Average gain for kernel=rbf, MI threshold=0.06, C_svm=1.0: 6592.94

Trying kernel=rbf, MI threshold = 0.06, C_svm = 10.0

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Hyper-parameters chosen by the grid search; best_C_feat is the MI threshold here.
best_kernel, best_C_feat, best_C_svm = best_config

# Redo the mutual-information selection on the whole training set.
y_train_flat = y_train.values.ravel()
final_selected = select_features_MI(X_train.values, y_train_flat, best_C_feat)

# Restrict both splits to the selected columns once, then reuse the views.
X_train_selected = X_train.iloc[:, final_selected]
X_test_selected = X_test.iloc[:, final_selected]

final_svc = svm.SVC(
    kernel=best_kernel, C=best_C_svm, probability=True, random_state=42
)
final_model = make_pipeline(StandardScaler(), final_svc)
final_model.fit(X_train_selected, y_train_flat)

# Positive-class probabilities drive the gain; hard predictions drive the
# classification metrics.
y_proba_test = final_model.predict_proba(X_test_selected)[:, 1]
y_pred_test = final_model.predict(X_test_selected)

scaled_gain_test = compute_scaled_gain(
    y_test, y_proba_test, num_features=len(final_selected)
)
acc, prec, f1 = (
    metric(y_test, y_pred_test)
    for metric in (accuracy_score, precision_score, f1_score)
)

print(f"Final Scaled Gain on Test Set: {scaled_gain_test:.2f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"F1 Score: {f1:.4f}")

Final Scaled Gain on Test Set: 6866.67
Accuracy: 0.7320
Precision: 0.7188
F1 Score: 0.7295


## Third algorithm: Mutual information + Random Forest

In [None]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np


def select_features_MI(X, y, threshold=0.01):
    """
    Select features by mutual information with the target.

    Scores every column of ``X`` with ``mutual_info_classif`` (features treated
    as continuous, ``n_neighbors=5``, ``random_state=0``) and returns, as a list
    of ints, the positional indices of the columns whose score is strictly
    greater than ``threshold``.
    """
    scores = mutual_info_classif(
        X, y, discrete_features=False, n_neighbors=5, random_state=0
    )
    # Boolean mask -> positions of the columns that pass the threshold.
    return np.flatnonzero(np.asarray(scores) > threshold).tolist()

### Grid-search implementation

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


def select_features_MI(X, y, threshold=0.01):
    """
    Return the column positions whose mutual information with ``y`` exceeds ``threshold``.

    Mutual information is estimated by ``mutual_info_classif`` with all features
    treated as continuous, ``n_neighbors=5`` and ``random_state=0``. The result
    is a list of integer column positions in ascending order.
    """
    relevance = mutual_info_classif(
        X, y, n_neighbors=5, discrete_features=False, random_state=0
    )
    kept_columns = []
    for column_position in range(len(relevance)):
        # Strict comparison: a score equal to the threshold is not kept.
        if relevance[column_position] > threshold:
            kept_columns.append(column_position)
    return kept_columns


# Search space: mutual-information threshold for feature selection, plus the
# random forest's number of trees and maximum depth (None = unlimited).
mi_thresholds = [0.06, 0.07, 0.08]
rf_n_estimators = [50, 100]
rf_max_depths = [5, None]

best_gain = -np.inf
best_config = None
results = []

# The splitter is seeded, so every configuration is evaluated on the same five
# folds. Materialise them once instead of re-splitting for each configuration.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_folds = list(skf.split(X_train, y_train))
n_folds = skf.get_n_splits()

# The mutual-information selection depends only on (threshold, fold), not on the
# forest settings, and is seeded (random_state=0). Cache it so it is computed
# once per (threshold, fold) instead of once per full configuration and fold.
selection_cache = {}

for threshold in mi_thresholds:
    for n_estimators in rf_n_estimators:
        for max_depth in rf_max_depths:
            print(
                f"\nTrying MI threshold={threshold}, n_estimators={n_estimators}, max_depth={max_depth}"
            )
            total_gain = 0

            for fold_idx, (train_idx, val_idx) in enumerate(cv_folds, 1):
                X_inner, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_inner, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                # Selection is computed on the inner-training part only, so the
                # validation fold never influences which features are kept.
                cache_key = (threshold, fold_idx)
                if cache_key not in selection_cache:
                    selection_cache[cache_key] = select_features_MI(
                        X_inner.values, y_inner.values.ravel(), threshold
                    )
                selected_features = selection_cache[cache_key]
                num_selected = len(selected_features)

                if num_selected == 0:
                    # A skipped fold adds nothing to total_gain but still counts
                    # in the average's denominator below.
                    print(f"  Fold {fold_idx}: no features selected, skipping")
                    continue

                model = make_pipeline(
                    StandardScaler(),
                    RandomForestClassifier(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        random_state=42,
                        n_jobs=-1,
                    ),
                )
                model.fit(X_inner.iloc[:, selected_features], y_inner.values.ravel())
                # Column 1 of predict_proba is the probability of the positive class.
                y_proba_val = model.predict_proba(X_val.iloc[:, selected_features])[
                    :, 1
                ]

                gain = compute_scaled_gain(
                    y_val, y_proba_val, num_features=num_selected
                )
                print(
                    f"  Fold {fold_idx}: features = {num_selected}, gain = {gain:.2f}"
                )
                total_gain += gain

            avg_gain = total_gain / n_folds
            print(
                f"Average gain for threshold={threshold}, n_estimators={n_estimators}, max_depth={max_depth}: {avg_gain:.2f}"
            )

            results.append((threshold, n_estimators, max_depth, avg_gain))

            if avg_gain > best_gain:
                best_gain = avg_gain
                best_config = (threshold, n_estimators, max_depth)

print(
    f"\nBest configuration: threshold={best_config[0]}, n_estimators={best_config[1]}, max_depth={best_config[2]} with gain: {best_gain:.2f}"
)


Trying MI threshold=0.06, n_estimators=50, max_depth=5
  Fold 1: features = 6, gain = 5800.00
  Fold 2: features = 5, gain = 6588.24
  Fold 3: features = 6, gain = 6211.76
  Fold 4: features = 4, gain = 6494.12
  Fold 5: features = 5, gain = 6470.59
Average gain for threshold=0.06, n_estimators=50, max_depth=5: 6312.94

Trying MI threshold=0.06, n_estimators=50, max_depth=None
  Fold 1: features = 6, gain = 5564.71
  Fold 2: features = 5, gain = 6176.47
  Fold 3: features = 6, gain = 5976.47
  Fold 4: features = 4, gain = 6552.94
  Fold 5: features = 5, gain = 6470.59
Average gain for threshold=0.06, n_estimators=50, max_depth=None: 6148.24

Trying MI threshold=0.06, n_estimators=100, max_depth=5
  Fold 1: features = 6, gain = 5976.47
  Fold 2: features = 5, gain = 6352.94
  Fold 3: features = 6, gain = 6094.12
  Fold 4: features = 4, gain = 6494.12
  Fold 5: features = 5, gain = 6470.59
Average gain for threshold=0.06, n_estimators=100, max_depth=5: 6277.65

Trying MI threshold=0.06,

### Evaluation

In [15]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Hyper-parameters chosen by the grid search.
threshold, n_estimators, max_depth = best_config

# Redo the mutual-information selection on the whole training set.
y_train_flat = y_train.values.ravel()
selected_features = select_features_MI(X_train.values, y_train_flat, threshold)

# Restrict both splits to the selected columns once, then reuse the views.
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

final_forest = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=42,
    n_jobs=-1,
)
model = make_pipeline(StandardScaler(), final_forest)
model.fit(X_train_selected, y_train_flat)

# Positive-class probabilities drive the gain; hard predictions drive the
# classification metrics.
y_proba_test = model.predict_proba(X_test_selected)[:, 1]
y_pred_test = model.predict(X_test_selected)

test_gain = compute_scaled_gain(
    y_test, y_proba_test, num_features=len(selected_features)
)

accuracy, precision, f1 = (
    metric(y_test, y_pred_test)
    for metric in (accuracy_score, precision_score, f1_score)
)

print(f"Test set scaled gain: {test_gain:.2f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

Test set scaled gain: 7133.33
Accuracy: 0.7253
Precision: 0.7128
F1 Score: 0.7224
