### a) data exploration

In [14]:
import pandas as pd

x_text = "x_train.txt"
y_text = "y_train.txt"

x_clients = "x_test.txt"

df_variables = pd.read_csv(
    x_text,
    delimiter=" ",
    header=None,
)

df_labels = pd.read_csv(
    y_text,
    delimiter=" ",
    header=None,
)

df_variables_clients  = pd.read_csv(
    x_clients,
    delimiter=" ",
    header=None,
)

In [15]:
df_variables.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,20.454647,16.739345,39.811892,24.955468,27.088535,17.116793,25.166957,23.364508,17.223886,18.339175,...,13.235314,5.160379,29.194846,17.298314,6.414267,7.780568,6.84091,18.295197,10.014028,6.938318
1,16.175225,10.483281,27.471017,18.509824,19.045353,15.039082,21.354915,15.790575,13.912508,13.772518,...,13.355832,2.609716,8.624576,9.371632,11.789219,9.205471,15.204468,8.358906,8.529152,8.021473
2,10.577212,10.795115,24.621388,17.264747,14.22161,8.754692,18.399259,11.358798,15.43265,14.842153,...,15.179359,10.200144,12.645303,12.147416,8.899863,13.954543,12.356942,16.364696,3.817956,4.094035
3,26.299206,13.471215,51.725934,40.786947,26.052414,33.200702,51.01433,27.685009,33.107991,25.359457,...,7.693654,6.359187,5.760296,5.69958,9.895795,17.011648,12.031,14.637973,10.172737,10.525373
4,23.193955,20.037969,37.78029,28.983748,25.510508,15.970348,27.930757,20.707354,25.341768,27.118987,...,7.068407,7.180632,12.517752,9.026493,7.748172,10.363749,7.099588,12.467672,11.545619,9.0986


In [16]:
df_labels.value_counts()

0    2557
1    2443
dtype: int64

In [17]:
Gain_Max_Households = 10*1000
Max_Variables_Possibles = Gain_Max_Households // 200 
print("Max Variables Possible: ", Max_Variables_Possibles)

Max Variables Possible:  50


## First algorithm : Logistic regression features + SVM 

In [None]:

from sklearn.linear_model import LogisticRegression

import numpy as np



def select_features_L1(X_train, y_train, C_feat):
    model = LogisticRegression(
        penalty="l1", solver="liblinear", C=C_feat, random_state=0
    )
    model.fit(X_train, y_train)
    coefs = model.coef_.ravel()
    selected_indices = np.where(coefs != 0)[0]
    return selected_indices

### grid-search implementation

In [None]:
from sklearn import svm

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np




X = df_variables.values
y = df_labels.values.ravel()

C_feat_grid = [0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
C_svm_grid = [0.01, 0.1, 1.0, 10.0]

best_config = None
best_gain = -np.inf

for C_feat in C_feat_grid:
    for C_svm in C_svm_grid:
        print(f"\nTrying C_feat = {C_feat}, C_svm = {C_svm}")
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        Average_Predicted_Gain = 0

        for index, (train_index, test_index) in enumerate(skf.split(X, y), 1):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            selected_features = select_features_L1(X_train, y_train, C_feat)
            num_selected = len(selected_features)

            if num_selected == 0:
                print(f"  Fold {index}: no features selected, skipping")
                continue

            model = make_pipeline(
                StandardScaler(),
                svm.SVC(kernel="linear", C=C_svm, probability=True, random_state=42),
            )
            model.fit(X_train[:, selected_features], y_train)
            y_pred = model.predict(X_test[:, selected_features])

            predicted_cash = accuracy_score(y_test, y_pred) * 10 * 1000
            Cost = 200 * num_selected
            predicted_gain = predicted_cash - Cost
            print(
                f"  Fold {index}: acc = {accuracy_score(y_test, y_pred):.2f}, "
                f"features = {num_selected}, gain = {predicted_gain:.2f}"
            )

            Average_Predicted_Gain += predicted_gain

        Average_Predicted_Gain /= skf.get_n_splits()
        print(
            f"Average gain for C_feat={C_feat}, C_svm={C_svm}: {Average_Predicted_Gain:.2f}"
        )

        if Average_Predicted_Gain > best_gain:
            best_gain = Average_Predicted_Gain
            best_config = (C_feat, C_svm)

print(
    f"\nBest configuration: C_feat={best_config[0]}, C_svm={best_config[1]} with gain: {best_gain:.2f}"
)


Trying C_feat = 0.0005, C_svm = 0.01
  Fold 1: acc = 0.70, features = 1, gain = 13760.00
  Fold 2: acc = 0.74, features = 1, gain = 14600.00
  Fold 3: acc = 0.70, features = 1, gain = 13840.00
  Fold 4: acc = 0.71, features = 1, gain = 14040.00
  Fold 5: acc = 0.71, features = 1, gain = 14000.00
  Fold 6: acc = 0.69, features = 1, gain = 13600.00
  Fold 7: acc = 0.73, features = 1, gain = 14400.00
  Fold 8: acc = 0.69, features = 1, gain = 13680.00
  Fold 9: acc = 0.68, features = 1, gain = 13360.00
  Fold 10: acc = 0.70, features = 1, gain = 13760.00
Average gain for C_feat=0.0005, C_svm=0.01: 13904.00

Trying C_feat = 0.0005, C_svm = 0.1
  Fold 1: acc = 0.70, features = 1, gain = 13720.00
  Fold 2: acc = 0.74, features = 1, gain = 14560.00
  Fold 3: acc = 0.70, features = 1, gain = 13760.00
  Fold 4: acc = 0.72, features = 1, gain = 14120.00
  Fold 5: acc = 0.72, features = 1, gain = 14120.00
  Fold 6: acc = 0.69, features = 1, gain = 13680.00
  Fold 7: acc = 0.72, features = 1, gai

## Second algorithm : Mutual Information + SVM

In [5]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np


def select_features_MI(X_train, y_train, threshold):
    mi_scores = mutual_info_classif(
        X_train, y_train, discrete_features=False, random_state=0
    )

    selected_indices = [i for i, score in enumerate(mi_scores) if score > threshold]

    return selected_indices

### grid search implementation

In [None]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np


X = df_variables.values
y = df_labels.values.ravel()

C_feat_grid = [0.06,0.05,0.04,0.03]

C_svm_grid = [ 0.01, 0.1, 1.0, 10.0]

best_config = None
best_gain = -np.inf

from sklearn.feature_selection import mutual_info_classif
import numpy as np


def select_features_MI(X_train, y_train, threshold):
    mi_scores = mutual_info_classif(
        X_train, y_train, discrete_features=False, random_state=0
    )

    selected_indices = [i for i, score in enumerate(mi_scores) if score > threshold]

    return selected_indices


for C_feat in C_feat_grid:
    for C_svm in C_svm_grid:
        print(f"\nTrying C_feat = {C_feat}, C_svm = {C_svm}")
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        Average_Predicted_Gain = 0

        for index, (train_index, test_index) in enumerate(skf.split(X, y), 1):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            
            selected_features = select_features_MI(X_train, y_train, C_feat)
            
            num_selected = len(selected_features)

            if num_selected == 0:
                print(f"  Fold {index}: no features selected, skipping")
                continue

            model = make_pipeline(
                StandardScaler(),
                svm.SVC(kernel="linear", C=C_svm, probability=True, random_state=42),
            )
            model.fit(X_train[:, selected_features], y_train)
            y_pred = model.predict(X_test[:, selected_features])

            predicted_cash = accuracy_score(y_test, y_pred) * 10 * 1000
            Cost = 200 * num_selected
            predicted_gain = predicted_cash - Cost
            print(
                f"  Fold {index}: acc = {accuracy_score(y_test, y_pred):.2f}, "
                f"features = {num_selected}, gain = {predicted_gain:.2f}"
            )

            Average_Predicted_Gain += predicted_gain

        Average_Predicted_Gain /= skf.get_n_splits()
        print(
            f"Average gain for C_feat={C_feat}, C_svm={C_svm}: {Average_Predicted_Gain:.2f}"
        )

        if Average_Predicted_Gain > best_gain:
            best_gain = Average_Predicted_Gain
            best_config = (C_feat, C_svm)

print(
    f"\nBest configuration: C_feat={best_config[0]}, C_svm={best_config[1]} with gain: {best_gain:.2f}"
)


Trying C_feat = 0.06, C_svm = 0.01
  Fold 1: acc = 0.68, features = 4, gain = 12760.00
  Fold 2: acc = 0.75, features = 3, gain = 14360.00
  Fold 3: acc = 0.70, features = 6, gain = 12880.00
  Fold 4: acc = 0.70, features = 5, gain = 13000.00
  Fold 5: acc = 0.71, features = 4, gain = 13400.00
  Fold 6: acc = 0.68, features = 4, gain = 12800.00
  Fold 7: acc = 0.72, features = 5, gain = 13480.00
  Fold 8: acc = 0.69, features = 5, gain = 12760.00
  Fold 9: acc = 0.67, features = 5, gain = 12480.00
  Fold 10: acc = 0.70, features = 3, gain = 13320.00
Average gain for C_feat=0.06, C_svm=0.01: 13124.00

Trying C_feat = 0.06, C_svm = 0.1
  Fold 1: acc = 0.71, features = 4, gain = 13320.00
  Fold 2: acc = 0.74, features = 3, gain = 14240.00
  Fold 3: acc = 0.70, features = 6, gain = 12760.00
  Fold 4: acc = 0.72, features = 5, gain = 13320.00
  Fold 5: acc = 0.73, features = 4, gain = 13760.00
  Fold 6: acc = 0.71, features = 4, gain = 13400.00
  Fold 7: acc = 0.72, features = 5, gain = 13

## Third algorithm : Mutual Information + Random Forest

In [14]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np


def select_features_MI(X_train, y_train, threshold=0.01):
    mi_scores = mutual_info_classif(
        X_train, y_train, discrete_features=False, random_state=0
    )

    selected_indices = [i for i, score in enumerate(mi_scores) if score > threshold]

    return selected_indices

### Grid-search implementation

## Fourth algorithm : Mutual Information + XGBoost

## Fifth algorithm : Random Forest Feature Importance + XGBoost

### Leaderboard part


In [None]:
selected_features_final = select_features_MI(X, y, threshold=0.01)
model.fit(X[:, selected_features_final], y)

X_clients = df_variables_clients.values
proba_preds = model.predict_proba(X_clients[:, selected_features_final])[:, 1]


top_1000_indices = np.argsort(proba_preds)[-1000:][::-1]
top_1000_clients = df_variables_clients.iloc[top_1000_indices]

print("Top 1000 clients selected.")

Top 1000 clients selected.


In [11]:
selected_features_final

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 24,
 28,
 34,
 39,
 45,
 48,
 75,
 90,
 94,
 100,
 121,
 135,
 136,
 171,
 226,
 238,
 242,
 245,
 263,
 287,
 313,
 314,
 318,
 323,
 356,
 376,
 386,
 387,
 405,
 409,
 413,
 414,
 416,
 425,
 435,
 442,
 443,
 444,
 448,
 460,
 462,
 471,
 484,
 493,
 495]

In [10]:
top_1000_indices

array([4755, 3866, 2792, 2084, 3780, 4354,  281, 3347, 2735, 4430, 1361,
       3789, 3447, 1020, 2822, 4674, 4068, 3509,  405, 2268, 1288, 4880,
       4008, 2656,   79, 3147, 4075, 3654, 4344, 2983, 4093, 3383, 3571,
       1622,  512, 2576, 3070, 4550, 1734, 3250, 3672,    2, 3375, 2803,
       4192,  384, 3449, 1868, 1866, 2216, 2689, 3954,  883, 1830, 3020,
       1057, 4829, 2908,  681, 3115, 3774, 3942, 4480, 1053, 2481, 4479,
       4123,  254, 3312, 4551, 1805, 2368, 4235, 2707, 3825, 2386, 4445,
       4290, 3451,  129, 4745, 2456, 4895,   96, 3784,  415, 2020, 1286,
       1711, 2243,  492, 3521,  280, 2612, 3693, 2424, 2997, 2031, 1474,
       1493, 3114, 4429, 4908,   80, 4856, 1094, 1756, 1165, 1606, 1170,
       1872, 3182,  109, 2985, 1826, 2321, 3444, 4470, 3483, 2700, 4109,
       3439, 1320,  207, 1137, 3833,  149, 4749, 4519, 3335, 1243,  868,
        549, 2431, 3283, 4009, 4619, 1428, 2292, 4754, 1233, 2353, 3925,
       4320, 2489, 3120,  524,  920, 1246, 1559, 17