# Import des outils / jeu de données

In [35]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTENC
from keras import layers
from lightgbm import LGBMClassifier
from scipy.stats import boxcox
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras
from sklearn.metrics import recall_score

In [2]:
SEED = 0

In [3]:
np.random.seed(SEED)
sns.set_theme()

In [23]:
df = pd.read_csv(
    "data/train.csv",
    index_col="id",
)

## Variables globales

In [24]:
TARGET = "prognosis"
LABELS = df[TARGET].unique()

In [26]:
X = df.copy()
y = X.pop(TARGET)

In [41]:
le = LabelEncoder()
y = le.fit_transform(y)

In [27]:
var_numeriques = []

In [28]:
var_categoriques = X.columns

## Fonctions et variables utiles

In [11]:
score_modeles = []

**Tableau.** Informations sur notre classification

|:---------------------------|:----------------------------------|
| **Objectif métier**        | Prédire l'acceptation à une campagne marketing |
| **Problème technique**     | Classification binaire supervisée |
| **Métrique**               | Score F1 sur la classe 1 (clients qui acceptent)<br>À score F1 égal, on choisit la meilleure précision sur la classe 1 |
| **Méthode d'entraînement** | Validation croisée en 5 blocs     |
| **Pré-traitement**         | Variables quantitatives : centrer/réduire<br>Variables qualitatives : OneHot Encoding (Tableau Disjonctif Complet) |
| **Équilibrage des classes**| 1) Aucun<br>2) Sous-échantillonnage aléatoire manuel<br>3) Sur-échantillonnage avec SMOTE |


In [12]:
def evaluate_models(models, prefix, X_train, X_test, y_train, y_test):
    """Evalue tous les modèles dans `models` et sauvegarde les résultats avec un préfixe `prefix`
    (utile pour distinguer les différentes stratégies de pré-traitement des données)."""
    results = []

    for model, model_name in models:
        name = f"{prefix}/{model_name}"

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        precision = accuracy_score(y_test, y_pred)
        scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring=make_scorer(f1_score, labels=[LABELS[1]]),
        )
        scores_mean = scores.mean()
        scores_std = scores.std()

        # clf_report = pd.DataFrame(
        #     classification_report(y_test, y_pred, output_dict=True)
        # ).T
        # cm = confusion_matrix(y_test, y_pred, labels=LABELS, normalize="true")
        # # sns.heatmap(cm, annot=True, cmap="Purples", vmin=0, vmax=1)
        #
        # score_f1_classe1 = clf_report.iloc[1, 2]

        results.append(
            [
                name,
                scores_mean,
                #             scores_std,
            ]
        )
        score_modeles.extend(
            (
                [
                    name,
                    "score_f1_classe1",
                    scores_mean,
                    #               scores_std,
                    precision,
                ],
            )
        )

    return results

## MAP@K

In [152]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [153]:
def map3_score(y_true, y_pred):
    """ Renvoie le score MAP@3 des données prédites y_pred par rapport aux vraies données y_true. """
    sorted_prediction_ids = np.argsort(-y_pred, axis=1)
    top_3_prediction_ids = sorted_prediction_ids[:, :3]

    return mapk(y_true.reshape(-1, 1), top_3_prediction_ids, k=3)

In [154]:
def top3_predictions(model, X, label_encoder=le):
    y_pred = model.predict_proba(X)

    sorted_prediction_ids = np.argsort(-y_pred, axis=1)
    top_3_prediction_ids = sorted_prediction_ids[:, :3]

    original_shape = top_3_prediction_ids.shape
    top_3_predictions_array = label_encoder.inverse_transform(top_3_prediction_ids.reshape(-1, 1))
    top_3_predictions_array = top_3_predictions_array.reshape(original_shape)

    return top_3_predictions_array

# Liste des modèles

**Tableau.** Liste des modèles de notre étude

|:---------------------------|:----------------------------------|
| **Modèles de référence**   | Classificateur Idiot Uniforme (50% de oui et 50% de non)<br>Classificateur Idiot Constant 1 (100% de oui) |
| **Modèles linéaires**  | Régression logistique<br>Analyse Discriminante Linéaire |
| **Arbres de décision**               | Arbre de décision<br>Forêt d'arbres de décision (Random Forest) |
| **Gradient Boosting** | XGBoost<br>LightGBM<br>CatBoost |
| **Machine à vecteurs de support (SVM)** | Classificateur SVM linéaire |
| **k plus proches voisins (k-NN)** | Classificateur k-nn (5 voisins)<br>Classificateur k-nn (15 voisins) |
| **Modèle de vote**| Modèle de "Vote à la majorité" sur 5 modèles :<br>- Régression logistique<br>- Analyse discriminante linéaire<br>- Random Forest<br>- XGBoost<br>- CatBoost |
| **Réseau de neurones**| Réseau de neurones à 5 couches et 1 600 neurones |


In [14]:
LABEL_DUMMY_CONSTANT = LABELS[0]

In [15]:
models = [
    [DummyClassifier(strategy="uniform", random_state=SEED), "DummyClassifier_Uniform"],
    [
        DummyClassifier(strategy="constant", constant=LABEL_DUMMY_CONSTANT, random_state=SEED),
        f"DummyClassifier_Constant_{LABEL_DUMMY_CONSTANT}",
    ],
    [LogisticRegression(random_state=SEED), "LogisticRegression"],
    [LinearDiscriminantAnalysis(), "LinearDiscriminantAnalysis"],
    [DecisionTreeClassifier(random_state=SEED), "DecisionTreeClassifier"],
    [RandomForestClassifier(random_state=SEED), "RandomForestClassifier"],
    [xgboost.XGBClassifier(random_state=SEED), "XGBClassifier"],
    [CatBoostClassifier(random_state=SEED, verbose=False), "CatBoostClassifier"],
    [LGBMClassifier(random_state=SEED), "LGBMClassifier"],
    [LinearSVC(random_state=SEED), "LinearSVC"],
    # [BernoulliNB(), "BernoulliNB"],
    # [ComplementNB(), "ComplementNB"],
    [KNeighborsClassifier(), "KNeighborsClassifier5"],
    [KNeighborsClassifier(n_neighbors=15), "KNeighborsClassifier15"],
    [
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=SEED)),
                ("lda", LinearDiscriminantAnalysis()),
                ("dt", RandomForestClassifier(random_state=SEED)),
                ("xgb", xgboost.XGBClassifier(random_state=SEED)),
                ("catboost", CatBoostClassifier(random_state=SEED, verbose=False)),
            ],
            voting="soft",
        ),
        "VotingClassifier",
    ],
]

# Traitement des données

## Pipeline Scaler & OneHotEncoding

In [20]:
var_cat_non_ohe = X.columns

In [21]:
var_cat_ohe = []

In [29]:
df[var_categoriques].head()

Unnamed: 0_level_0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,lymph_swells,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0


In [30]:
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("ohe", OneHotEncoder(), var_cat_ohe),
        ("scaler", RobustScaler(), var_numeriques),
    ],
)

In [32]:
nouveau_df = pd.DataFrame(preprocessor.transform(X), index=df.index)

In [33]:
nouveau_df.shape

(707, 64)

## Par défaut

In [99]:
X_train, X_test, y_train, y_test = train_test_split(
    nouveau_df, y, test_size=0.2, random_state=SEED
)

In [100]:
y_train

array([ 2,  7,  6,  7,  6,  1,  7,  4,  0,  9,  8,  3,  9,  8,  4,  2,  0,
        5,  1,  6,  6,  0,  8,  3,  0,  8,  1, 10,  8, 10,  6,  0,  6, 10,
        6,  2,  2,  9,  0,  6,  4,  9,  7,  0,  8,  8,  0,  0, 10,  6,  4,
        6,  1,  1,  8,  7,  8,  6,  2,  6,  6,  1,  8,  0,  9,  9,  8,  0,
        7,  2,  3,  9,  4,  8,  1,  7,  8,  2,  7,  8,  9,  4,  9,  6,  8,
        7, 10,  6,  1,  5,  0,  4,  2,  0,  6,  9,  2,  9,  6,  0,  4,  8,
        8,  7,  0,  4,  2,  3, 10,  0,  6,  1,  5,  4,  2,  7,  8,  6,  7,
        2,  2,  4, 10,  5,  9,  3,  8,  6,  1,  8,  2, 10,  5,  6,  2,  3,
        9,  8,  4,  9,  9,  1,  0,  5,  5,  5,  2,  9, 10,  7,  9,  3,  2,
        8,  8,  1,  8,  2,  8,  9,  7,  2,  7, 10,  5,  0,  7,  2, 10,  8,
        5,  8,  9,  6,  1,  1,  4,  8,  3,  6,  6,  4, 10,  9,  9,  0,  3,
        2,  8, 10,  8,  3,  8,  2,  8,  7,  9,  8,  7,  3,  0,  4,  3,  2,
        0,  0,  7,  5,  6,  6,  2,  5,  9, 10,  1,  3,  4,  5,  7,  2,  8,
        8,  0,  2,  0,  8

In [101]:
nouveau_df.shape

(707, 64)

In [102]:
xgb = xgboost.XGBClassifier()

In [103]:
xgb.fit(X_train, y_train)

In [131]:
rf = CatBoostClassifier(random_state=SEED, verbose=False)
rf.fit(X_train, y_train)

Learning rate set to 0.076856
0:	learn: 2.3627971	total: 63.3ms	remaining: 1m 3s
1:	learn: 2.3275472	total: 75.4ms	remaining: 37.6s
2:	learn: 2.2899332	total: 85.3ms	remaining: 28.3s
3:	learn: 2.2553277	total: 94.6ms	remaining: 23.6s
4:	learn: 2.2192184	total: 104ms	remaining: 20.8s
5:	learn: 2.1872529	total: 113ms	remaining: 18.7s
6:	learn: 2.1631502	total: 123ms	remaining: 17.4s
7:	learn: 2.1400406	total: 131ms	remaining: 16.2s
8:	learn: 2.1172277	total: 138ms	remaining: 15.2s
9:	learn: 2.0906811	total: 145ms	remaining: 14.4s
10:	learn: 2.0651955	total: 153ms	remaining: 13.7s
11:	learn: 2.0351482	total: 160ms	remaining: 13.2s
12:	learn: 2.0096160	total: 168ms	remaining: 12.8s
13:	learn: 1.9878753	total: 175ms	remaining: 12.3s
14:	learn: 1.9649497	total: 183ms	remaining: 12s
15:	learn: 1.9406329	total: 190ms	remaining: 11.7s
16:	learn: 1.9249374	total: 198ms	remaining: 11.4s
17:	learn: 1.9044035	total: 206ms	remaining: 11.2s
18:	learn: 1.8865688	total: 213ms	remaining: 11s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x7fcf02656ad0>

In [156]:
y_pred = rf.predict_proba(X_test)

In [157]:
top3_predictions(rf, X_train)

  y = column_or_1d(y, warn=True)


array([['Japanese_encephalitis', 'Tungiasis', 'Rift_Valley_fever'],
       ['Tungiasis', 'West_Nile_fever', 'Rift_Valley_fever'],
       ['Rift_Valley_fever', 'West_Nile_fever', 'Plague'],
       ...,
       ['Plague', 'Yellow_Fever', 'Zika'],
       ['Yellow_Fever', 'Japanese_encephalitis', 'Zika'],
       ['West_Nile_fever', 'Lyme_disease', 'Plague']], dtype=object)

In [158]:
map3_score(y_test, y_pred)

0.35563380281690143

In [51]:
prefix = "défaut"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

Traceback (most recent call last):
  File "/home/ab2/.cache/pypoetry/virtualenvs/kaggle-competitions-ww632Mhq-py3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/ab2/.cache/pypoetry/virtualenvs/kaggle-competitions-ww632Mhq-py3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/ab2/.cache/pypoetry/virtualenvs/kaggle-competitions-ww632Mhq-py3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1146, in f1_score
    return fbeta_score(
  File "/home/ab2/.cache/pypoetry/virtualenvs/kaggle-competitions-ww632Mhq-py3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1287, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/home/ab2/.cache/pypoetry/virtualenvs/kaggle-competitions-ww632Mhq-py3.10/li

ValueError: The constant target value must be present in the training data. You provided constant=Lyme_disease. Possible values are: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10].

In [None]:
sorted(results, key=lambda x: x[1], reverse=True)

## Équilibrage des classes

### Under-sampling (manuel)

In [None]:
# samples0 = df[df["Response"] == 0].sample(350, random_state=SEED)

In [None]:
# X_eq = pd.concat((samples0, df[df["Response"] == 1]))

In [None]:
# X_eq["Response"].hist()

In [None]:
# y_eq = X_eq.pop("Response").astype(int)

In [None]:
# X_eq = pd.get_dummies(X_eq.drop(columns=["Dt_Customer"]))

In [None]:
# df["Response"].value_counts()

In [None]:
NB_A_SUPPRIMER = 1400
drop_indices = np.random.choice(
    nouveau_df[y["Response"] == 0].index, NB_A_SUPPRIMER, replace=False
)
df_subset = nouveau_df.drop(drop_indices)

In [None]:
y_eq = y.drop(index=drop_indices)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_subset, y_eq, test_size=0.2, random_state=SEED
)

In [None]:
prefix = "éq_classes"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
sorted(results, key=lambda x: x[1], reverse=True)

### Over-sampling (SMOTE)

In [None]:
cat_cols = list(X.select_dtypes(include=["category", "int", "bool"]).columns)

In [None]:
cat_cols_index = list(map(lambda c: list(X.columns).index(c), cat_cols))

In [None]:
list(X.iloc[:, cat_cols_index].columns)

In [None]:
sm = SMOTENC(
    categorical_features=cat_cols_index,
    random_state=SEED,
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    nouveau_df, y, test_size=0.2, random_state=SEED
)

In [None]:
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [None]:
y_train_sm.value_counts(normalize=True)

In [None]:
prefix = "SMOTE"
results = evaluate_models(models, prefix, X_train_sm, X_test, y_train_sm, y_test)

In [None]:
sorted(results, key=lambda x: x[1], reverse=True)

# Réseau de neurones

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_subset, y_eq, test_size=0.2, random_state=SEED
)

In [None]:
X_train = np.asarray(X_train).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_test = np.asarray(X_test).astype("float32")
y_test = np.asarray(y_test).astype("float32")

In [None]:
X_train.shape

In [None]:
np.random.seed(0)
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],  # "binary_accuracy"
)

In [None]:
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    # validation_split=0.2,
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0,  # hide the output because we have so many epochs
)

In [None]:
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
history_df.loc[5:, ["loss", "val_loss"]].plot()
history_df.loc[5:, ["binary_accuracy", "val_binary_accuracy"]].plot()

print(
    ("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}").format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)

In [None]:
y_pred = model.predict(X_test)

In [None]:
sns.histplot(y_pred > 0.5, discrete=True)

In [None]:
y_pred_old = y_pred

In [None]:
y_pred = y_pred > 0.5

In [None]:
print(classification_report(y_test, y_pred, labels=LABELS))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=LABELS)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
disp.plot()

In [None]:
nom_modele = "Réseau de Neurones"
# ajout_score(model, nom_modele, y_test, y_pred)

# Diagnostic du modèle

## RandomForest feature importance

In [None]:
nom_colonnes = preprocessor.get_feature_names_out(X.columns)

In [None]:
nom_colonnes = list(map(lambda x: x.split("__")[1], nom_colonnes))

In [None]:
rf = models[6][0]

In [None]:
fi = rf.feature_importances_

In [None]:
fi = pd.DataFrame(fi.reshape((1, len(fi))), columns=nom_colonnes)

In [None]:
fi = fi.sort_values(
    by=0, axis=1, ascending=False
)  # trier les colonnes en fonction de la ligne 0

In [None]:
plt.figure(figsize=(5, 12))
plt.title("Importance donnée par le modèle RandomForest")
sns.barplot(fi, orient="h", color="gray")

## Permutation importance

In [None]:
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=SEED)

In [None]:
# sns.histplot(result.importances_std < 0.015)

In [None]:
pi_results = result.importances_mean

In [None]:
pi_results = pd.DataFrame(
    pi_results.reshape((1, len(pi_results))), columns=nom_colonnes
)
pi_results = pi_results.sort_values(by=0, axis=1, ascending=False)

In [None]:
plt.figure(figsize=(5, 12))
plt.title("Importance de Permutation du modèle RandomForest")
sns.barplot(pi_results, orient="h", color="gray")

# Pour aller plus loin

- afficher les intervalles de confiance des scores de validation croisée
- optimiser les hyper-paramètres des modèles (avec une recherche en grille ou une recherche bayésienne)
- tester les modèles sur différents sous-ensembles de variables pour les comparer

# Sauvegarde des données

In [None]:
score_modeles_df = pd.DataFrame(
    score_modeles, columns=["Modèle", "Métrique", "Valeur", "Précision"]
)

In [None]:
score_modeles_df.to_csv("data/results/classifications.csv", index=False)