# Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif

from constants import var_categoriques, var_numeriques

In [None]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(0)
# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()

In [None]:
# Load "data-cleaned-feature-engineering.csv" with the "ID" column as the index.
# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse the
# index as dates; date-like columns such as "Dt_Customer" are presumably left
# unparsed — confirm.
df = pd.read_csv(
    "../data/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Load "data-transformed.csv" with the "ID" column as the index.
# NOTE(review): df_transforme is not referenced again in the visible part of
# this notebook — confirm it is still needed.
df_transforme = pd.read_csv(
    "../data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# The two class labels.  NOTE(review): presumably the values taken by the
# "Response" target — confirm.
LABELS = (0, 1)
# Shared seed for the `random_state=SEED` arguments used further down in this
# notebook; 0 matches the seed already given to np.random.seed and to the
# IsolationForest.
SEED = 0

# Isolation Forest (détection d'outliers)

In [None]:
# Feature matrix: every column except the target ("Response") and "Dt_Customer",
# passed through pd.get_dummies to one-hot encode the categorical columns.
X = pd.get_dummies(df.drop(columns=["Response", "Dt_Customer"]))
# Target, kept as a single-column DataFrame and cast to int.
y = df[["Response"]].astype(int)

In [None]:
# Isolation Forest with default settings; random_state is fixed for repeatable results.
iforest = IsolationForest(random_state=0)

In [None]:
# Fit the detector on the whole feature matrix (no train/test split at this point).
iforest.fit(X)

In [None]:
# Preview the first few rows of the feature matrix.
X.head()

In [None]:
# Histogram of the predicted labels; the title used a few cells below reads
# -1 as "outlier" and 1 as "normal".
sns.histplot(iforest.predict(X))

In [None]:
# Store the predicted label as a new "outlier" column of X.
# NOTE(review): X is modified in place, so this column is still present when X
# is reused for the mutual-information scores later in the notebook — confirm
# that scoring it as a feature is intended.
X["outlier"] = iforest.predict(X)

In [None]:
# Income histogram (30 bins, KDE overlay) coloured by the "outlier" flag.
plt.title("Outliers (-1) vs Normaux (1)")
sns.histplot(data=X, hue="outlier", x="Income", bins=30, kde=True)

In [None]:
# Same Income histogram, restricted to the rows whose "outlier" flag is 1.
sns.histplot(data=X[X["outlier"] == 1], x="Income", bins=30, kde=True)

# Optimisation des hyper-paramètres

In [None]:
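# TODO: tune the model hyper-parameters (the commented-out GridSearchCV sketch below is a starting point).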

In [None]:
# params = {
#     "max_depth": [3, 6, 10],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "n_estimators": [100, 500, 1000],
#     "colsample_bytree": [0.3, 0.7],
# }
#
# clf = GridSearchCV(
#     estimator=model,
#     param_grid=params,
#     scoring="precision",
#     verbose=1,
# )
#
# clf.fit(X_train, y_train)

# Mutual Information

### Sans OneHotEncoding

In [None]:
# Label-encode the categorical/boolean columns of df in place: factorize()
# replaces each distinct value with an integer code.
for colname in df.select_dtypes(["object", "category", "bool"]):
    df[colname], _ = df[colname].factorize()

# Flag every integer-typed column as discrete for the MI estimator.
# is_integer_dtype is used instead of `dtype == int` so that int32/uint8 columns
# (any integer width other than the platform default) are not silently treated
# as continuous.  An integer dtype alone does not prove a feature is discrete —
# double-check this before using MI!
discrete_features = df.dtypes.map(pd.api.types.is_integer_dtype)

In [None]:
# Remove the target's entry from the mask so it lines up with the feature
# columns passed to make_mi_scores, which exclude "Response".
discrete_features.drop("Response", axis=0, inplace=True)

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Score each feature by its mutual information with a class-label target.

    Args:
        X: Feature table (DataFrame); its column names index the result.
        y: Class labels, either 1-D or a single-column table.
        discrete_features: Mask (or indices) of the discrete columns of X,
            forwarded unchanged to scikit-learn.

    Returns:
        A pandas Series named "MI Scores", indexed by the columns of X and
        sorted from the highest score to the lowest.
    """
    # The target scored in this notebook ("Response") is a 0/1 class label, so
    # the classification estimator is used; mutual_info_regression is meant for
    # continuous targets.
    # scikit-learn expects a 1-D target and warns on a column vector, so flatten.
    target = np.asarray(y).ravel()
    raw_scores = mutual_info_classif(
        X, target, discrete_features=discrete_features, random_state=SEED
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

In [None]:
# MI of every column of df except the target, against y.
# NOTE(review): df still contains "Dt_Customer" here (it was only dropped when
# building X), so it is scored as a feature too — confirm intended.
mi_scores = make_mi_scores(df.drop(columns=["Response"]), y, discrete_features)

In [None]:
def plot_mi_scores(scores):
    """Draw the MI scores as a horizontal bar chart via pyplot.

    The scores are ordered ascending, one bar is drawn per entry at positions
    0..n-1, and each bar is labelled with its index label.

    Args:
        scores: pandas Series of scores; its index supplies the tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Open a tall, narrow figure (figsize 5 x 12), then draw the bar chart of the scores on it.
plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

### Avec OneHotEncoding

In [None]:
# NOTE(review): X still carries the "outlier" column added after the Isolation
# Forest step, so it is scored as a feature below — confirm that is intended.
# Label-encode any categorical/boolean columns of X in place: factorize()
# replaces each distinct value with an integer code.
for colname in X.select_dtypes(["object", "category", "bool"]):
    X[colname], _ = X[colname].factorize()

# Flag every integer-typed column as discrete for the MI estimator.
# is_integer_dtype is used instead of `dtype == int` so that one-hot columns
# stored as uint8 (or any other non-default integer width) are not silently
# treated as continuous.  An integer dtype alone does not prove a feature is
# discrete — double-check this before using MI!
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

In [None]:
# MI of every column of the one-hot encoded matrix X against y; this overwrites
# the scores computed earlier from df.
mi_scores = make_mi_scores(X, y, discrete_features)

In [None]:
def plot_mi_scores(scores):
    """Plot MI scores as horizontal bars, ordered by ascending score.

    Redefines the helper of the same name declared earlier in the notebook,
    with the same behaviour.

    Args:
        scores: pandas Series of scores; index labels are used as y tick labels.
    """
    ascending_scores = scores.sort_values(ascending=True)
    feature_names = list(ascending_scores.index)
    bar_slots = np.arange(len(ascending_scores))
    plt.barh(bar_slots, ascending_scores)
    plt.yticks(bar_slots, feature_names)
    plt.title("Mutual Information Scores")


# Open a tall, narrow figure (figsize 5 x 12), then draw the bar chart of the one-hot scores on it.
plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

### Modèles après MI (avec OneHotEncoding)

In [None]:
# Boolean mask over the features: True where the MI score is strictly positive.
positive_mi = mi_scores > 0

In [None]:
# Names of the features whose mask entry is False, i.e. whose MI score is not positive.
cols_to_drop = positive_mi.loc[~positive_mi].index

In [None]:
# Keep only the columns with a positive MI score.
# NOTE(review): X_eq is not defined in the visible part of this notebook —
# confirm where it comes from and that it has the same columns as X.
X_positive_mi = X_eq.drop(columns=cols_to_drop)

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2), seeded with SEED.
# NOTE(review): train_test_split is not imported and y_eq is not defined in the
# visible part of this notebook — confirm both are available before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_positive_mi, y_eq, test_size=0.2, random_state=SEED
)

In [None]:
# Evaluate the models on the MI-filtered split, passing "positive_mi" as the prefix.
# NOTE(review): evaluate_models and models are not defined in the visible part
# of this notebook — confirm where they come from.
prefix = "positive_mi"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the results by their second element (index 1), highest first.
# NOTE(review): presumably that element is each model's score — confirm.
sorted(results, key=lambda x: x[1], reverse=True)