# Import des outils / jeu de données

In [None]:
import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import xgboost
from keras import layers
from scipy import stats
from scipy.stats import kstest, pearsonr, poisson
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, RobustScaler
from statsmodels.compat import lzip
from statsmodels.graphics.regressionplots import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow import keras

from functions import affiche_score

In [None]:
# Seed NumPy's global random generator and apply seaborn's default plot theme.
np.random.seed(0)
sns.set_theme()

In [None]:
# Load the cleaned dataset, using the "ID" column as the row index.
# NOTE(review): parse_dates=True asks pandas to try parsing the index as dates;
# Dt_Customer is converted explicitly further down — confirm the flag is needed.
df = pd.read_csv(
    # "data/data-cleaned-feature-engineering.csv",
    "data/data-cleaned.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Load the transformed dataset from its own CSV, using "ID" as the row index.
df_transforme = pd.read_csv(
    "data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# Names of the columns handled as numeric variables.
var_numeriques = [
    "Year_Birth",
    "Income",
    "Recency",
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumDealsPurchases",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
]

In [None]:
# Names of the columns handled as categorical variables.
var_categoriques = [
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
]

In [None]:
# Convert the categorical columns to strings, then to pandas' "category" dtype.
df[var_categoriques] = df[var_categoriques].astype(str).astype("category")

In [None]:
# Parse the Dt_Customer dates and store them as integers so the column can be
# used as a numeric feature. The width is spelled out as "int64" because the
# builtin `int` is platform dependent (it can map to a 32-bit integer), which
# makes the datetime-to-integer conversion fail on some pandas/NumPy setups.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%Y-%m-%d").astype(
    "int64"
)

## Fonctions et variables utiles

In [None]:
# Accumulates rows of the form [model name, metric name, value].
score_modeles = []

In [None]:
def ajout_score(modele, nom_modele, y_test, y_pred):
    """Append the MSE, RMSE and MAE of a model to the ``score_modeles`` list.

    Three rows of the form ``[nom_modele, metric_name, value]`` are appended,
    where ``metric_name`` is ``"mse"``, ``"rmse"`` and ``"mae"`` in that order.

    Args:
        modele: Model object. Not used by this function; kept so that existing
            calls keep working.
        nom_modele: Label stored alongside each metric.
        y_test: True target values.
        y_pred: Predicted target values.
    """
    mse = mean_squared_error(y_test, y_pred)
    score_modeles.extend(
        (
            [nom_modele, "mse", mse],
            # The RMSE is derived from the MSE instead of passing
            # ``squared=False``, which scikit-learn deprecated in 1.4 and
            # removed in 1.6. Both give the same value for a single target.
            [nom_modele, "rmse", np.sqrt(mse)],
            [nom_modele, "mae", mean_absolute_error(y_test, y_pred)],
        )
    )

## Préparation du jeu de données

## Normalisation

### Explication

Pour normaliser les données, nous allons utiliser la transformation de Box-Cox, définie pour tout $x > 0$ comme ci-dessous :
$B(x, \lambda) = \begin{cases} \frac{x^{\lambda} - 1}{\lambda} & \text{  si } \lambda \neq 0 \\ \log(x) & \text{  si } \lambda = 0 \end{cases}$

Cette transformation est à appliquer à une variable (strictement positive), en ajustant le $\lambda$ pour maximiser la normalité.

Nous allons utiliser la fonction `scipy.stats.boxcox`, qui estime le meilleur paramètre $\lambda$ (par maximum de vraisemblance).

### Sélection des variables

In [None]:
# Histogram (30 bins) of every numeric column, to inspect their distributions.
df[var_numeriques].hist(figsize=(12, 12), bins=30)
plt.show()

In [None]:
# Columns to which the Box-Cox transformation is applied.
var_a_normaliser = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

### Transformation

In [None]:
# Distribution of MntWines with a kernel density estimate overlaid.
sns.histplot(df["MntWines"], kde=True)

In [None]:
boxcox_lambdas = {}  # keep the fitted lambdas, needed for the inverse transformation

In [None]:
# Box-Cox is only defined for strictly positive values, so every column is
# first shifted so that its smallest value becomes 1, then transformed.
for var in var_a_normaliser:
    # The minimum has to be *subtracted*: adding it only works when it is 0 and
    # pushes a column with negative values even further below zero.
    # To undo the transformation later, this shift must be reverted as well:
    # original = inv_boxcox(values, lambda) + min - 1.
    var_strict_positif = df[var] - df[var].min() + 1

    # With no lambda supplied, scipy estimates it and returns
    # (transformed values, fitted lambda). `boxcox` is reached through the
    # already imported `scipy.stats` module (`stats`).
    var_apres_boxcox, lambda_optimal = stats.boxcox(var_strict_positif)

    df_transforme[var] = var_apres_boxcox
    boxcox_lambdas[var] = lambda_optimal

In [None]:
# Tuto : comment récupérer la fonction initiale
# (il faut avoir récupéré le paramètre "l" lambda)

# from scipy.special import inv_boxcox
# initial = inv_boxcox(incbox, l)
# initial = pd.DataFrame(initial)
# sns.histplot(initial, bins=50, kde=True)

### Fin de la normalisation (todo)

In [None]:
# Explanatory variables: every column of df except NumStorePurchases.
X = df.drop(columns=["NumStorePurchases"])

### Variables catégoriques

In [None]:
# One-hot encode Marital_Status into indicator columns prefixed "Marital_Status".
marital_status = pd.get_dummies(X["Marital_Status"], prefix="Marital_Status")

In [None]:
# Preview the first rows of the one-hot encoded columns.
marital_status.head()

In [None]:
# Education is encoded as an ordered variable: each level is replaced by its
# position in the list below.
ordre_education = ["Basic", "2n Cycle", "Graduation", "Master", "PhD"]
encoder_education = OrdinalEncoder(categories=[ordre_education])

# Rebuild a one-column DataFrame sharing the index of the one-hot encoded
# columns, so both can be concatenated row by row.
education_codes = encoder_education.fit_transform(X[["Education"]])
education = pd.DataFrame(
    education_codes, index=marital_status.index, columns=["Education"]
)

In [None]:
# Put the encoded categorical columns side by side.
cat_col = pd.concat((marital_status, education), axis=1)

In [None]:
# Remove the two raw columns from X in place; their encoded versions are in cat_col.
X.drop(columns=["Marital_Status", "Education"], inplace=True)

In [None]:
# Append the encoded categorical columns to the remaining features.
X = pd.concat((X, cat_col), axis=1)

## Scaling

In [None]:
# Fit a RobustScaler on X and scale it in one step.
# NOTE(review): the scaler is fitted on every row of X, i.e. including rows that
# may later be held out for testing — consider fitting on training rows only.
scaler = RobustScaler()
X_scale = scaler.fit_transform(X)

In [None]:
# Target: NumStorePurchases cast to int, kept as a one-column DataFrame.
y = df[["NumStorePurchases"]].astype(int)

In [None]:
# Hold out 20 % of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)

# XGBoost

In [None]:
# XGBoost regressor with 1000 trees and a 0.05 learning rate, run on 4 threads;
# "mae" is set as the evaluation metric.
tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    eval_metric="mae",
    random_state=0,
)

In [None]:
# Fit the regressor on the training split.
tuned_xgb.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features.
y_pred = tuned_xgb.predict(X_test)

In [None]:
# Report the scores with the project helper imported from `functions`
# (presumably displays evaluation metrics — see its definition).
affiche_score(tuned_xgb, y_test, y_pred)

In [None]:
# Display the importance the fitted model assigns to each feature.
tuned_xgb.feature_importances_

In [None]:
# Keep the feature importances under a short name.
fi = tuned_xgb.feature_importances_

In [None]:
# Lay the importances out as a single row whose columns are the feature names.
fi = pd.DataFrame(fi.reshape(1, -1), columns=X.columns)

In [None]:
fi = fi.sort_values(
    by=0, axis=1, ascending=False
)  # order the columns by the values in row 0, largest first

In [None]:
# Show the sorted importances vertically: one row per feature.
fi.T

In [None]:
# Horizontal bar chart of the sorted feature importances.
plt.figure(figsize=(5, 12))
sns.barplot(fi, orient="h", color="gray")

In [None]:
# Permutation importance on the test split, with 3 repeats per feature.
result = permutation_importance(tuned_xgb, X_test, y_test, n_repeats=3, random_state=0)

In [None]:
# Display the mean and standard deviation of the importances over the repeats.
result.importances_mean, result.importances_std

In [None]:
# Keep only the mean importances.
pi_results = result.importances_mean

In [None]:
# Lay the mean permutation importances out as a single row labelled by feature
# name, then order the columns by the values in row 0, largest first.
pi_results = pd.DataFrame(pi_results.reshape(1, -1), columns=X.columns).sort_values(
    by=0, axis=1, ascending=False
)

In [None]:
# Show the sorted permutation importances vertically: one row per feature.
pi_results.T

In [None]:
# Horizontal bar chart of the sorted permutation importances.
plt.figure(figsize=(5, 12))
sns.barplot(pi_results, orient="h", color="gray")

# Test X2

In [None]:
# Second feature set: a subset of seven columns of df.
X2 = df[
    [
        "MntWines",
        "MntMeatProducts",
        "Income",
        "MntSweetProducts",
        "NumCatalogPurchases",
        "MntFruits",
        "Response",
    ]
]

In [None]:
# Fit a new RobustScaler on X2 and scale it; this replaces the previous
# `scaler` and `X_scale`.
# NOTE(review): the scaler is fitted on every row of X2, i.e. including rows that
# may later be held out for testing — consider fitting on training rows only.
scaler = RobustScaler()
X_scale = scaler.fit_transform(X2)

In [None]:
# Target: NumStorePurchases cast to int, kept as a one-column DataFrame.
y = df[["NumStorePurchases"]].astype(int)

In [None]:
# Hold out 20 % of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)

In [None]:
# XGBoost regressor with 1000 trees and a 0.05 learning rate, run on 4 threads.
tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    random_state=0,
)

In [None]:
# Fit the regressor on the training split.
tuned_xgb.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features.
y_pred = tuned_xgb.predict(X_test)

In [None]:
# Report the scores with the project helper imported from `functions`
# (presumably displays evaluation metrics — see its definition).
affiche_score(tuned_xgb, y_test, y_pred)

In [None]:
# Grid of random-forest hyper-parameters explored by the search.
param_grid = {
    "n_estimators": [50, 100, 200],
    # "sqrt" is what "auto" stood for on classifiers; the "auto" alias was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    "max_features": ["sqrt"],
    "max_depth": [None, 3, 5, 8],
    "criterion": ["gini"],
    "min_samples_split": [2, 3, 4],
}

# Every combination of the grid is scored with 5-fold cross-validation.
# NOTE(review): this is a classifier, whereas the other models fitted on this
# target are regressors — confirm that treating the counts as classes is intended.
rf_models = GridSearchCV(
    RandomForestClassifier(random_state=5), param_grid=param_grid, cv=5, verbose=1
)
# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning on every fit.
rf_models.fit(X_train, y_train.to_numpy().ravel())

# Refactor

In [None]:
# Tuned XGBoost configuration, built separately to keep the list below readable.
# NOTE(review): early_stopping_rounds normally requires an eval_set when fit()
# is called — confirm that the code fitting these models provides one.
xgb_optimise = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    eval_metric="mae",
    early_stopping_rounds=20,
    random_state=0,
)

# Candidate models, each paired with the label used to identify it.
models = [
    [LinearRegression(), "Régression linéaire"],
    [PoissonRegressor(), "GLM Poisson"],
    [PLSRegression(), "Régression PLS"],
    [xgboost.XGBRegressor(), "XGBoost"],
    [xgb_optimise, "XGBoost optimisé"],
]