In [19]:
# Render matplotlib figures as static images inside the notebook
%matplotlib inline

In [None]:
import os
import numpy as np
import pandas as pd
import cudf
import cuml
import gc
import cupy as cp
from cuml.pipeline import Pipeline
from cuml.preprocessing import StandardScaler
from cuml.preprocessing import SimpleImputer


# import seaborn as sns
import joblib
import lightgbm as lgb

import warnings


warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:

from src.modeling.p7_constantes import (
    DATA_CLEAN_DIR,
    DATA_INTERIM,
    VAL_SEED,
    MODEL_DIR
)
#from src.modeling.p7_util import timer, reduce_memory_cudf
from src.modeling.p7_preprocess import WithColNames
from src.modeling.p7_vif import VIFSelector

%load_ext autoreload
%autoreload 2
#autoreload 2 = Reload all modules (except those excluded by %aimport)
# every time before executing the Python code typed.

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Report the RAPIDS library versions used for this run
for lib_label, lib_module in (("cuDF", cudf), ("cuML", cuml)):
    print(f"{lib_label} version:", lib_module.__version__)

cuDF version: 24.04.01
cuML version: 24.04.00


# Lecture des données de Train

In [32]:
# Load the interim training set into a cuDF DataFrame and show its summary
train_csv_path = os.path.join(DATA_INTERIM, "01_v1_miss_200_train.csv")
train = cudf.read_csv(train_csv_path)
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 230634 entries, 0 to 230633
Columns: 555 entries, SK_ID_CURR to CC_CNT_DRAWINGS_CURRENT_VAR
dtypes: bool(158), float64(357), int64(40)
memory usage: 743.8 MB


In [33]:
# Separate the identifier and target columns from the features;
# both X and y are cast to float64.
non_feature_cols = ["SK_ID_CURR", "TARGET"]
X = train.drop(columns=non_feature_cols).astype(np.float64)
y = train["TARGET"].astype(np.float64)

print("Info X :")
# X.info() prints its own report and returns None, hence the extra "None" line
print(X.info())
print("\nInfo y :")
for label, value in (
    ("Classe", type(y).__name__),
    ("Type", y.dtype),
    ("Shape", y.shape),
):
    print(label, value)

Info X :
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 230634 entries, 0 to 230633
Columns: 553 entries, CODE_GENDER to CC_CNT_DRAWINGS_CURRENT_VAR
dtypes: float64(553)
memory usage: 983.5 MB
None

Info y :
Classe Series
Type float64
Shape (230634,)


In [34]:
# Release CPU and GPU memory: the raw frame is no longer needed once X and y exist.
# NOTE: `del train` raises NameError if this cell is re-run without reloading `train`.
del train
gc.collect()
# Use CuPy's public accessor rather than the private `_default_memory_pool` attribute
cp.get_default_memory_pool().free_all_blocks()

In [None]:
# Preprocessing pipeline: imputation followed by standard scaling.
# Each step is wrapped in the project's WithColNames helper
# (presumably to keep column names on the output — confirm in p7_preprocess).
preprocessing_steps = [
    ("imputer", WithColNames(SimpleImputer(copy=False))),
    ("scaler", WithColNames(StandardScaler(copy=False))),
]
pipe = Pipeline(preprocessing_steps)

In [36]:
# Fit the preprocessing pipeline on X and transform it in one call.
# NOTE(review): both steps are built with copy=False, so X itself may be
# modified in place — confirm before reusing X afterwards.
X_imputed = pipe.fit_transform(X)
# Preview the first rows of the transformed features
X_imputed.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,CC_AMT_BALANCE_MEAN,CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CC_AMT_CREDIT_LIMIT_ACTUAL_SUM,CC_AMT_DRAWINGS_ATM_CURRENT_MEAN,CC_AMT_DRAWINGS_ATM_CURRENT_VAR,CC_AMT_PAYMENT_CURRENT_SUM,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_ATM_CURRENT_VAR,CC_CNT_DRAWINGS_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_VAR
0,0.719318,-0.716468,-0.663017,-0.577923,-0.344564,-0.682496,-0.720626,-0.579867,0.759107,-1.714936,...,0.0,-2.865431e-16,3.402882e-16,-5.725882e-16,-2.25839e-16,-7.845202e-16,0.0,0.0,-3.408963e-16,1.546203e-16
1,0.719318,-0.716468,-0.663017,0.803851,0.041659,-0.146186,-0.383995,0.004782,0.565929,0.862777,...,0.0,-2.865431e-16,3.402882e-16,-5.725882e-16,-2.25839e-16,-7.845202e-16,0.0,0.0,-3.408963e-16,1.546203e-16
2,0.719318,1.395735,1.508258,-0.577923,0.159206,2.115867,1.842182,2.209396,-0.66299,-1.040448,...,0.0,-2.865431e-16,3.402882e-16,-5.725882e-16,-2.25839e-16,-7.845202e-16,0.0,0.0,-3.408963e-16,1.546203e-16
3,0.719318,-0.716468,-0.663017,-0.577923,-0.092679,-0.369649,-0.28515,-0.238822,-0.813199,1.821031,...,0.0,-2.865431e-16,3.402882e-16,-5.725882e-16,-2.25839e-16,-7.845202e-16,0.0,0.0,-3.408963e-16,1.546203e-16
4,-1.390044,-0.716468,-0.663017,-0.577923,0.713352,1.549669,1.905281,1.588206,-0.979857,-1.658595,...,0.0,-2.865431e-16,3.402882e-16,-5.725882e-16,-2.25839e-16,-7.845202e-16,0.0,0.0,-3.408963e-16,1.546203e-16


# VIF Exécution (sans validation croisée)

In [38]:
# First pass of the recursive VIF elimination (threshold 10, no cross-validation).
# max_iter is pinned to 100 so that a fresh run matches the recorded output:
# this pass removed exactly 100 features and the saved file name
# ("features_100_dropped_vif_.pkl") relies on that, whereas a default fit()
# call elsewhere in this notebook reports a 30-iteration cap.
selector = VIFSelector(vif_threshold=10)
selector.fit(X_imputed, epsilon=1e-6, max_iter=100)

Elimination récursive de features avec un VIF > 10
	Device : GPU
	shape X  : (230634, 553)
	Sans validation croisée

Itérations (100 itérations maximum)...
[Itération 1] (Elapsed time : 0:06:59) Max VIF: NAME_FAMILY_STATUS_Widow = inf
[Itération 2] (Elapsed time : 0:13:54) Max VIF: BURO_CREDIT_ACTIVE_Baddebt_MEAN = inf
[Itération 3] (Elapsed time : 0:20:47) Max VIF: BURO_CREDIT_CURRENCY_currency1_MEAN = inf
[Itération 4] (Elapsed time : 0:27:38) Max VIF: BURO_CREDIT_TYPE_Consumercredit_MEAN = inf
[Itération 5] (Elapsed time : 0:34:28) Max VIF: PREV_NAME_CONTRACT_TYPE_XNA_MEAN = inf
[Itération 6] (Elapsed time : 0:41:16) Max VIF: PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN = inf
[Itération 7] (Elapsed time : 0:48:04) Max VIF: PREV_NAME_CASH_LOAN_PURPOSE_Buildingahouseoranannex_MEAN = inf
[Itération 8] (Elapsed time : 0:54:50) Max VIF: PREV_NAME_PAYMENT_TYPE_Cashlessfromtheaccountoftheemployer_MEAN = inf
[Itération 9] (Elapsed time : 1:00:41) Max VIF: PREV_CODE_REJECT_REASON_VERIF_MEAN

In [44]:
# Drop the features eliminated by the first VIF pass.
# NOTE(review): X_imputed is rebound here, so re-running this cell with the same
# selector will presumably fail because the columns are already gone.
features_to_drop = selector.removed_features_
X_imputed = X_imputed.drop(columns=features_to_drop)
X_imputed.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 230634 entries, 0 to 230633
Columns: 453 entries, CODE_GENDER to CC_CNT_DRAWINGS_CURRENT_VAR
dtypes: float64(453)
memory usage: 797.1 MB


## Sauvegarde

In [40]:
# Persist the list of features removed by the first pass.
# The "100" in the file name and message is hard-coded, not derived from the list.
dropped_first_pass = selector.removed_features_
path_features = os.path.join(DATA_INTERIM, "features_100_dropped_vif_.pkl")
joblib.dump(dropped_first_pass, path_features)
print(f"Liste des 100 premières features enregistrée dans {path_features}")

Liste des 100 premières features enregistrée dans data/interim/features_100_dropped_vif_.pkl


In [42]:
# Show how many features were removed, then list them
removed_names = selector.removed_features_
print(f"{len(removed_names)} features suprimées :")
print(removed_names)

100 features suprimées :
['NAME_FAMILY_STATUS_Widow', 'BURO_CREDIT_ACTIVE_Baddebt_MEAN', 'BURO_CREDIT_CURRENCY_currency1_MEAN', 'BURO_CREDIT_TYPE_Consumercredit_MEAN', 'PREV_NAME_CONTRACT_TYPE_XNA_MEAN', 'PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN', 'PREV_NAME_CASH_LOAN_PURPOSE_Buildingahouseoranannex_MEAN', 'PREV_NAME_PAYMENT_TYPE_Cashlessfromtheaccountoftheemployer_MEAN', 'PREV_CODE_REJECT_REASON_VERIF_MEAN', 'PREV_NAME_TYPE_SUITE_Groupofpeople_MEAN', 'PREV_NAME_CLIENT_TYPE_Refreshed_MEAN', 'PREV_NAME_CONTRACT_STATUS_Unusedoffer_MEAN', 'PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN', 'PREV_NAME_GOODS_CATEGORY_AdditionalService_MEAN', 'PREV_NAME_PORTFOLIO_Cars_MEAN', 'PREV_CHANNEL_TYPE_Cardealer_MEAN', 'PREV_NAME_SELLER_INDUSTRY_Autotechnology_MEAN', 'PREV_PRODUCT_COMBINATION_POSotherwithinterest_MEAN', 'PREV_PRODUCT_COMBINATION_NA_MEAN', 'POS_NAME_CONTRACT_STATUS_Active_MEAN', 'PREV_PRODUCT_COMBINATION_CardXSell_MEAN', 'PREV_NAME_YIELD_GROUP_high_MEAN', 'PREV_PRODUCT_COMBINATION_CashXSe

In [43]:
# Show how many features remain after the first pass, then list them
remaining_names = selector.selected_features_
print(f"{len(remaining_names)} features restantes :")
print(remaining_names)

453 features restantes :
['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'BASEMENTAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'LIVINGAREA_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'LANDAREA_MEDI', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLA

## Poursuite de l'élimination itérative

In [45]:
# Second elimination pass on the already-reduced feature set.
# `selector` is rebound: the first-pass selector is no longer reachable after this cell.
fit_options = {"epsilon": 1e-6, "max_iter": 50}
selector = VIFSelector(vif_threshold=10)
selector.fit(X_imputed, **fit_options)

Elimination récursive de features avec un VIF > 10
	Device : GPU
	shape X  : (230634, 453)
	Sans validation croisée

Itérations (50 itérations maximum)...
[Itération 1] (Elapsed time : 0:04:17) Max VIF: APPROVED_AMT_GOODS_PRICE_MIN = 19.97
[Itération 2] (Elapsed time : 0:08:32) Max VIF: INSTAL_DAYS_ENTRY_PAYMENT_MEAN = 19.95
[Itération 3] (Elapsed time : 0:12:46) Max VIF: BURO_DAYS_CREDIT_MEAN = 19.21
[Itération 4] (Elapsed time : 0:17:00) Max VIF: CREDIT_TO_INCOME_RATIO = 16.01
[Itération 5] (Elapsed time : 0:21:13) Max VIF: POS_SK_DPD_DEF_MAX = 15.86
[Itération 6] (Elapsed time : 0:25:24) Max VIF: APPROVED_AMT_ANNUITY_MIN = 15.48
[Itération 7] (Elapsed time : 0:29:34) Max VIF: NAME_TYPE_SUITE_Unaccompanied = 14.99
[Itération 8] (Elapsed time : 0:33:44) Max VIF: PREV_NAME_GOODS_CATEGORY_Mobile_MEAN = 14.58
[Itération 9] (Elapsed time : 0:37:50) Max VIF: CLOSED_AMT_CREDIT_SUM_MAX = 14.57
[Itération 10] (Elapsed time : 0:41:58) Max VIF: APPROVED_APP_CREDIT_PERC_MEAN = 14.34
[Itération 1

## Sauvegarde

### Sauvegardes des listes de features

In [46]:
# Persist the list of features removed by the second pass.
# The "29" in the file name and message is hard-coded, not derived from the list.
dropped_second_pass = selector.removed_features_
path_features = os.path.join(DATA_INTERIM, "features_29_dropped_vif_.pkl")
joblib.dump(dropped_second_pass, path_features)
print(f"Liste des 29 dernières features supprimées enregistrée dans {path_features}")

Liste des 29 dernières features supprimées enregistrée dans data/interim/features_29_dropped_vif_.pkl


In [47]:
# Show how many features the second pass removed, then list them
removed_names = selector.removed_features_
print(f"{len(removed_names)} features suprimées :")
print(removed_names)

29 features suprimées :
['APPROVED_AMT_GOODS_PRICE_MIN', 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN', 'BURO_DAYS_CREDIT_MEAN', 'CREDIT_TO_INCOME_RATIO', 'POS_SK_DPD_DEF_MAX', 'APPROVED_AMT_ANNUITY_MIN', 'NAME_TYPE_SUITE_Unaccompanied', 'PREV_NAME_GOODS_CATEGORY_Mobile_MEAN', 'CLOSED_AMT_CREDIT_SUM_MAX', 'APPROVED_APP_CREDIT_PERC_MEAN', 'FLAG_DOCUMENT_3', 'YEARS_BEGINEXPLUATATION_MEDI', 'BURO_DAYS_CREDIT_UPDATE_MEAN', 'FONDKAPREMONT_MODE_NA', 'BURO_AMT_CREDIT_SUM_MEAN', 'APPROVED_AMT_DOWN_PAYMENT_MIN', 'AMT_GOODS_PRICE', 'BURO_AMT_CREDIT_MAX_OVERDUE_MEAN', 'PREV_NAME_SELLER_INDUSTRY_Clothing_MEAN', 'ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN', 'HOUSETYPE_MODE_NA', 'ACTIVE_CREDIT_DAY_OVERDUE_MAX', 'PREV_CNT_PAYMENT_MEAN', 'REGION_RATING_CLIENT', 'PREV_NAME_SELLER_INDUSTRY_Consumerelectronics_MEAN', 'POS_MONTHS_BALANCE_MAX', 'REFUSED_DAYS_DECISION_MEAN', 'PREV_AMT_APPLICATION_MEAN', 'APPROVED_HOUR_APPR_PROCESS_START_MAX']


In [48]:
# Show how many features remain after the second pass, then list them
remaining_names = selector.selected_features_
print(f"{len(remaining_names)} features restantes :")
print(remaining_names)

424 features restantes :
['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'BASEMENTAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'LIVINGAREA_MODE', 'LANDAREA_MEDI', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FL

### Sauvegarde du Train

In [52]:
# Persist the final list of features kept after both VIF passes
kept_features = selector.selected_features_
path_features = os.path.join(DATA_INTERIM, "features_selected_vif.pkl")
joblib.dump(kept_features, path_features)
print(f"Liste des features restantes enregistrée dans {path_features}")

Liste des features restantes enregistrée dans data/interim/features_selected_vif.pkl


In [51]:
# Reload the raw (unscaled) training set and keep only the identifier, the target
# and the features that survived the VIF elimination.
kept_columns = ["SK_ID_CURR", "TARGET"] + selector.selected_features_
train_csv_path = os.path.join(DATA_INTERIM, "01_v1_miss_200_train.csv")
train = cudf.read_csv(train_csv_path)[kept_columns]
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 230634 entries, 0 to 230633
Columns: 426 entries, SK_ID_CURR to CC_CNT_DRAWINGS_CURRENT_VAR
dtypes: bool(146), float64(249), int64(31)
memory usage: 532.2 MB


In [53]:
# Write the reduced training set to the cleaned-data directory (no index column)
filepath_train = os.path.join(DATA_CLEAN_DIR, "01_v2_vif_train.csv")
train.to_csv(filepath_train, index=False)
saved_message = (
    f"train après suppression des colonnes au VIF > 10 enregistré dans {filepath_train}"
)
print(saved_message)

train après suppression des colonnes au VIF > 10 enregistré dans data/cleaned/01_v2_vif_train.csv


## VIF des features restantes

In [None]:
# Split the remaining features into two buckets by VIF: below 5, and 5 or above
vifs = selector.get_vif()

# Features with a VIF strictly below 5
features_vif_inf_5 = [k for k, v in vifs.items() if v < 5.0]
print(f"{len(features_vif_inf_5)} features ont un VIF < 5:")
print(features_vif_inf_5)

# Features to keep an eye on. `>=` (rather than `>`) is used so that a VIF of
# exactly 5.0 is not silently left out of both lists.
features_vif_between_5_10 = [k for k, v in vifs.items() if v >= 5.0]
print(
    f'\n{len(features_vif_between_5_10)} features sont "à surveiller" (VIF compris entre 5 et 10) :'
)
print(features_vif_between_5_10)

376 features ont un VIF < 5:
['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_CITY_NOT_LIVE_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'BASEMENTAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'LIVINGAREA_MODE', 'LANDAREA_MEDI', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUM

# VIF et dataset déséquilibré

In [None]:
"""
le dataset est déséquilibré, or la procédure repose sur des régressions linéaires (donc des corrélations).
Les R2 obtenus (donc les VIFs) auraient été différents en ré-équilibrant le dataset auparavant.

Pour tenter d'illustrer cela, nous allons créer un petit dataset tout simple.
"""

In [None]:
"""
Nous créons un dataset avec un ratio de défaut de 8%.
Il comprend
- une feature "to_keep" égale à la target. Elle est donc 100% explicative, nous voulons la garder
- une feature "to_drop" très corrélée à la feature "to_keep". Celle-ci est égale à "to_keep" sur la classe majoritaire
mais comporte de légères différences dans la classe minoritaire. Elle n'est donc pas prédictive à 100% de la target.
- deux features complètement aléatoires (corrélées à aucune autre)

La procédure VIF devrait nous éliminer ou "to_keep" ou "to_drop" car celles-ci sont corrélées (complètement égales sur la classe majoritaire).
Cependant, nous préfèrerions garder "to_keep", puisque c'est la meilleure feature et éliminer les deux features aléatoires qui sont inutiles.
"""

In [208]:
# Build a small imbalanced toy dataset to illustrate how class imbalance
# interacts with VIF-based feature elimination.
# (cudf and cp are already imported in the notebook's import cell.)
cp.random.seed(1)

# Number of rows
n_rows = 1000

# Share of positives (1) in the target
target_ones_ratio = 0.08
n_ones = int(n_rows * target_ones_ratio)
n_zeros = n_rows - n_ones

# Target with 92% zeros followed by 8% ones (deliberately left unshuffled)
target = cp.array([0] * n_zeros + [1] * n_ones)

# to_keep is an exact copy of the target
to_keep = target.copy()

# to_drop: 0 everywhere except on the first `n_matching_ones` rows where target == 1.
# With 80 positives and 73 matches, to_drop differs from to_keep on the last 7 positives.
n_matching_ones = 73
to_drop = cp.zeros(n_rows, dtype=cp.int32)
one_indices = cp.where(target == 1)[0]
selected_indices = one_indices[:n_matching_ones]
to_drop[selected_indices] = 1

# random1: random floats in [0, 1)
random1 = cp.random.rand(n_rows)

# random2: random integers drawn with randint(0, 100)
random2 = cp.random.randint(0, 100, size=n_rows)

# Assemble the cuDF DataFrame
df = cudf.DataFrame(
    {
        "random1": random1,
        "random2": random2,
        "target": target,
        "to_keep": to_keep,
        "to_drop": to_drop,
    }
)

print(df.head())

    random1  random2  target  to_keep  to_drop
0  0.724492        2       0        0        0
1  0.438243       90       0        0        0
2  0.505169       48       0        0        0
3  0.933551       77       0        0        0
4  0.791575       94       0        0        0


In [211]:
# Last rows of the minority class (target == 1)
minority_rows = df[df["target"] == 1]
minority_rows.tail()

Unnamed: 0,random1,random2,target,to_keep,to_drop
995,0.011827,47.0,1.0,1.0,0.0
996,0.165524,67.0,1.0,1.0,0.0
997,0.658897,79.0,1.0,1.0,0.0
998,0.947329,74.0,1.0,1.0,0.0
999,0.838986,32.0,1.0,1.0,0.0


In [None]:
# Cast the toy frame to float64 (df is rebound), then split features from target
df = df.astype(np.float64)
y_tmp = df["target"]
X_tmp = df.drop(columns=["target"])

# Run the VIF elimination on the toy features with fit()'s default settings
selector_tmp = VIFSelector(vif_threshold=10.0)
selector_tmp.fit(X=X_tmp)

Elimination récursive de features avec un VIF > 10
	Device : GPU
	shape X  : (1000, 4)
	Sans validation croisée

Itérations (30 itérations maximum)...
[Itération 1] (Elapsed time : 0:00:00) Max VIF: to_keep = 10.60
[Itération 2] (Elapsed time : 0:00:00) Max VIF: to_drop = 1.00
Durée du Fit (hh:mm:ss) : 0:00:00
1 features ont un VIF trop élevé :
['to_keep']
3 features restantes


In [None]:
"""
Ici nous éliminons la feature que nous voudrions conserver.
"to_keep" est 100% identique à la target, tandis que "to_drop" a 7 lignes différentes (sur 1000 lignes au total et sur 80 de la classe minoritaire).
Certes, il n'est pas grave d'échanger "to_keep" par "to_drop", tant que nous ne rééquilibrons pas le dataset.
En effet, avec le déséquilibre actuel, les deux features sont très corrélées entre elles et très corrélées avec la target.

Mais si nous rééquilibrons le dataset, to_keep sera toujours 100% explicative de la target, tandis que to_drop le sera moins.
Sur un dataset tout simple comme cet exemple, on peut voir que l'impact du déséquilibre n'est pas important sur le VIF (sans ré-équilibrage).

Mais qu'en est-il sur un dataset volumineux comme le nôtre, qui comporte des multi_colinéarités complexes ?
L'élimination des features grâce au VIF améliorera-t-elle obligatoirement la performance des modèles ?
C'est plus difficile à évaluer.
"""