# Import des outils / jeu de données

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import (
    id_col,
    quali_var,
    quali_var_binary,
    quali_var_for_ohe,
    quanti_var,
    target,
)
from prediction import (
    create_models,
    create_x_pipeline,
    evaluate_models,
    make_prediction,
)
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)

In [2]:
# One seed shared by NumPy's global RNG, the model factory and the train/test split.
SEED = 0

# Apply seaborn's default theme to every figure drawn afterwards.
sns.set_theme()

# Seed NumPy's legacy global random state.
np.random.seed(SEED)

In [3]:
# Competition files: both are indexed by the identifier column named in `id_col`.
df = pd.read_csv("data/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)

# Additional dataset; no index_col is passed, so it keeps pandas' default integer index.
original_data = pd.read_csv("data/original_dataset_train.csv")

## Variables globales

In [4]:
# Target column, copied so later work never touches `df`.
y = df[target].copy()

# Feature matrix: quantitative columns first, then qualitative ones (also a copy).
X = df[quanti_var + quali_var].copy()

In [5]:
# Same target / feature selection as for `df`, applied to the additional dataset.
y_original_data = original_data[target].copy()
X_original_data = original_data[quanti_var + quali_var].copy()

# Liste des modèles

In [6]:
# Build the candidate models through the project helper, passing the shared seed.
# The result is later indexed by model name (e.g. models["LGBMClassifier"]),
# so it is presumably a dict-like mapping name -> estimator; verify in `prediction`.
models = create_models(SEED)

# Traitement des données

## Pipeline Scaler & OneHotEncoding

In [7]:
# Build the (still unfitted) feature preprocessing object via the project helper.
# It is used below through fit_transform / transform / get_feature_names_out.
preprocessor = create_x_pipeline()

In [8]:
# Fit the preprocessor on the full feature matrix and keep the transformed values.
# NOTE(review): the fit happens before the train/test split further down, so rows
# that end up in the test split also take part in the fit; consider fitting on the
# training rows only if that matters for the evaluation.
nouveau_df_values = preprocessor.fit_transform(X)

# Re-wrap the transformed values as a DataFrame that keeps the original row index
# and uses the feature names reported by the fitted preprocessor as columns.
nouveau_df = pd.DataFrame(
    nouveau_df_values,
    index=df.index,
    columns=preprocessor.get_feature_names_out(),
)

In [9]:
# Display (rows, columns) of the preprocessed feature table as a quick sanity check.
nouveau_df.shape

(1235, 69)

In [10]:
# Encode the target labels as integers.
# The encoder is fitted on the raw target column rather than on `y` itself:
# `y` is rebound to the encoded array on the next line, so encoding from `y`
# would make a second run of this cell re-fit `le` on integer codes and break
# the later `le.transform(...)` call on raw labels. Reading from `df[target]`
# gives the same result on the first run and keeps the cell idempotent.
le = LabelEncoder()
y = le.fit_transform(df[target])

In [11]:
# Encode the additional dataset's labels with the already-fitted label encoder.
y_encoded_original_data = le.transform(y_original_data)

# Apply the already-fitted preprocessor (transform only, no re-fit) to the
# additional dataset and rebuild a DataFrame with matching column names.
nouveau_X_original_data = pd.DataFrame(
    data=preprocessor.transform(X_original_data),
    columns=preprocessor.get_feature_names_out(),
    index=X_original_data.index,
)

## Par défaut

In [12]:
# Hold out 20% of the preprocessed competition rows; the split is reproducible
# because it is driven by the shared SEED.
(X_train, X_test, y_train, y_test) = train_test_split(
    nouveau_df,
    y,
    random_state=SEED,
    test_size=0.2,
)

In [13]:
# Augment the training split with the additional dataset: labels are joined into
# one 1-D array, feature rows are stacked vertically.
# NOTE(review): a later cell rebinds X_train / y_train to these augmented objects;
# re-running this cell after that point would append the additional data twice.
# NOTE(review): the additional dataset uses a default integer index, so the stacked
# frame may contain repeated index labels; confirm nothing downstream relies on
# unique labels.
y_train_with_original_data = np.hstack((y_train, y_encoded_original_data))
X_train_with_original_data = pd.concat(
    [X_train, nouveau_X_original_data],
    axis=0,
)

In [14]:
# Show the shapes before and after augmentation, one per line.
print(
    f"{X_train.shape = }",
    f"{y_train.shape = }",
    f"{X_train_with_original_data.shape = }",
    f"{y_train_with_original_data.shape = }",
    sep="\n",
)

X_train.shape = (988, 69)
y_train.shape = (988,)
X_train_with_original_data.shape = (1287, 69)
y_train_with_original_data.shape = (1287,)


In [18]:
# From here on, train on the augmented data.
# NOTE(review): this rebinds names that earlier cells defined and displayed, so the
# shapes printed above no longer describe X_train / y_train, and the notebook's
# results depend on this cell having been run exactly once after the concatenation.
X_train, y_train = X_train_with_original_data, y_train_with_original_data

In [20]:
# Label prepended to each model name in the evaluation output ("défaut/<model>").
prefix = "défaut"
# Evaluate every candidate model on the (augmented) training data via the project helper.
# Judging by the output below, each entry looks like [name, score, spread];
# presumably a mean and standard deviation over folds. TODO confirm in `prediction`.
results = evaluate_models(models, prefix, X_train, y_train)

défaut/DummyClassifier_Uniform
défaut/DummyClassifier_MostFrequent
défaut/LGBMClassifier


In [21]:
# Rank the evaluation entries by their second field, highest first.
sorted(results, key=lambda entry: entry[1], reverse=True)

[['défaut/LGBMClassifier', 0.7226320251937984, 0.05131130800567399],
 ['défaut/DummyClassifier_MostFrequent',
  0.49416787790697675,
  0.0026137333854799978],
 ['défaut/DummyClassifier_Uniform', 0.36982800387596904, 0.027225387160455063]]

# Soumission Kaggle

In [25]:
# Base name (without extension) of the submission file written at the end.
submission_name = "original_data3"

# Model used for the submission, chosen by hand from the ranking above.
best_model = models["LGBMClassifier"]

In [26]:
# Produce the predictions for the Kaggle test set through the project helper.
# NOTE(review): X_train is already preprocessed at this point while X_kaggle is raw;
# presumably the helper applies `preprocessor` to X_kaggle only and uses `le` to map
# predicted codes back to labels. Verify in `prediction.make_prediction`.
liste_predictions = make_prediction(
    best_model, X_train, y_train, X_kaggle, preprocessor, le
)

In [27]:
# Write the submission file.
# `to_csv` does not create missing parent directories, so make sure the results
# folder exists first; otherwise a fresh checkout fails on this last step.
submission_path = Path(f"data/results/{submission_name}.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
liste_predictions.to_csv(submission_path)