# Import des outils / jeu de données

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from prediction import (
    add_original_data,
    create_models,
    create_x_pipeline,
    create_y_pipeline,
    evaluate_models,
    make_prediction,
)
from sklearn.model_selection import train_test_split

from scripts.helper import print_shapes

In [None]:
# Single source of randomness for the notebook: SEED is reused further down by
# create_models and train_test_split.
SEED = 0
sns.set_theme()  # switch plots to seaborn's default theme
np.random.seed(SEED)  # seed NumPy's legacy global RNG

In [None]:
# The train/test files are indexed by the id column; the original dataset is
# read with pandas' default integer index.
df = pd.read_csv("data/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)
original_data = pd.read_csv("data/original_dataset_train.csv")

## Variables globales

In [None]:
# Feature matrix (quantitative columns followed by qualitative ones) and
# target, both copied so later edits never touch `df`.
X = df.loc[:, quanti_var + quali_var].copy()
y = df.loc[:, target].copy()

In [None]:
# Same feature/target selection as for `df`, applied to the original dataset;
# copies keep `original_data` untouched.
X_original_data = original_data.loc[:, quanti_var + quali_var].copy()
y_original_data = original_data.loc[:, target].copy()

# Liste des modèles

In [None]:
# Candidate estimators (built from SEED, looked up by name later on) and the
# preprocessing pipelines for the features and for the target.
models = create_models(SEED)
X_preprocessor, y_preprocessor = create_x_pipeline(), create_y_pipeline()

# Traitement des données

## Pipelines

In [None]:
# Fit each preprocessor on the full training data and keep the transformed
# result.
# NOTE(review): fitting happens before the train/test split further down, so
# the held-out rows also shape the fitted transform — confirm this is intended.
processed_X, processed_y = (
    X_preprocessor.fit_transform(X),
    y_preprocessor.fit_transform(y),
)

In [None]:
# Run the original dataset through the preprocessors fitted on `X` / `y`
# (transform only, no refit).
processed_X_original_data, processed_y_original_data = (
    X_preprocessor.transform(X_original_data),
    y_preprocessor.transform(y_original_data),
)

In [None]:
# Display the shape of each processed array as a sanity check.
# NOTE(review): the recorded output labels every shape with the argument's
# variable name, so print_shapes presumably inspects the call site — keep the
# call layout untouched unless that helper has been checked.
print_shapes(
    processed_X, processed_y, processed_X_original_data, processed_y_original_data
)

processed_X.shape = (1235, 70)
processed_y.shape = (1235,)
processed_X_original_data.shape = (299, 70)
processed_y_original_data.shape = (299,)


## Par défaut

In [None]:
# Hold out 20 % of the processed rows; random_state=SEED keeps the split
# reproducible.
# NOTE(review): X_test / y_test are not used anywhere else in the visible
# notebook — confirm whether a final hold-out evaluation is missing.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X, processed_y, random_state=SEED, test_size=0.2
)

In [None]:
# Hand the training split and the processed original dataset to
# add_original_data, which returns the new training pair (presumably the two
# sets concatenated — verify in prediction.py).
# NOTE(review): X_train / y_train are rebound in place, so re-running this
# cell feeds the already-augmented split back in.
X_train, y_train = add_original_data(
    X_train,
    y_train,
    processed_X_original_data,
    processed_y_original_data,
)

In [None]:
# Evaluate every model on the training data; `prefix` tags each run name, as
# seen in the recorded output ("défaut/LGBMClassifier", ...).
prefix = "défaut"
results = evaluate_models(
    models,
    prefix,
    X_train,
    y_train,
)

défaut/DummyClassifier_Uniform
défaut/DummyClassifier_MostFrequent
défaut/LGBMClassifier


In [None]:
# Rank the runs by the second field of each result row (the first numeric
# value), highest first.
sorted(results, reverse=True, key=lambda row: row[1])

[['défaut/LGBMClassifier', 0.7327216569767442, 0.05401907759948498],
 ['défaut/DummyClassifier_MostFrequent',
  0.49416787790697675,
  0.0026137333854799978],
 ['défaut/DummyClassifier_Uniform', 0.36982800387596904, 0.027225387160455063]]

In [None]:
# Hand-kept log of LGBMClassifier results, one row per experiment. The
# trailing comment on each row is the author's shorthand label for the variant
# that was run ("FE" presumably stands for feature engineering).
[
    [
        "défaut/LGBMClassifier",
        0.7226320251937984,
        0.05131130800567399,
    ],  # no FE
    [
        "défaut/LGBMClassifier",
        0.7327216569767442,
        0.05401907759948498,
    ],  # deviation temp
    [
        "défaut/LGBMClassifier",
        0.7241884689922481,
        0.05918404365777867,
    ],  # is generated
    [
        "défaut/LGBMClassifier",
        0.7280826065891473,
        0.04706282329868854,
    ],  # both
]

[['défaut/LGBMClassifier', 0.7226320251937984, 0.05131130800567399],
 ['défaut/LGBMClassifier', 0.7327216569767442, 0.05401907759948498],
 ['défaut/LGBMClassifier', 0.7241884689922481, 0.05918404365777867],
 ['défaut/LGBMClassifier', 0.7280826065891473, 0.04706282329868854]]

# Soumission Kaggle

In [None]:
# File stem used for the submission CSV, and the estimator that will produce
# the predictions.
submission_name = "original_data_deviation_temp"
best_model = models["LGBMClassifier"]

In [None]:
# Produce the Kaggle predictions with the selected model. The raw test
# features are passed together with both preprocessors, so make_prediction
# presumably applies the transforms itself.
# NOTE(review): processed_X / processed_y do not contain the original dataset
# that was added to X_train for evaluation — confirm that this is intended.
liste_predictions = make_prediction(
    best_model,
    processed_X,
    processed_y,
    X_kaggle,
    X_preprocessor,
    y_preprocessor,
)

In [None]:
# Write the predictions to data/results/<submission_name>.csv.
# pandas does not create missing parent directories, so make sure the output
# folder exists first; otherwise to_csv raises OSError on a fresh checkout.
from pathlib import Path

submission_path = Path(f"data/results/{submission_name}.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
liste_predictions.to_csv(submission_path)