# Import des outils / jeu de données

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from models import create_models
from pipelines import add_original_data, create_x_pipeline, create_y_pipeline
from prediction import evaluate_models, make_prediction
from sklearn.model_selection import train_test_split

from scripts.helper import print_shapes

In [None]:
# One seed value for the whole notebook; it is reused below by the model factory
# and by the train/test split.
seed = 0

# Apply seaborn's default plotting theme.
sns.set_theme()

# Seed NumPy's legacy global random generator.
np.random.seed(seed)

In [None]:
# Competition files: both are indexed by the identifier column.
df = pd.read_csv("data/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)

# Original dataset: "?" cells are additionally parsed as missing values.
original_data = pd.read_csv(
    "data/original_dataset_train.csv",
    na_values=["?"],
)

## Variables globales

In [None]:
# Feature columns: those listed in quanti_var followed by those in quali_var.
feature_cols = quanti_var + quali_var

# Copies are taken so that X and y are independent of df.
X, y = df[feature_cols].copy(), df[target].copy()

In [None]:
# Same column selection as for the competition data, applied to the original dataset.
original_feature_cols = quanti_var + quali_var

# Copies are taken so that these objects are independent of original_data.
X_original_data, y_original_data = (
    original_data[original_feature_cols].copy(),
    original_data[target].copy(),
)

# Liste des modèles

In [None]:
# Candidate estimators, built from the notebook-wide seed.
models = create_models(seed)

# One preprocessing pipeline for the features and one for the target.
X_preprocessor, y_preprocessor = create_x_pipeline(), create_y_pipeline()

# Traitement des données

## Pipelines

In [None]:
# Fit each pipeline on the competition training data and keep the transformed
# output (features first, then target).
processed_X, processed_y = (
    X_preprocessor.fit_transform(X),
    y_preprocessor.fit_transform(y),
)

In [None]:
# transform() only: the original dataset goes through the pipelines as they were
# fitted on the competition data, without refitting.
processed_X_original_data, processed_y_original_data = (
    X_preprocessor.transform(X_original_data),
    y_preprocessor.transform(y_original_data),
)

In [None]:
# Sanity check: print the shapes of the transformed competition and original data.
# NOTE(review): the recorded output labels each shape with the argument's variable
# name, so print_shapes presumably inspects the call site — keep the arguments as
# plain variable names and confirm before reformatting this call.
print_shapes(
    processed_X, processed_y, processed_X_original_data, processed_y_original_data
)

processed_X.shape = (101763, 21)
processed_y.shape = (101763,)
processed_X_original_data.shape = (10885, 21)
processed_y_original_data.shape = (10885,)


## Par défaut

In [None]:
# Hold out 1% of the transformed rows (test_size is a proportion here); the split
# is seeded so that it is reproducible.
# NOTE(review): X_test / y_test are not used anywhere else in the visible cells.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X, processed_y, random_state=seed, test_size=0.01
)

In [None]:
# Disabled step: when uncommented, this call rebinds X_train / y_train to the
# result of add_original_data, passing in the transformed original dataset.
# NOTE(review): presumably switched off for this experiment — confirm whether the
# original data should be added to the training split before re-enabling.
# X_train, y_train = add_original_data(
#     X_train, y_train, processed_X_original_data, processed_y_original_data
# )

In [None]:
# Experiment label: it prefixes every estimator name in the recorded output and is
# reused further down to name the results file.
prefix = "power-transform"

# Evaluate every candidate model on the training split.
results = evaluate_models(
    models,
    prefix,
    X_train,
    y_train,
)

power-transform/DummyClassifier_Uniform
power-transform/DummyClassifier_MostFrequent
power-transform/LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

power-transform/LinearDiscriminantAnalysis
power-transform/RandomForestClassifier
power-transform/ExtraTreesClassifier
power-transform/HistGradientBoostingClassifier
power-transform/XGBClassifier
power-transform/CatBoostClassifier
power-transform/LGBMClassifier


In [None]:
# Rank the result rows by the value at index 1 (the "Mean Score" column when the
# rows are loaded into results_df), highest first.
sorted(
    results,
    key=lambda row: row[1],
    reverse=True,
)

[['power-transform/HistGradientBoostingClassifier',
  0.7912534228800155,
  0.7895923755931439,
  0.00605286700149455,
  0.7840968427923529,
  12.325129508972168],
 ['power-transform/LGBMClassifier',
  0.7907716815219443,
  0.7891580012984978,
  0.005910100919670564,
  0.7829616693468475,
  15.189393281936646],
 ['power-transform/CatBoostClassifier',
  0.790436252786308,
  0.7893153823577113,
  0.005885947198995791,
  0.7830160733249787,
  1062.6231586933136],
 ['power-transform/LinearDiscriminantAnalysis',
  0.7858464216404266,
  0.7846082120710365,
  0.0055726585736622465,
  0.779443742237944,
  6.14178204536438],
 ['power-transform/LogisticRegression',
  0.784884635719114,
  0.7830585843404488,
  0.005457430561786699,
  0.7782054121021214,
  23.85860276222229],
 ['power-transform/XGBClassifier',
  0.784162961562605,
  0.7827911663239906,
  0.005834325948250765,
  0.7748055711473014,
  133.32487440109253],
 ['power-transform/RandomForestClassifier',
  0.7694265596799802,
  0.76870591

In [None]:
# NOTE(review): hard-coded list literal, shown as this cell's output and not
# assigned to any name. Each entry is [estimator label, float, float] — presumably
# a pasted snapshot of scores from an earlier run (mean and std?); confirm or remove.
[
    [
        "power-transform/HistGradientBoostingClassifier",
        0.7894395367147898,
        0.005860469311042137,
    ],
    ["power-transform/LGBMClassifier", 0.7892241745861941, 0.005513608306759759],
    ["power-transform/CatBoostClassifier", 0.7885757061535291, 0.005875874263518331],
    [
        "power-transform/LinearDiscriminantAnalysis",
        0.7853991589989746,
        0.005039098545103792,
    ],
]

# Save & submit

## Save the results

In [None]:
# Column labels, in the same order as the six values of each result row.
result_columns = [
    "Estimator",
    "Mean Score",
    "Median Score",
    "Std",
    "Min Score",
    "Fit Time",
]

# Tabular view of the evaluation results.
results_df = pd.DataFrame(results, columns=result_columns)

In [None]:
# Write the results table to a CSV file named after the experiment prefix,
# without the DataFrame index.
results_path = f"data/results/{prefix}.csv"
results_df.to_csv(results_path, index=False)

## Kaggle submission

In [None]:
# File stem used for the Kaggle submission CSV.
submission_name = "power-transform"

# Estimator used for the submission, looked up by key in the models mapping.
best_model = models["HistGradientBoostingClassifier"]

In [None]:
# Build the predictions for the Kaggle test set. The call receives the selected
# model, the full transformed training data, the raw Kaggle features and both
# pipelines.
liste_predictions = make_prediction(
    best_model,
    processed_X,
    processed_y,
    X_kaggle,
    X_preprocessor,
    y_preprocessor,
)

In [None]:
# Write the predictions to a CSV file named after the submission.
submission_path = f"data/predictions/{submission_name}.csv"
liste_predictions.to_csv(submission_path)