# Import des outils / jeu de données

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from grid_search import gs_xgboost
from models import create_models
from pipelines import add_original_data, create_x_pipeline, create_y_pipeline
from prediction import evaluate_models, make_prediction
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from scripts.helper import print_shapes

In [None]:
# Plot styling: apply seaborn's default theme.
sns.set_theme()

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed to create_models() and train_test_split() further down.
seed = 0
np.random.seed(seed)

In [None]:
# Train and test tables, both indexed by the identifier column `id_col`.
df = pd.read_csv("data/dataset/raw/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/dataset/raw/test.csv", index_col=id_col)

# Additional "original" dataset, read with the default index.
# NOTE(review): in the visible notebook it is only referenced by
# commented-out cells -- confirm whether it still needs to be loaded.
original_data = pd.read_csv("data/dataset/raw/original_dataset_train.csv")

## Variables globales

In [None]:
# Feature matrix: the quantitative columns followed by the qualitative ones.
feature_columns = quanti_var + quali_var
X = df[feature_columns].copy()

# Target, copied so that later processing works on its own object
# rather than on a slice of `df`.
y = df[target].copy()

In [None]:
# X = X.head(200)
# y = y.head(200)

In [None]:
# X_original_data = original_data[quanti_var + quali_var].copy()
# y_original_data = original_data[target].copy()

# Liste des modèles

In [None]:
# Candidate estimators, built from the notebook-wide seed; the collection is
# handed to evaluate_models() further down.
models = create_models(seed)
# Preprocessing pipelines for the features and for the target, created by the
# project's `pipelines` module.
X_preprocessor = create_x_pipeline()
y_preprocessor = create_y_pipeline()

# Traitement des données

## Pipelines

In [None]:
# Fit both preprocessing pipelines and transform the features / the target.
# NOTE(review): the pipelines are fitted on the full dataset before
# train_test_split() is called below, so whatever they learn from the data
# also sees the held-out rows -- confirm this is acceptable.
processed_X = X_preprocessor.fit_transform(X)
processed_y = y_preprocessor.fit_transform(y)

In [None]:
# processed_X_original_data = X_preprocessor.transform(X_original_data)
# processed_y_original_data = y_preprocessor.transform(y_original_data)

In [None]:
# print_shapes(
#     processed_X, processed_y, processed_X_original_data, processed_y_original_data
# )

## Par défaut

In [None]:
# Keep a small hold-out slice (2% of the rows); the split is reproducible
# because it is driven by the notebook-wide seed.
holdout_fraction = 0.02
X_train, X_test, y_train, y_test = train_test_split(
    processed_X, processed_y, test_size=holdout_fraction, random_state=seed
)

In [None]:
# X_train, y_train = add_original_data(
#     X_train, y_train, processed_X_original_data, processed_y_original_data
# )

## Temporary grid search (cross-validation)

In [None]:
# Run the project's XGBoost grid search on the training split. The recorded
# output reports 5 folds for each of 54 candidates, i.e. 270 fits.
# NOTE(review): the return value is not stored -- if gs_xgboost returns the
# search result or best parameters, assign it so it can be reused; confirm.
gs_xgboost(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits




## Model evaluation

In [None]:
# Label for this run: it prefixes every estimator name in the results
# (e.g. "default/LogisticRegression" in the recorded output) and is used as
# the file stem when the results are saved.
prefix = "default"
# Evaluate every candidate model on the training split. Judging by the
# column names applied when the results are turned into a DataFrame, each
# entry is [estimator, mean score, median score, std, min score, fit time].
# NOTE(review): the recorded output shows LogisticRegression hitting its
# iteration limit; consider raising max_iter or scaling the features.
results = evaluate_models(models, prefix, X_train, y_train)

default/DummyClassifier_Uniform
default/DummyClassifier_MostFrequent
default/LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

default/LinearDiscriminantAnalysis
default/RandomForestClassifier
default/ExtraTreesClassifier
default/HistGradientBoostingClassifier
default/XGBClassifier
default/CatBoostClassifier
default/LGBMClassifier


In [None]:
# Leaderboard: order the result rows by the field at index 1 (the mean
# score, going by the column names used for results_df), best first.
sorted(results, key=lambda row: row[1], reverse=True)

[['default/CatBoostClassifier',
  0.86939049947713,
  0.8697813418537219,
  0.0028850911475621728,
  0.8626808104119322,
  142.0995687007904],
 ['default/XGBClassifier',
  0.8658128720297661,
  0.8659900434224346,
  0.0028590243244780125,
  0.8589050317937557,
  2.7265148639678953],
 ['default/LGBMClassifier',
  0.865301242077507,
  0.8659187857858939,
  0.002775161309252307,
  0.858888852805787,
  329.16275947093965],
 ['default/HistGradientBoostingClassifier',
  0.8649252730240166,
  0.8657238869715709,
  0.0029132510516593512,
  0.8580186752414358,
  4.73275887966156],
 ['default/RandomForestClassifier',
  0.8575066976272352,
  0.8576049095143162,
  0.002696786722353164,
  0.8516472579444003,
  35.204959177970885],
 ['default/ExtraTreesClassifier',
  0.8543388199638932,
  0.8545000916336492,
  0.0031349581589606265,
  0.8473015183617245,
  29.717409133911133],
 ['default/LinearDiscriminantAnalysis',
  0.8419975396545076,
  0.8423493672578874,
  0.0019844802428356416,
  0.83747478155

# Save & submit

## Save the results

In [None]:
# Column labels, listed in the same order as the fields of each result row.
result_columns = [
    "Estimator",
    "Mean Score",
    "Median Score",
    "Std",
    "Min Score",
    "Fit Time",
]
# Tabular view of the evaluation results, one row per estimator.
results_df = pd.DataFrame(results, columns=result_columns)

In [None]:
# Persist the evaluation table as data/results/<prefix>.csv (no index column).
results_path = Path(f"data/results/{prefix}.csv")
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists (e.g. on a fresh clone) before writing.
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)

## Kaggle submission

In [None]:
# File stem of the submission CSV written at the end of the notebook.
# `best_model` itself is created in the next cell (an XGBClassifier), so no
# estimator is selected here: a value assigned in this cell would be
# overwritten before it is ever used.
submission_name = "xgb"

In [None]:
# Final estimator for the submission: an XGBoost classifier with explicitly
# set hyper-parameters.
# NOTE(review): these values presumably come from the grid search run
# earlier in the notebook -- confirm and record their origin.
best_model = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=1000,
    random_state=seed,  # reuse the notebook-wide seed instead of a literal 0
)

In [None]:
# Fit the final model on the entire processed dataset (no hold-out).
# NOTE(review): make_prediction() in the next cell also receives
# processed_X / processed_y; if it fits the model itself, this call is
# redundant -- confirm in prediction.make_prediction.
best_model.fit(processed_X, processed_y)

In [None]:
# Build the submission predictions for the unprocessed test table X_kaggle.
# The preprocessors are passed along, presumably so the helper can transform
# X_kaggle and map predictions back to the original labels -- TODO confirm
# in prediction.make_prediction.
liste_predictions = make_prediction(
    best_model, processed_X, processed_y, X_kaggle, X_preprocessor, y_preprocessor
)

In [None]:
# Write the submission file as data/predictions/<submission_name>.csv,
# keeping the index in the output.
predictions_path = Path(f"data/predictions/{submission_name}.csv")
# to_csv does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh clone) before writing.
predictions_path.parent.mkdir(parents=True, exist_ok=True)
liste_predictions.to_csv(predictions_path)