# Import des outils / jeu de données

In [38]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from models import create_models
from pipelines import (
    add_original_data,
    create_x_pipeline,
    create_y_pipeline,
)
from prediction import (
    evaluate_models,
    make_prediction,
)
from sklearn.model_selection import train_test_split

from scripts.helper import print_shapes

In [39]:
# Notebook-wide settings.
sns.set_theme()  # seaborn's default theme for every plot drawn afterwards

# One seed for the whole notebook: it seeds NumPy's global RNG here and is also
# passed to create_models() and train_test_split() further down.
SEED = 0
np.random.seed(SEED)

In [40]:
# Competition files: both are indexed by the identifier column `id_col`.
df = pd.read_csv("data/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)

# Original dataset: "?" cells are parsed as missing values (NaN); no index
# column is specified, so pandas uses its default index.
original_data = pd.read_csv("data/original_dataset_train.csv", na_values=["?"])

## Variables globales

In [41]:
# Features (quantitative columns first, then qualitative ones) and target of the
# competition training set. Copies are taken so that `df` itself is never
# modified by later steps.
X = df.loc[:, quanti_var + quali_var].copy()
y = df.loc[:, target].copy()

In [42]:
# Same feature/target selection as for the competition data, applied to the
# original dataset (again as independent copies).
X_original_data = original_data.loc[:, quanti_var + quali_var].copy()
y_original_data = original_data.loc[:, target].copy()

# Liste des modèles

In [43]:
# Build the candidate estimators (the factory receives the notebook seed) and
# the two preprocessing pipelines: one for the features, one for the target.
# NOTE(review): `models` is later indexed by estimator name
# (models["LGBMClassifier"]), so it is presumably a dict — confirm in models.py.
models = create_models(SEED)
X_preprocessor = create_x_pipeline()
y_preprocessor = create_y_pipeline()

# Traitement des données

## Pipelines

In [44]:
# Cap on the number of training rows, used to keep experiments fast.
# Set N_SAMPLES to None to run the rest of the notebook on the full training set.
N_SAMPLES = 200

if N_SAMPLES is not None:
    # head() keeps the first rows in their existing order (no shuffling); X and y
    # are truncated with the same value so that they stay aligned.
    X = X.head(N_SAMPLES)
    y = y.head(N_SAMPLES)

In [45]:
# Fit both preprocessing pipelines on the competition training data and
# transform it in the same call.
# NOTE(review): the fit happens before train_test_split, so the rows that end up
# in X_test (and any validation folds drawn from X_train) have already been seen
# by the preprocessors — confirm this is acceptable for the pipeline steps used.
processed_X = X_preprocessor.fit_transform(X)
processed_y = y_preprocessor.fit_transform(y)

In [46]:
# Apply the already-fitted pipelines to the original dataset: transform() only,
# so nothing is re-fitted on these rows.
processed_X_original_data = X_preprocessor.transform(X_original_data)
processed_y_original_data = y_preprocessor.transform(y_original_data)

In [47]:
# Sanity check: display the shape of each processed array. The competition and
# original datasets must end up with the same number of feature columns.
# NOTE(review): the output shows variable names, so print_shapes presumably
# inspects the call expression — keep the arguments as plain variable names.
print_shapes(
    processed_X, processed_y, processed_X_original_data, processed_y_original_data
)

processed_X.shape = (200, 21)
processed_y.shape = (200,)
processed_X_original_data.shape = (10885, 21)
processed_y_original_data.shape = (10885,)


## Par défaut

In [48]:
# Hold out 20% of the processed competition data; the split is seeded so that
# it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X, processed_y, random_state=SEED, test_size=0.2
)

In [49]:
# Optional step, currently disabled: uncomment to pass the training split and the
# processed original dataset to add_original_data().
# NOTE(review): once enabled, this cell rebuilds X_train / y_train from
# themselves, so re-run the train_test_split cell before running it again.
# X_train, y_train = add_original_data(
#     X_train, y_train, processed_X_original_data, processed_y_original_data
# )

In [50]:
# `prefix` labels this experiment: it is passed to evaluate_models() and also
# names the CSV written in the "Save the results" section.
# NOTE(review): the saved output of this cell is labelled "défaut/...", not
# "power-transform/...", so it seems to come from a run made with another
# prefix — re-run the cell to refresh the output.
prefix = "power-transform"
# Each entry of `results` is [estimator name, score, std] (see the sorted output
# and the DataFrame columns further down).
results = evaluate_models(models, prefix, X_train, y_train)

défaut/DummyClassifier_Uniform
défaut/DummyClassifier_MostFrequent
défaut/LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

défaut/LinearDiscriminantAnalysis
défaut/RandomForestClassifier
défaut/ExtraTreesClassifier
défaut/HistGradientBoostingClassifier
défaut/LGBMClassifier


In [52]:
# Rank the estimators from best to worst score (second field of each entry).
sorted(results, reverse=True, key=lambda entry: entry[1])

[['défaut/RandomForestClassifier', 0.8132211538461538, 0.15034726967185358],
 ['défaut/ExtraTreesClassifier', 0.7826923076923077, 0.1397661010155776],
 ['défaut/LGBMClassifier', 0.7806089743589744, 0.12960265937812762],
 ['défaut/HistGradientBoostingClassifier',
  0.7672275641025641,
  0.1561522746660536],
 ['défaut/LogisticRegression', 0.763301282051282, 0.26155402519365534],
 ['défaut/LinearDiscriminantAnalysis', 0.7115384615384615, 0.2403215729299309],
 ['défaut/DummyClassifier_Uniform', 0.5, 0.0],
 ['défaut/DummyClassifier_MostFrequent', 0.5, 0.0]]

# Save & submit

## Save the results

In [53]:
# Tabulate the evaluation results: one row per estimator.
results_df = pd.DataFrame(
    data=results,
    columns=["Estimator", "Score", "Std"],
)

In [54]:
# Save the evaluation table in a file named after the experiment prefix.
results_path = Path("data/results") / f"{prefix}.csv"
# to_csv does not create missing folders, so make sure the target exists first
# (e.g. on a fresh checkout where data/results/ is absent).
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)

## Kaggle submission

In [None]:
# Name of the submission; it becomes the CSV file name in data/predictions/.
submission_name = "first_submission"

# Estimator used for the submission, looked up by name.
# NOTE(review): hard-coded; the results table saved above ranks
# RandomForestClassifier first — confirm this choice is intentional.
best_model = models["LGBMClassifier"]

In [None]:
# Produce the Kaggle predictions with the selected model.
# NOTE(review): presumably make_prediction fits `best_model` on the processed
# training data and runs X_kaggle through the fitted preprocessors — confirm in
# prediction.py. processed_X / processed_y are built from the truncated X / y
# (first 200 rows, see the "Pipelines" section), so the model only sees those rows.
liste_predictions = make_prediction(
    best_model, processed_X, processed_y, X_kaggle, X_preprocessor, y_preprocessor
)

In [None]:
# Write the predictions to a file named after the submission.
submission_path = Path("data/predictions") / f"{submission_name}.csv"
# to_csv does not create missing folders, so make sure the target exists first.
submission_path.parent.mkdir(parents=True, exist_ok=True)
# Same call as before otherwise: no extra options are passed to to_csv.
liste_predictions.to_csv(submission_path)