# Import des outils / jeu de données

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from prediction import (
    add_original_data,
    create_models,
    create_x_pipeline,
    create_y_pipeline,
    evaluate_models,
    make_prediction,
)
from sklearn.model_selection import train_test_split

from scripts.helper import print_shapes

In [None]:
# Apply seaborn's default theme to every plot drawn later in the notebook.
sns.set_theme()

# One seed shared by NumPy's global RNG and the seeded calls further down.
SEED = 0
np.random.seed(SEED)

In [None]:
# The original dataset marks missing values with a literal "?". Parse that
# marker as NaN at load time so the fitted preprocessing pipeline can impute
# those cells instead of raising "could not convert string to float: '?'".
original_data = pd.read_csv("data/original_dataset_train.csv", na_values="?")
# Kaggle train / test files, indexed by the identifier column.
df = pd.read_csv("data/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)

## Variables globales

In [None]:
# Feature matrix (quantitative columns followed by qualitative ones) and
# target, both copied so later edits never write back into df.
X = df.loc[:, quanti_var + quali_var].copy()
y = df.loc[:, target].copy()

In [None]:
# Same feature / target selection, applied to the original dataset; copies
# keep original_data itself untouched.
X_original_data = original_data.loc[:, quanti_var + quali_var].copy()
y_original_data = original_data.loc[:, target].copy()

# Liste des modèles

In [None]:
# Candidate estimators built with the shared seed; looked up by name later on.
models = create_models(SEED)
# Preprocessing pipelines for the features and for the target.
X_preprocessor = create_x_pipeline()
y_preprocessor = create_y_pipeline()

# Traitement des données

## Pipelines

In [None]:
# Fit both preprocessors on the Kaggle training data and transform it.
processed_X = X_preprocessor.fit_transform(X)
processed_y = y_preprocessor.fit_transform(y)

In [None]:
# Reuse the preprocessors already fitted on the Kaggle data (transform only,
# no refit) for the original dataset.
# NOTE(review): the recorded run failed here with "could not convert string to
# float: '?'", so the original dataset apparently contains "?" placeholders
# that must become NaN before this step — confirm.
processed_X_original_data = X_preprocessor.transform(X_original_data)
processed_y_original_data = y_preprocessor.transform(y_original_data)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '?'

In [None]:
# Sanity check: hand the four processed datasets to the print_shapes helper.
print_shapes(
    processed_X, processed_y, processed_X_original_data, processed_y_original_data
)

NameError: name 'processed_X_original_data' is not defined

## Par défaut

In [None]:
# Hold out 20% of the processed Kaggle data; the split is seeded so it is
# reproducible.
# NOTE(review): X_test / y_test are not used by any later cell shown here.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X,
    processed_y,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Combine the training split with the processed original dataset via
# add_original_data (presumably by appending rows — verify in the prediction
# module). The held-out split is not passed in.
X_train, y_train = add_original_data(
    X_train, y_train, processed_X_original_data, processed_y_original_data
)

NameError: name 'processed_X_original_data' is not defined

In [None]:
# Evaluate every candidate model on the training split; the prefix tags each
# result name (e.g. "défaut/LGBMClassifier").
prefix = "défaut"
results = evaluate_models(models, prefix, X_train, y_train)

défaut/DummyClassifier_Uniform
défaut/DummyClassifier_MostFrequent
défaut/KNeighborsClassifier5
défaut/LinearSVC




défaut/LogisticRegression
défaut/LinearDiscriminantAnalysis
défaut/RandomForestClassifier
défaut/XGBClassifier
défaut/CatBoostClassifier
défaut/LGBMClassifier


In [None]:
# Rank the result rows from highest to lowest on their second field (index 1).
sorted(results, key=lambda entry: entry[1], reverse=True)

[['défaut/LGBMClassifier', 0.7890647600726886, 0.005721219884766251],
 ['défaut/CatBoostClassifier', 0.7886836357752831, 0.005990954496163058],
 ['défaut/XGBClassifier', 0.7824958908980149, 0.0049096771705528335],
 ['défaut/LinearSVC', 0.7804629386099445, 0.006285261717020909],
 ['défaut/LogisticRegression', 0.7783761595522817, 0.006898149261480394],
 ['défaut/LinearDiscriminantAnalysis',
  0.7752789677736074,
  0.006606206149622996],
 ['défaut/RandomForestClassifier', 0.768326376577785, 0.006601345639290572],
 ['défaut/KNeighborsClassifier5', 0.7228585296707217, 0.00745246179014146],
 ['défaut/DummyClassifier_Uniform', 0.5, 0.0],
 ['défaut/DummyClassifier_MostFrequent', 0.5, 0.0]]

In [None]:
# Hard-coded copy of the ranked "défaut" results, kept as a reference snapshot.
[
    ["défaut/LGBMClassifier", 0.7890647600726886, 0.005721219884766251],
    ["défaut/CatBoostClassifier", 0.7886836357752831, 0.005990954496163058],
    ["défaut/XGBClassifier", 0.7824958908980149, 0.0049096771705528335],
    ["défaut/LinearSVC", 0.7804629386099445, 0.006285261717020909],
    ["défaut/LogisticRegression", 0.7783761595522817, 0.006898149261480394],
    ["défaut/LinearDiscriminantAnalysis", 0.7752789677736074, 0.006606206149622996],
    ["défaut/RandomForestClassifier", 0.768326376577785, 0.006601345639290572],
    ["défaut/KNeighborsClassifier5", 0.7228585296707217, 0.00745246179014146],
    ["défaut/DummyClassifier_Uniform", 0.5, 0.0],
    ["défaut/DummyClassifier_MostFrequent", 0.5, 0.0],
]

[['défaut/LGBMClassifier', 0.7226320251937984, 0.05131130800567399],
 ['défaut/LGBMClassifier', 0.7327216569767442, 0.05401907759948498],
 ['défaut/LGBMClassifier', 0.7241884689922481, 0.05918404365777867],
 ['défaut/LGBMClassifier', 0.7280826065891473, 0.04706282329868854]]

# Soumission Kaggle

In [None]:
# LGBMClassifier ranked first in the recorded "défaut" results.
best_model = models["LGBMClassifier"]
# Base name (without extension) of the CSV written at the end of the notebook.
submission_name = "first_submission"

In [None]:
# Build the predictions for the Kaggle test set. The raw X_kaggle is passed
# together with both preprocessors, so make_prediction presumably applies the
# transformations itself — verify in the prediction module.
liste_predictions = make_prediction(
    best_model, processed_X, processed_y, X_kaggle, X_preprocessor, y_preprocessor
)

In [None]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing the submission file.
Path("data/results").mkdir(parents=True, exist_ok=True)
liste_predictions.to_csv(f"data/results/{submission_name}.csv")