# Import des outils / jeu de données

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from config import seed
from grid_search import gs_xgboost
from models import create_models
from pipelines import add_original_data, create_x_pipeline, create_y_pipeline
from prediction import evaluate_models, make_prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from scripts.helper import print_shapes
from src.utils import set_plot_options, set_seed

In [None]:
# Project helpers from src.utils: apply the configured seed, then the shared plot options.
# NOTE(review): which random generators set_seed covers is not visible here — confirm.
set_seed(seed)
set_plot_options()

In [None]:
# Load the raw CSV files. `df` (train) and `X_kaggle` (test) are indexed by `id_col`;
# `original_data` is read without an index column.
# NOTE(review): `original_data` is only referenced by commented-out cells in this notebook.
original_data = pd.read_csv("data/dataset/raw/original_dataset_train.csv")
df = pd.read_csv("data/dataset/raw/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/dataset/raw/test.csv", index_col=id_col)

## Variables globales

In [None]:
# Feature frame (quantitative then qualitative columns) and target column,
# both copied so they are independent objects from `df`.
X = df[quanti_var + quali_var].copy()
y = df[target].copy()

In [None]:
# X = X.head(200)
# y = y.head(200)

In [None]:
# X_original_data = original_data[quanti_var + quali_var].copy()
# y_original_data = original_data[target].copy()

# Liste des modèles

In [None]:
# Candidate estimators (built with the shared seed) and the feature / target
# preprocessing pipelines, all created by project factory functions.
models = create_models(seed)
X_preprocessor = create_x_pipeline()
y_preprocessor = create_y_pipeline()

# Traitement des données

## Pipelines

In [None]:
# Fit both preprocessors on the full training data and transform it.
# NOTE(review): the fit happens before the train/test split further down, so the
# hold-out rows take part in fitting the preprocessing — confirm this is intended.
processed_X = X_preprocessor.fit_transform(X)
processed_y = y_preprocessor.fit_transform(y)

In [None]:
# processed_X_original_data = X_preprocessor.transform(X_original_data)
# processed_y_original_data = y_preprocessor.transform(y_original_data)

In [None]:
# print_shapes(
#     processed_X, processed_y, processed_X_original_data, processed_y_original_data
# )

## Par défaut

In [None]:
# Seeded split that holds out 2% of the processed rows for a quick sanity check.
# NOTE(review): no `stratify=` is passed, so a hold-out this small may not contain
# every class.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X,
    processed_y,
    test_size=0.02,
    random_state=seed,
)

In [None]:
# X_train, y_train = add_original_data(
#     X_train, y_train, processed_X_original_data, processed_y_original_data
# )

In [None]:
# Baseline: a seeded random forest with otherwise default hyper-parameters,
# fitted on the training split.
rf = RandomForestClassifier(random_state=seed)
rf.fit(X_train, y_train)

In [None]:
# Despite the name, `y_pred` holds class probabilities (predict_proba), not labels.
y_pred = rf.predict_proba(X_test)

In [None]:
# Pass the label set explicitly: with a 2% unstratified hold-out a class can be
# absent from y_test, and log_loss then cannot match labels to y_pred's columns.
log_loss(y_test, y_pred, labels=rf.classes_)

0.7503154149676029

## Temporary: XGBoost grid search (cross-validated)

In [None]:
# Project grid-search helper for XGBoost, run on the training split.
# NOTE(review): the return value is not assigned to a name; only the cell output is kept.
gs_xgboost(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits




## Evaluate all candidate models

In [None]:
# Evaluate every candidate model on the training split. `prefix` labels the run:
# it is passed to evaluate_models and later used as the results file name.
prefix = "default"
results = evaluate_models(models, prefix, X_train, y_train)

default/DummyClassifier_Uniform
default/DummyClassifier_MostFrequent
default/LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

default/LinearDiscriminantAnalysis
default/RandomForestClassifier
default/ExtraTreesClassifier
default/HistGradientBoostingClassifier
default/XGBClassifier


In [None]:
# Rank the runs by their second field (the mean score), highest value first.
sorted(results, reverse=True, key=lambda run: run[1])

[['default/HistGradientBoostingClassifier',
  -0.48198606371259406,
  -0.4759807535885364,
  0.023907555530980104,
  -0.5125468948198264,
  1.3103872537612915],
 ['default/RandomForestClassifier',
  -0.5054842296203518,
  -0.5000509058621925,
  0.05643169643428746,
  -0.6065295120813797,
  1.2968123435974122],
 ['default/XGBClassifier',
  -0.5062352715642113,
  -0.5092192361621619,
  0.02822518817269309,
  -0.5479016954850433,
  0.7240914583206177],
 ['default/LinearDiscriminantAnalysis',
  -0.5169430819964754,
  -0.5234863321152963,
  0.029402681195064027,
  -0.558899652559518,
  0.011648964881896973],
 ['default/LogisticRegression',
  -0.522609793644708,
  -0.5229718842766597,
  0.019526622422614284,
  -0.5495756139549913,
  0.1257333755493164],
 ['default/ExtraTreesClassifier',
  -0.5626779763429315,
  -0.5658760953781434,
  0.08028446426407705,
  -0.7159243413832691,
  0.6473781585693359],
 ['default/DummyClassifier_Uniform',
  -1.0986122886681096,
  -1.0986122886681096,
  0.0,
  -

# Save & submit

## Save the results

In [None]:
# Tabulate the evaluation rows; the column order mirrors the field order of each row.
results_df = pd.DataFrame(
    data=results,
    columns=[
        "Estimator",
        "Mean Score",
        "Median Score",
        "Std",
        "Min Score",
        "Fit Time",
    ],
)
results_df

Unnamed: 0,Estimator,Mean Score,Median Score,Std,Min Score,Fit Time
0,default/DummyClassifier_Uniform,-1.098612,-1.098612,0.0,-1.098612,0.001476
1,default/DummyClassifier_MostFrequent,-13.387238,-13.394287,0.0154,-13.411592,0.001452
2,default/LogisticRegression,-0.52261,-0.522972,0.019527,-0.549576,0.125733
3,default/LinearDiscriminantAnalysis,-0.516943,-0.523486,0.029403,-0.5589,0.011649
4,default/RandomForestClassifier,-0.505484,-0.500051,0.056432,-0.60653,1.296812
5,default/ExtraTreesClassifier,-0.562678,-0.565876,0.080284,-0.715924,0.647378
6,default/HistGradientBoostingClassifier,-0.481986,-0.475981,0.023908,-0.512547,1.310387
7,default/XGBClassifier,-0.506235,-0.509219,0.028225,-0.547902,0.724091


In [None]:
# Persist the score table without the index; the file is named after the run prefix.
results_df.to_csv(f"data/results/{prefix}.csv", index=False)

## Kaggle submission

In [None]:
# Pick the model to submit and a short name for the prediction file.
# NOTE(review): the following cell rebinds `best_model`, so this selection is
# overridden when both cells are run in order.
best_model = models["HistGradientBoostingClassifier"]
submission_name = "hgb"

In [None]:
# Hand-set XGBoost configuration; this replaces the model chosen in the previous cell.
# NOTE(review): the values presumably come from the grid search above — confirm.
# The shared `seed` is used (like the other estimators in this notebook) instead of
# a hard-coded 0.
best_model = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=1000,
    random_state=seed,
)
# Keep the submission file name in line with the model that is actually used.
submission_name = "xgb"

In [None]:
# Fit the selected model on the full processed training set.
# NOTE(review): make_prediction() below calls model.fit() again on the same data,
# so this fit is repeated.
best_model.fit(processed_X, processed_y)

In [None]:
# Notebook-level settings read by make_prediction() below.
# multi_output: keep every predict_proba column (True) or only column 1 (False).
# multi_output_columns: names given to the probability columns of the submission.
# NOTE(review): the order must match the model's class order — verify against the
# fitted target encoder.
multi_output = True
multi_output_columns = ("Status_C", "Status_CL", "Status_D")

In [None]:
def make_prediction(
    model,  # todo: maybe type isn't 100% accurate
    X_train: pd.DataFrame | np.ndarray,
    y_train: pd.DataFrame | np.ndarray,
    X_kaggle: pd.DataFrame | np.ndarray,
    X_preprocessor,  # todo: maybe type isn't 100% accurate
    y_preprocessor,  # todo: maybe type isn't 100% accurate
) -> pd.DataFrame:
    """todo"""
    model.fit(X_train, y_train)

    X_kaggle_processed = pd.DataFrame(
        X_preprocessor.transform(X_kaggle),
        # columns=X_preprocessor.get_feature_names_out(), #fixme: redo this line
    )

    if multi_output:
        raw_predictions = model.predict_proba(X_kaggle_processed)
    else:
        raw_predictions = model.predict_proba(X_kaggle_processed)[:, 1]

    # y_pred = y_preprocessor.inverse_transform(raw_predictions)
    print(f"{raw_predictions.shape}\n")

    if multi_output:
        df = pd.DataFrame(
            raw_predictions, index=X_kaggle.index, columns=multi_output_columns
        )
    else:
        df = pd.DataFrame(raw_predictions, index=X_kaggle.index, columns=[target])

    return df

In [None]:
# Fit the chosen model on all processed training data and predict class
# probabilities for the raw Kaggle test set (preprocessed inside make_prediction).
liste_predictions = make_prediction(
    best_model, processed_X, processed_y, X_kaggle, X_preprocessor, y_preprocessor
)

(5271, 3)


In [None]:
# Preview the first rows of the submission frame.
liste_predictions.head()

Unnamed: 0_level_0,Status_C,Status_CL,Status_D
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7905,0.193046,0.019565,0.787389
7906,0.583753,0.229401,0.186846
7907,0.014109,0.001518,0.984374
7908,0.983232,0.000455,0.016312
7909,0.915773,0.01277,0.071457


In [None]:
# Write the submission CSV (index kept) under data/predictions/, named after
# `submission_name`.
liste_predictions.to_csv(f"data/predictions/{submission_name}.csv")