# Import des outils / jeu de données

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
#from grid_search import gs_xgboost
from models import create_models
from pipelines import add_original_data, create_x_pipeline, create_y_pipeline
from prediction import evaluate_models, make_prediction
from sklearn.model_selection import train_test_split
#from xgboost import XGBClassifier

#from scripts.helper import print_shapes

In [2]:
# Single seed shared by the notebook: it is passed to `create_models` and to
# `train_test_split` further down, and also seeds NumPy's global RNG here.
seed = 0
np.random.seed(seed)
# Apply seaborn's default plotting theme.
sns.set_theme()

In [3]:
# Load the raw CSV files; the train and test frames are indexed by `id_col`.
# NOTE(review): `original_data` is only referenced by cells that are currently
# commented out, so this read has no effect on the results below.
original_data = pd.read_csv("data/dataset/raw/original_dataset_train.csv")
df = pd.read_csv("data/dataset/raw/train.csv", index_col=id_col)
X_kaggle = pd.read_csv("data/dataset/raw/test.csv", index_col=id_col)

## Variables globales

In [4]:
# Separate the features (quantitative + qualitative columns) from the target.
# `.copy()` makes X and y independent of `df`.
X = df[quanti_var + quali_var].copy()
y = df[target].copy()

In [5]:
# X = X.head(200)
# y = y.head(200)

In [6]:
# X_original_data = original_data[quanti_var + quali_var].copy()
# y_original_data = original_data[target].copy()

# Liste des modèles

In [5]:
# Build the candidate models (the shared seed is passed in) and the two pipelines.
models = create_models(seed)
X_preprocessor = create_x_pipeline()
# NOTE(review): `y_preprocessor` is never fitted or applied in the visible cells;
# it is only handed to `make_prediction`, where its use is commented out.
y_preprocessor = create_y_pipeline()

# Traitement des données

## Pipelines

In [6]:
# Fit the feature pipeline on the whole training frame and transform it.
# NOTE(review): the pipeline is fitted before the train/test split performed later,
# so held-out rows take part in fitting the preprocessing — confirm this is intended.
processed_X = X_preprocessor.fit_transform(X)
# The target is used as-is; no target transformation is applied.
processed_y = y

In [9]:
# processed_X_original_data = X_preprocessor.transform(X_original_data)
# processed_y_original_data = y_preprocessor.transform(y_original_data)

In [10]:
# print_shapes(
#     processed_X, processed_y, processed_X_original_data, processed_y_original_data
# )

## Par défaut

In [7]:
# Hold out 5% of the processed data, using the shared seed so the split is repeatable.
# NOTE(review): X_test / y_test are not used by any cell visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    processed_X,
    processed_y,
    test_size=0.05,
    random_state=seed,
)

In [12]:
# X_train, y_train = add_original_data(
#     X_train, y_train, processed_X_original_data, processed_y_original_data
# )

## Grid search CV (temporary)

In [13]:
#gs_xgboost(X_train, y_train)

## Model evaluation

In [8]:
# Evaluate every candidate model on the training split. `prefix` labels each run
# (the output shows names such as "default/LGBMRegressor") and is reused later
# to name the saved results file.
prefix = "default"
results = evaluate_models(models, prefix, X_train, y_train)

default/LinearRegression
default/RandomForestRegressor
default/ExtraTreesRegressor
default/HistGradientBoostingRegressor
default/XGBRegressor
default/CatBoostRegressor
default/LGBMRegressor


In [9]:
# Rank the runs by the value at position 1 of each row (the mean score), highest first.
sorted(results, key=lambda row: row[1], reverse=True)

[['default/LGBMRegressor',
  -0.634802244927279,
  -0.6296421407331567,
  0.030492625186511135,
  -0.6962835571789032,
  0.4402796745300293],
 ['default/HistGradientBoostingRegressor',
  -0.6432434252743808,
  -0.6403665415777149,
  0.031624428429472086,
  -0.7023350057037463,
  1.109798288345337],
 ['default/ExtraTreesRegressor',
  -0.6466999999999997,
  -0.658999999999998,
  0.04267036442309907,
  -0.7099999999999991,
  5.218614888191223],
 ['default/RandomForestRegressor',
  -0.6527999999999998,
  -0.6507499999999997,
  0.027133190007811826,
  -0.6980000000000017,
  15.131552958488465],
 ['default/CatBoostRegressor',
  -0.6595746414250416,
  -0.6584758554357684,
  0.029105646434010635,
  -0.7092817575025769,
  14.204113602638245],
 ['default/XGBRegressor',
  -0.6596030616760252,
  -0.6726747632026673,
  0.04539035406484487,
  -0.7256546020507812,
  0.709552812576294],
 ['default/LinearRegression',
  -0.9581892945180401,
  -0.9551259435872539,
  0.020832942514837863,
  -1.00270415861

# Save & submit

## Save the results

In [10]:
# Tabulate the evaluation rows (one per model) for display and export.
results_df = pd.DataFrame(
    data=results,
    columns=[
        "Estimator",
        "Mean Score",
        "Median Score",
        "Std",
        "Min Score",
        "Fit Time",
    ],
)
results_df

Unnamed: 0,Estimator,Mean Score,Median Score,Std,Min Score,Fit Time
0,default/LinearRegression,-0.958189,-0.955126,0.020833,-1.002704,0.009451
1,default/RandomForestRegressor,-0.6528,-0.65075,0.027133,-0.698,15.131553
2,default/ExtraTreesRegressor,-0.6467,-0.659,0.04267,-0.71,5.218615
3,default/HistGradientBoostingRegressor,-0.643243,-0.640367,0.031624,-0.702335,1.109798
4,default/XGBRegressor,-0.659603,-0.672675,0.04539,-0.725655,0.709553
5,default/CatBoostRegressor,-0.659575,-0.658476,0.029106,-0.709282,14.204114
6,default/LGBMRegressor,-0.634802,-0.629642,0.030493,-0.696284,0.44028


In [11]:
# Persist the results table under a file named after the run prefix (row index omitted).
results_df.to_csv(f"data/results/{prefix}.csv", index=False)

## Kaggle submission

In [33]:
# Pick the model used for the Kaggle submission. The submission name doubles as
# the key into `models`, so the saved file can never be labelled with a different
# model than the one actually used.
submission_name = "LGBMRegressor"
best_model = models[submission_name]

In [None]:
# Disabled experiment: manually tuned XGBoost candidate.
# This cell has no execution count and cannot run as written: the
# `from xgboost import XGBClassifier` import in the first cell is commented out,
# so executing it on a fresh kernel raises NameError. It would also replace the
# model selected in the previous cell while `submission_name` still reads
# "LGBMRegressor", mislabelling the submission file.
# NOTE(review): every model evaluated above is a regressor; a classifier looks
# out of place here — confirm before reviving.
# To re-enable: restore the import, uncomment the lines below and update
# `submission_name` to match.
# best_model = XGBClassifier(
#     **{
#         "colsample_bytree": 0.3,
#         "learning_rate": 0.05,
#         "max_depth": 6,
#         "n_estimators": 1000,
#     },
#     random_state=0
# )

In [34]:
# Fit the selected model on the full processed training set.
# NOTE(review): `make_prediction` (called below with the same data) fits the model
# again, so this fit looks redundant — confirm it is only kept for its output.
best_model.fit(processed_X, processed_y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2480
[LightGBM] [Info] Number of data points in the train set: 10407, number of used features: 11
[LightGBM] [Info] Start training from score 4.647126


In [36]:

def make_prediction(
    model,  # todo: maybe type isn't 100% accurate
    X_train: pd.DataFrame | np.ndarray,
    y_train: pd.DataFrame | np.ndarray,
    X_kaggle: pd.DataFrame | np.ndarray,
    X_preprocessor,  # todo: maybe type isn't 100% accurate
    y_preprocessor,  # todo: maybe type isn't 100% accurate
) -> pd.DataFrame:
    """todo"""
    model.fit(X_train, y_train)

    X_kaggle_processed = pd.DataFrame(
        X_preprocessor.transform(X_kaggle),
        # columns=X_preprocessor.get_feature_names_out(), #fixme: redo this line
    )
    raw_predictions = model.predict(X_kaggle_processed)
    # y_pred = y_preprocessor.inverse_transform(raw_predictions)

    return pd.DataFrame(raw_predictions, index=X_kaggle.index, columns=[target])

In [37]:
# Fit the chosen model on all processed training data and predict the Kaggle test set;
# `X_kaggle` is passed raw because `make_prediction` applies `X_preprocessor` itself.
liste_predictions = make_prediction(
    best_model, processed_X, processed_y, X_kaggle, X_preprocessor, y_preprocessor
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2480
[LightGBM] [Info] Number of data points in the train set: 10407, number of used features: 11
[LightGBM] [Info] Start training from score 4.647126


In [38]:
# Write the predictions (index included) to the submission file named after the model.
liste_predictions.to_csv(f"data/predictions/{submission_name}.csv")