# Import des outils / jeu de données

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import id_col, quali_var, quanti_var, target
from prediction import create_models, evaluate_models, make_prediction
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)

In [2]:
# Single seed for the notebook: it seeds NumPy's legacy global RNG here and is
# also passed to create_models() and train_test_split() further down.
SEED = 0
np.random.seed(SEED)
# Apply seaborn's default theme to figures drawn later in the notebook.
sns.set_theme()

In [28]:
# Load data/original_dataset_train.csv. No index_col is given, so the frame
# keeps pandas' default 0..n-1 RangeIndex.
original_data = pd.read_csv("data/original_dataset_train.csv")

In [35]:
# Load data/train.csv, using the column named by `id_col` as the row index.
df = pd.read_csv("data/train.csv", index_col=id_col)

In [36]:
# Stack the original dataset on top of data/train.csv.
# `original_data` carries a default RangeIndex while `df` is indexed by `id_col`,
# so a plain concat yields duplicate index labels (and that index is reused for
# the preprocessed frame later on). ignore_index=True gives every row a unique
# positional label instead.
# NOTE(review): this cell rebinds `df` from itself, so running it twice without
# re-reading data/train.csv appends `original_data` a second time.
df = pd.concat((original_data, df), ignore_index=True)

## Variables globales

In [37]:
# Distinct values found in the target column, in order of first appearance.
LABELS = df[target].unique()

In [38]:
# Feature matrix: the quantitative columns followed by the qualitative ones.
# Target vector: the column named by `target`.
# Both are copied so later edits never write back into `df`.
X = df.loc[:, quanti_var + quali_var].copy()
y = df.loc[:, target].copy()

In [39]:
# Encode the target labels as integers.
# The encoder is fitted on the raw target column rather than on `y` itself so the
# cell is idempotent: re-running it no longer refits the encoder on integer codes,
# which would replace the original labels in `le.classes_` with those codes
# (`le` is later handed to make_prediction()).
le = LabelEncoder()
y = le.fit_transform(df[target])

# Liste des modèles

In [40]:
# Build the candidate models with the project helper, passing the notebook seed.
# The result is indexed by model name further down (e.g. models["LGBMClassifier"]).
models = create_models(SEED)

# Traitement des données

## Pipeline Scaler & OneHotEncoding

In [41]:
# Categorical columns routed to the "quali_non_ohe" branch of the preprocessor
# (most-frequent imputation followed by ordinal encoding) rather than one-hot.
var_cat_non_ohe = [
    "surgery", "age", "capillary_refill_time",
    "nasogastric_reflux", "surgical_lesion", "cp_data",
]

In [42]:
# Categorical columns that are one-hot encoded by the "quali_ohe" branch of the
# preprocessor.
# NOTE(review): the original trailing comment read `quali_var` — presumably an
# earlier version used the full `quali_var` list here; confirm.
var_cat_ohe = [
    "temp_of_extremities", "peripheral_pulse", "mucous_membrane",
    "pain", "peristalsis", "abdominal_distention",
    "nasogastric_tube", "rectal_exam_feces",
    "abdomen", "abdomo_appearance",
]

In [43]:
# Numeric branch: fill missing values with the column mean (SimpleImputer's
# default strategy, spelled out here), then standardise with StandardScaler.
quanti_processor = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [44]:
# Ordinal branch for categorical columns: fill missing values with the most
# frequent category, then map categories to ordinal codes. Categories not seen
# during fit are encoded as NaN instead of raising.
_ordinal_encoder_kwargs = {"handle_unknown": "use_encoded_value", "unknown_value": np.nan}
quali_processor = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(**_ordinal_encoder_kwargs)),
    ]
)
del _ordinal_encoder_kwargs  # keep the notebook namespace unchanged

In [45]:
# Column-wise preprocessing:
#   - "quali_ohe":        one-hot encode var_cat_ohe (dense output)
#   - "quali_non_ohe":    impute + ordinal-encode var_cat_non_ohe
#   - "quanti_processor": impute + scale the quantitative columns
# Any column not listed in a branch is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        (
            "quali_ohe",
            OneHotEncoder(sparse_output=False, handle_unknown="infrequent_if_exist"),
            var_cat_ohe,
        ),
        ("quali_non_ohe", quali_processor, var_cat_non_ohe),
        ("quanti_processor", quanti_processor, quanti_var),
    ],
    remainder="passthrough",
)

In [46]:
# Fit the preprocessor on X and wrap the transformed array in a DataFrame that
# keeps df's row index and uses the transformer-generated feature names.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split below, so imputation/scaling statistics also see the held-out rows.
nouveau_df = pd.DataFrame(
    data=preprocessor.fit_transform(X),
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

In [47]:
# Sanity check: (rows, columns) of the preprocessed frame.
nouveau_df.shape

(1534, 79)

## Par défaut

In [48]:
# Hold out 20% of the preprocessed rows; the split is seeded for reproducibility.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance of the hold-out set matters.
X_train, X_test, y_train, y_test = train_test_split(
    nouveau_df,
    y,
    random_state=SEED,
    test_size=0.2,
)

In [49]:
# Evaluate every candidate model on the training split only. `prefix` shows up
# in the reported names (e.g. "défaut/LGBMClassifier" in the output).
# NOTE(review): the entries look like [name, mean score, score std] — presumably
# cross-validated inside evaluate_models; confirm in prediction.py.
prefix = "défaut"
results = evaluate_models(models, prefix, X_train, y_train)

défaut/DummyClassifier_Uniform
défaut/DummyClassifier_MostFrequent
défaut/KNeighborsClassifier5
défaut/LinearSVC




défaut/LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

défaut/LinearDiscriminantAnalysis
défaut/RandomForestClassifier
défaut/XGBClassifier
défaut/CatBoostClassifier
défaut/LGBMClassifier
[['défaut/LGBMClassifier', 0.7302079168332667, 0.0360492185095391], ['défaut/XGBClassifier', 0.7203851792616287, 0.05087913491842679], ['défaut/CatBoostClassifier', 0.7179394908703186, 0.038338609160074796], ['défaut/RandomForestClassifier', 0.7, 0.04089569699427146], ['défaut/LinearDiscriminantAnalysis', 0.6739304278288685, 0.05260557545411276], ['défaut/LinearSVC', 0.6723044115687058, 0.048964848776472884], ['défaut/LogisticRegression', 0.6682060509129681, 0.05346744010235199], ['défaut/KNeighborsClassifier5', 0.6445755031320806, 0.06154547736417482], ['défaut/DummyClassifier_MostFrequent', 0.4930694388911102, 0.003182714630483682], ['défaut/DummyClassifier_Uniform', 0.34308276689324274, 0.05011327073780119]]


In [50]:
# Rank the evaluation entries by their second field (the score), best first.
sorted(results, reverse=True, key=lambda entry: entry[1])

[['défaut/LGBMClassifier', 0.7302079168332667, 0.0360492185095391],
 ['défaut/XGBClassifier', 0.7203851792616287, 0.05087913491842679],
 ['défaut/CatBoostClassifier', 0.7179394908703186, 0.038338609160074796],
 ['défaut/RandomForestClassifier', 0.7, 0.04089569699427146],
 ['défaut/LinearDiscriminantAnalysis',
  0.6739304278288685,
  0.05260557545411276],
 ['défaut/LinearSVC', 0.6723044115687058, 0.048964848776472884],
 ['défaut/LogisticRegression', 0.6682060509129681, 0.05346744010235199],
 ['défaut/KNeighborsClassifier5', 0.6445755031320806, 0.06154547736417482],
 ['défaut/DummyClassifier_MostFrequent',
  0.4930694388911102,
  0.003182714630483682],
 ['défaut/DummyClassifier_Uniform', 0.34308276689324274, 0.05011327073780119]]

# Soumission Kaggle

In [51]:
# Load data/test.csv, using the column named by `id_col` as the row index.
X_kaggle = pd.read_csv("data/test.csv", index_col=id_col)

In [52]:
# Preview the first rows of the raw (not yet preprocessed) test set.
X_kaggle.head()

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1235,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,less_3_sec,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1236,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,more_3_sec,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
1237,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,less_3_sec,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
1238,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,less_3_sec,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
1239,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,less_3_sec,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


In [53]:
# Pick the LGBMClassifier entry: it has the highest score in the evaluation
# output above. The bare expression on the last line displays the estimator.
best_model = models["LGBMClassifier"]
best_model

In [54]:
# Produce predictions for the test set with the selected model.
# NOTE(review): X_train is already preprocessed while X_kaggle is raw —
# presumably make_prediction applies `preprocessor` to X_kaggle and uses `le`
# to map predicted codes back to labels; confirm in prediction.py.
liste_predictions = make_prediction(
    best_model, X_train, y_train, X_kaggle, preprocessor, le
)

In [56]:
# Base name (without extension) of the submission file written below.
submission_name = "original_data"

In [57]:
# Write the predictions to data/results/<submission_name>.csv.
# NOTE(review): the data/results directory is assumed to exist already.
liste_predictions.to_csv(f"data/results/{submission_name}.csv")