Датасет на енкодери:
https://archive.ics.uci.edu/ml/datasets/Mushroom
Задача - спробувати і розібратись як кодують


*   первинний аналіз даних (відсутність пропусків, наявність категоріальних фіч, duplicated, NaN)
*   фича інжиніринг (побудувати 1-2 нові фічі)
*   Прибрати кореляцію між фічами
*   поділ датасету на train, validate, test (+ random_state)

*   GridSearchCV
*   Scaling

*   тренування базової моделі із дефолтними гіперпараметрами (кожну модель)
*   підбір гіперпараметрів (кожну модель)
*   побудувати модель різними способами
*   Metrics: classification_report, ROC-AUC, precision_recall_curve ...
*   оцінка результатів (порівняння всіх на тестовій частині, описати яка краще)
*  Додати ще один Балансер
*  Додати ще один скалер


## INIT BLOCK

### Import Requirements

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score, recall_score, f1_score)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

### Init

In [None]:
# Seed handed to train_test_split as `random_state` so that the
# train/validate/test partition is the same on every run.
RANDOM_STATE: int = 1729


def mount_google_drive() -> bool | None:
    """
    Function to mount Google Drive.
    :return: True if mounting is successful, None otherwise.
    """
    try:
        from google.colab.drive import mount

        mount(mountpoint="/content/drive")
        return True

    except Exception as error:
        print(f"Error while mounting Google Drive: {error}")
        raise


def get_data_frame(dataset_path: str, *args, sep: str = ",", **kwargs) -> pd.DataFrame | None:
    """
    Function to convert the dataset into a pd.DataFrame.
    :param sep: Separator for csv data, default - ",".
    :param dataset_path: Path or URL of the dataset.
    :return: pd.DataFrame containing the dataset, or None if errors occur.
    """
    try:
        return pd.read_csv(filepath_or_buffer=dataset_path, sep=sep, **kwargs)

    except Exception as error:
        print(f"Error while converting dataset to NumPy array: {error}")
        raise


def main(dataset_path: str, *args, **kwargs) -> pd.DataFrame | None:
    """
    Mount Google Drive and load the dataset stored at ``dataset_path``.

    :param dataset_path: Location of the dataset file.
    :param args: Extra positional arguments passed on to ``get_data_frame``.
    :param kwargs: Extra keyword arguments passed on to ``get_data_frame``.
    :return: pd.DataFrame containing the dataset; None only when
        ``mount_google_drive`` returns a falsy value.
    :raises FileNotFoundError: If ``dataset_path`` does not exist.
    """
    import os

    if not mount_google_drive():
        return None

    if not os.path.exists(dataset_path):
        # Carry the offending path in the exception instead of raising a bare
        # FileNotFoundError next to a separate print.
        raise FileNotFoundError(f"Dataset path doesn't exist: {dataset_path}")

    # dataset_path is passed positionally: in the keyword form any extra
    # positional argument was bound to the same parameter and the call failed
    # with "got multiple values for argument 'dataset_path'".
    return get_data_frame(dataset_path, *args, **kwargs)


if __name__ == "__main__":
    # header=None: the first row of the file is read as data, not as column names.
    glass_dataframe: pd.DataFrame | None = main(
        dataset_path="/content/drive/MyDrive/Hillel/Machine_Learning_Course/HW7/mushroom/agaricus-lepiota.data",
        header=None,
    )

    if isinstance(glass_dataframe, pd.DataFrame):
        # All later cells operate on a copy, leaving the frame as loaded intact.
        working_dataframe: pd.DataFrame = glass_dataframe.copy()

        target_column_name: str = "poisonous"
        features: list = [
            "cap_shape",
            "cap_surface",
            "cap_color",
            "bruises?",
            "odor",
            "gill_attachment",
            "gill_spacing",
            "gill_size",
            "gill_color",
            "stalk_shape",
            "stalk_root",
            "stalk_surface_above_ring",
            "stalk_surface_below_ring",
            "stalk_color_above_ring",
            "stalk_color_below_ring",
            "veil_type",
            "veil_color",
            "ring_number",
            "ring_type",
            "spore_print_color",
            "population",
            "habitat",
        ]
        # Column names in file order: the class label first, then the attributes.
        headers: list = [target_column_name, *features]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data analysis, cleaning and optimization

### Перевіряємо на дублікати

In [None]:
# Report whether any row repeats an earlier one.
duplicate_rows_found = working_dataframe.duplicated().any()
duplicates_message = (
    "Є дублікати, потрібна обробка датасету."
    if duplicate_rows_found
    else "Дублікатів немає."
)
print(duplicates_message, end="\n\n")

Дублікатів немає.


### Обробка відсутніх значень

In [None]:
# Per the UCI dataset description, unknown values in this file are written as
# a literal "?" rather than left empty, so isna() alone reports a clean
# dataset. Count the placeholder as missing as well.
placeholder_counts = (working_dataframe == "?").sum()
has_missing_values = bool(working_dataframe.isna().any().any() or placeholder_counts.any())

print(f"{'Є пропущені значення, потрібна обробка датасету.' if has_missing_values else 'Пропущених значень немає.'}", end="\n\n")

if placeholder_counts.any():
    # Show which columns contain the placeholder and how many times.
    print(placeholder_counts[placeholder_counts > 0], end="\n\n")

working_dataframe.info()
working_dataframe.head()

Пропущених значень немає.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Додаємо хедери

In [None]:
# Replace the integer column labels with descriptive names: the target name
# first, followed by the 22 attribute names, as assembled in `headers`.
working_dataframe.columns = headers

working_dataframe.info()
working_dataframe.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises?                  8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachment           8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises?,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Генеруємо статистику

In [None]:
# Per-column summary. Every column is of object dtype here, so the table shows
# count / unique / top / freq rather than numeric percentiles.
working_dataframe.describe(include="all", percentiles=[.25, .5, .75])

Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises?,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


## Feature Engineering

### Feature Encoding

In [None]:
# Keep the label aside: only the attribute columns are one-hot encoded.
target_array: pd.Series = working_dataframe[target_column_name]

# Dense int8 output keeps the 0/1 indicator matrix small and lets it be wrapped
# in a DataFrame directly.
encoder: OneHotEncoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8)
encoded_features = encoder.fit_transform(X=working_dataframe[features])

# Reuse the source index. The target is attached below by index alignment, so
# a fresh RangeIndex would pair rows with the wrong labels (or NaN) as soon as
# the source index is not 0..n-1, e.g. after rows have been dropped.
working_dataframe: pd.DataFrame = pd.DataFrame(
    data=encoded_features,
    columns=encoder.get_feature_names_out(),
    index=target_array.index,
)
working_dataframe[target_column_name] = target_array

working_dataframe.info()
working_dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Columns: 118 entries, cap_shape_b to poisonous
dtypes: int8(117), object(1)
memory usage: 991.8+ KB


Unnamed: 0,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,cap_surface_g,cap_surface_s,cap_surface_y,...,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,poisonous
0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,p
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,e
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,e
3,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,p
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,e
8120,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,e
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,e
8122,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,p


## Preparing data for training

### Розбиваємо датасет на фічі та таргет

In [None]:
# Model inputs: every column except the label.
x_features: pd.DataFrame = working_dataframe.drop(columns=[target_column_name])
# Model output: the label column.
y_target: pd.Series = working_dataframe[target_column_name]

### Масштабування фічів

Оскільки всі фічі є категоріальними і ми закодували їх у "0" та "1", масштабування не потрібне.

### Розбиваємо дані на тренувальну, валідаційну та тестову частини

In [None]:
# First cut: 33 % of the rows are held out, the rest is the training split.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_features,
    y_target,
    test_size=0.33,
    random_state=RANDOM_STATE,
    stratify=y_target,
)
# Second cut: 33 % of the held-out rows become the test split, the remainder
# is the validation split.
x_validate, x_test, y_validate, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=0.33,
    random_state=RANDOM_STATE,
    stratify=y_holdout,
)

for split_features in (x_train, x_validate, x_test):
    print(split_features.shape)

(5443, 117)
(1796, 117)
(885, 117)


### Перевіряємо розподіл класів

In [None]:
# Class counts for the full dataset and for the training split.
overall_class_counts = working_dataframe[target_column_name].value_counts()
train_class_counts = y_train.value_counts()

print(f"Загальний таргет:\n{overall_class_counts}", end="\n\n")
print(f"Тренувальний таргет:\n{train_class_counts}")

Загальний таргет:
e    4208
p    3916
Name: poisonous, dtype: int64

Тренувальний таргет:
e    2819
p    2624
Name: poisonous, dtype: int64


Бачимо, що класи майже збалансовані і різниця невелика ((195 / 2819) * 100 ≈ 6.92%), тому балансування не потрібне.

In [None]:
# No resampling is applied: the "resampled" names are plain aliases of the
# training split, which train_model() fits on.
x_resampled, y_resampled = x_train, y_train

# FIT PREDICT

### Functions for fit models, make predict and generate metrics

In [None]:
def print_model_name(name: str) -> None:
    """
    Print a 60-apostrophe separator line followed by the "MODEL: <name>" line.

    :param name: Title of the model being reported.
    """
    separator = "'" * 60
    print(separator, f"MODEL: {name}", sep="\n")

In [None]:
def value_results_main(model, predict, y_test, average: str = "macro"):
    """
    Print accuracy, precision, recall and F1 for a set of predictions.

    :param model: Estimator the predictions came from. Not used; kept so that
        existing call sites remain valid.
    :param predict: Predicted labels.
    :param y_test: True labels the predictions are scored against.
    :param average: Averaging mode handed to precision, recall and F1.
        Defaults to "macro" (unweighted mean of the per-class scores). The
        previously hard-coded "micro" mode pools all decisions, which for
        single-label classification makes all three scores equal to accuracy,
        so the four printed lines carried no extra information. Pass
        ``average="micro"`` to get the old numbers.
    """
    scores = {
        "Accuracy": accuracy_score(y_true=y_test, y_pred=predict),
        "Precision": precision_score(y_true=y_test, y_pred=predict, average=average),
        "Recall": recall_score(y_true=y_test, y_pred=predict, average=average),
        "F1 Score": f1_score(y_true=y_test, y_pred=predict, average=average),
    }

    for metric_name, metric_value in scores.items():
        print(f"{metric_name}:", metric_value)

    # Closing separator, same width as the one printed by print_model_name.
    print("'" * 60)

In [None]:
def value_results(model, name:str, predict, y_test) -> None:
    """
    Print the model title followed by its metrics.

    :param model: Estimator forwarded unchanged to ``value_results_main``.
    :param name: Title shown in the "MODEL: ..." line.
    :param predict: Predicted labels.
    :param y_test: True labels the predictions are compared with.
    """
    print_model_name(name=name)
    value_results_main(model=model, predict=predict, y_test=y_test)

In [None]:
def train_model(model_class, name: str, grid_params=None):
    """
    Fit a classifier on the training data and report validation metrics.

    Reads the module-level ``x_resampled`` / ``y_resampled`` for fitting and
    ``x_validate`` / ``y_validate`` for scoring.

    :param model_class: Estimator class or an already constructed estimator.
        A class is instantiated with its default hyperparameters.
    :param name: Title printed above the report.
    :param grid_params: Optional parameter grid. When non-empty, the estimator
        is wrapped in GridSearchCV and the best parameters and score are
        printed.
    :return: The fitted estimator, or the fitted GridSearchCV object when a
        grid is used.
    """
    print_model_name(name=name)

    # Call sites pass a class for the default run and an instance for the grid
    # run. Normalise here so either form works on both paths; previously a
    # class with a grid, or an instance without one, failed.
    estimator = model_class() if isinstance(model_class, type) else model_class

    if grid_params:
        model = GridSearchCV(estimator=estimator, param_grid=grid_params).fit(X=x_resampled, y=y_resampled)

        print("Best params: ", model.best_params_)
        print("Best score: ", model.best_score_)

    else:
        model = estimator.fit(X=x_resampled, y=y_resampled)

    y_pred = model.predict(X=x_validate)

    print(f"Classification report:\n{classification_report(y_true=y_validate, y_pred=y_pred)}")

    value_results_main(model=model, predict=y_pred, y_test=y_validate)

    return model


### Тренуємо моделі

In [None]:
# Baseline: KNeighborsClassifier with default hyperparameters; train_model()
# prints its metrics on the validation split.
model_kn_classifier: KNeighborsClassifier = train_model(model_class=KNeighborsClassifier, name="KNeighborsClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       930
           p       1.00      1.00      1.00       866

    accuracy                           1.00      1796
   macro avg       1.00      1.00      1.00      1796
weighted avg       1.00      1.00      1.00      1796

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


In [None]:
# Baseline: GradientBoostingClassifier with default hyperparameters;
# train_model() prints its metrics on the validation split.
model_gboost_classifier: GradientBoostingClassifier = train_model(model_class=GradientBoostingClassifier, name="GradientBoostingClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: GradientBoostingClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       930
           p       1.00      1.00      1.00       866

    accuracy                           1.00      1796
   macro avg       1.00      1.00      1.00      1796
weighted avg       1.00      1.00      1.00      1796

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


In [None]:
# Baseline: RandomForestClassifier with default hyperparameters;
# train_model() prints its metrics on the validation split.
model_random_forest_classifier: RandomForestClassifier = train_model(model_class=RandomForestClassifier, name="RandomForestClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: RandomForestClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       930
           p       1.00      1.00      1.00       866

    accuracy                           1.00      1796
   macro avg       1.00      1.00      1.00      1796
weighted avg       1.00      1.00      1.00      1796

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


In [None]:
# KNeighborsClassifier with hyperparameters supplied through GridSearchCV.
# Each list holds a single value, so the grid contains one candidate; wider
# candidate lists are noted next to each key.
grid_kn_clasifier_params = {
    "n_neighbors": [1],
    "weights": ["uniform"],  # candidates: "uniform", "distance"
    "metric": ["euclidean"],  # candidates: "euclidean", "manhattan", "cityblock", "l1", "l2", "cosine", "nan_euclidean"
    "algorithm": ["auto"],  # candidates: "auto", "ball_tree", "kd_tree", "brute"
    # "leaf_size" and "p" are not part of the grid.
}
model_kn_classifier_grid = train_model(
    model_class=KNeighborsClassifier(),
    name="KNeighborsClassifier with Grid hyperparameters",
    grid_params=grid_kn_clasifier_params,
)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with Grid hyperparameters
Best params:  {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
Best score:  1.0
Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       930
           p       1.00      1.00      1.00       866

    accuracy                           1.00      1796
   macro avg       1.00      1.00      1.00      1796
weighted avg       1.00      1.00      1.00      1796

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


In [None]:
# GradientBoostingClassifier with hyperparameters supplied through GridSearchCV.
# Each list holds a single value, so the grid contains one candidate. The
# trailing notes record the author's understanding of each parameter's range.
grid_g_boost_clasifier_params = {
    # "loss": ["log_loss", "exponential"],
    "loss": ["log_loss"],
    "learning_rate": [0.5], # [0.0, inf)
    "n_estimators": [97],  # [1, inf)
    "subsample": [0.9],  # (0.0, 1.0].
    "criterion": ["friedman_mse"], # ["friedman_mse", "squared_error"],
    "min_samples_split": [2], # [2, inf)
    "min_samples_leaf": [1], # [1, inf)
    "min_weight_fraction_leaf": [0.0],  # [0.0,0.5]
    "max_depth": [2],
    "min_impurity_decrease": [0.0],  # NOTE(review): earlier note said "> 0", yet 0.0 is used here — confirm the valid range
    "max_features": ["log2"], # ["log2", "sqrt"],
    "verbose": [1],  # makes fitting print the per-iteration progress table
    "warm_start": [True],# NOTE(review): earlier note read "only"; intent unclear — confirm
    "n_iter_no_change": [None], # NOTE(review): earlier note said "> 1" — confirm the valid range
}
model_gboost_classifier_grid = train_model(model_class=GradientBoostingClassifier(), name="GradientBoostingClassifier with Grid hyperparameters", grid_params=grid_g_boost_clasifier_params)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: GradientBoostingClassifier with Grid hyperparameters
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0774           0.2899            0.30s
         2           0.9704           0.0884            0.96s
         3           0.6748           0.2884            0.74s
         4           0.6109           0.0467            0.64s
         5           0.5896           0.0273            0.57s
         6           0.5305           0.0594            0.58s
         7           0.4701           0.0664            0.54s
         8           0.4403           0.0330            0.49s
         9           0.3568           0.0958            0.46s
        10           0.3271           0.0312            0.43s
        20           0.0867           0.0353            0.44s
        30           0.0492           0.0022            0.41s
        40           0.0289           0.0010            0.37s
        50

In [None]:
# RandomForestClassifier with hyperparameters supplied through GridSearchCV.
# The grid has its own name: it used to be assigned to
# grid_g_boost_clasifier_params, silently replacing the gradient-boosting grid.
# Each list holds a single value; wider candidate lists are noted per key.
grid_random_forest_classifier_params = {
    "n_estimators": [70],
    "criterion": ["entropy"],  # candidates: "gini", "entropy", "log_loss"
    "max_features": ["log2"],  # candidates: "sqrt", "log2", None
    "bootstrap": [True],  # candidates: True, False
    "oob_score": [True],  # candidates: True, False
    "warm_start": [False],  # candidates: True, False
    "class_weight": ["balanced_subsample"],  # candidates: "balanced", "balanced_subsample"
}
model_random_forest_classifier_grid = train_model(
    model_class=RandomForestClassifier(),
    name="RandomForestClassifier with Grid hyperparameters",
    grid_params=grid_random_forest_classifier_params,
)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: RandomForestClassifier with Grid hyperparameters
Best params:  {'bootstrap': True, 'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 70, 'oob_score': True, 'warm_start': False}
Best score:  1.0
Classification report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       930
           p       1.00      1.00      1.00       866

    accuracy                           1.00      1796
   macro avg       1.00      1.00      1.00      1796
weighted avg       1.00      1.00      1.00      1796

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


### Порівняння результатів

In [None]:
# Final comparison on the test split: the default and the grid variant of each
# classifier are reported one after the other.
fitted_models = (
    ("KNeighborsClassifier with default hyperparameters", model_kn_classifier),
    ("KNeighborsClassifier with Grid hyperparameters", model_kn_classifier_grid),
    ("GradientBoostingClassifier with default hyperparameters", model_gboost_classifier),
    ("GradientBoostingClassifier with Grid hyperparameters", model_gboost_classifier_grid),
    ("RandomForestClassifier with default hyperparameters", model_random_forest_classifier),
    ("RandomForestClassifier with Grid hyperparameters", model_random_forest_classifier_grid),
)

for model_title, fitted_model in fitted_models:
    value_results(
        model=fitted_model,
        name=model_title,
        predict=fitted_model.predict(X=x_test),
        y_test=y_test,
    )

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with default hyperparameters
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with Grid hyperparameters
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: GradientBoostingClassifier with default hyperparameters
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: GradientBoostingClassifier with Grid hyperparameters
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Висновок

Тести проводились на різних комбінаціях моделей без додаткових фіч.
Всі моделі показали максимальний результат.