In [1]:
import sys
import matplotlib.pyplot as plt
import pandas as pd
from joblib import dump, load

In [2]:
import os

# Print the kernel's working directory; the CSV files in the next cell are
# opened with relative paths, so they are resolved against this location.
current_directory = os.getcwd()
print(current_directory)

c:\Users\hugph\ML\s4e8


In [13]:
# Load the train/test CSVs from the working directory. The first column of
# each file is used as the row index (it shows up as `id` in later output).
# Plain string literals are used: the paths contain no placeholders, so an
# f-string prefix is not needed.
train = pd.read_csv('train.csv', index_col = 0)
test = pd.read_csv('test.csv', index_col = 0)

# Name of the label column to predict.
target = 'class'

In [14]:
import pandas as pd
import numpy as np


def convert_data_types(train, test, target_column):
    """Convert text columns of train/test to categoricals with shared categories.

    The target column is set aside, the remaining train columns are stacked
    on top of ``test`` so that every categorical column ends up with the same
    category set in both frames, and missing values in text/categorical
    columns are replaced by the literal category ``'unk'``. Other columns
    (e.g. numeric ones) are left untouched, including their missing values.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain ``target_column``.
    test : pandas.DataFrame
        Test frame with the same feature columns as ``train``.
    target_column : str
        Name of the label column in ``train``.

    Returns
    -------
    (new_train, new_test) : tuple of pandas.DataFrame
        Converted copies. ``new_train`` has ``target_column`` re-attached as
        its last column. The input frames are not modified.
    """
    missing_label = 'unk'

    target = train[target_column].copy()
    # drop() and concat() both return new objects, so the caller's frames are
    # never mutated and no defensive up-front copies are required.
    features = train.drop(columns = [target_column])
    combined = pd.concat([features, test], keys = ['train', 'test'])

    for column in combined.columns:
        series = combined[column]
        dtype = series.dtype

        if dtype.name == 'category':
            if series.isna().any():
                # add_categories raises ValueError if the category already
                # exists (e.g. when this function is re-run on its own
                # output), so only add the label when it is missing.
                if missing_label not in series.cat.categories:
                    series = series.cat.add_categories(missing_label)
                combined[column] = series.fillna(missing_label)
        elif pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            # Covers classic object columns as well as pandas' dedicated
            # string dtype, which the `== 'object'` comparison does not match.
            combined[column] = series.fillna(missing_label).astype('category')

    new_train = combined.loc['train'].copy()
    new_test = combined.loc['test'].copy()

    # Index-aligned assignment: new_train carries the original train index.
    new_train[target_column] = target

    return new_train, new_test

In [15]:
# Rebind train/test to the converted frames returned by convert_data_types
# (text columns become categoricals, with missing values filled as 'unk').
train, test = convert_data_types(train, test, target)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [24]:
# Learn the label -> integer mapping from the raw target values, then replace
# the target column with the encoded integers. The fitted encoder stays bound
# to `le`, so the mapping remains available after this cell.
le = LabelEncoder().fit(train[target])
train[target] = le.transform(train[target])

train[target]

id
0          0
1          1
2          0
3          0
4          0
          ..
3116940    0
3116941    0
3116942    1
3116943    0
3116944    1
Name: class, Length: 3116945, dtype: int64

In [25]:
# Names of all columns whose dtype is object or category, in column order.
categorical_features = list(
    train.select_dtypes(include = ['object', 'category']).columns
)

categorical_features

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [None]:
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping
from optuna.samplers import TPESampler
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from typing import Dict, List, Tuple, Union

class Model_gbdt:
    """Cross-validated LightGBM classifier with Optuna hyper-parameter search.

    Attributes
    ----------
    train, test : pandas.DataFrame
        Training frame (features + target column) and test frame (features).
    target : str
        Name of the label column in ``train``.
    categorical_feats : list of str
        Feature columns to be treated as categorical by LightGBM.
    base_params : dict
        Parameters merged last into every Optuna trial (they override the
        sampled values on key collisions).
    random_state : int
        Seed used for the CV split and the Optuna sampler.
    model_dict : dict
        ``'fold_<k>'`` -> fitted model from the most recent ``fit`` call.
    test_predict_list : list of numpy.ndarray
        Per-fold positive-class probabilities on ``test`` from the most
        recent ``fit`` call.
    """

    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, target: str, categorical_feats: List[str], base_params = None, random_state: int = 42):
        if base_params is None:
            base_params = {}

        self.train = train
        self.test = test
        self.model_dict: Dict[str, LGBMClassifier] = {}
        self.test_predict_list: List[np.ndarray] = []
        self.categorical_feats = categorical_feats
        self.target = target
        self.base_params = base_params
        self.random_state = random_state

    def objective(self, trial: "optuna.Trial") -> float:
        """Optuna objective: sample LightGBM parameters and return the CV score.

        The returned value is the mean of the list produced by ``fit`` (the
        per-fold MCCs followed by the overall out-of-fold MCC).
        """
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 50, 1500),
            # `colsample_bytree` is a LightGBM alias of `feature_fraction`, and
            # `min_child_samples` an alias of `min_data_in_leaf`. Sampling both
            # members of a pair tunes one setting twice and LightGBM discards
            # the alias, so only the primary names are sampled here.
            "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            # Row subsampling is inactive while subsample_freq is 0 (the
            # default); enable it so the sampled `subsample` has an effect.
            "subsample_freq": 1,
            "lambda_l1": trial.suggest_float("lambda_l1", 0, 1),
            "lambda_l2": trial.suggest_float("lambda_l2", 0, 1),
            "min_split_gain": trial.suggest_float("min_split_gain", 0, 1),
            "max_bin": trial.suggest_int("max_bin", 10, 1000),
            "max_delta_step": trial.suggest_float("max_delta_step", 0, 1),
            "path_smooth": trial.suggest_float("path_smooth", 0, 10),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
            "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-8, 1.0),
            "max_cat_threshold": trial.suggest_int("max_cat_threshold", 2, 32),
            "cat_l2": trial.suggest_float("cat_l2", 0, 100),
            "cat_smooth": trial.suggest_float("cat_smooth", 0, 100),
            "max_cat_to_onehot": trial.suggest_int("max_cat_to_onehot", 2, 32),
            **self.base_params
        }
        # Test-set predictions are not needed to score a trial; skipping them
        # avoids one full test-set inference per fold per trial.
        scores, _, _ = self.fit(params, predict_test = False)
        return float(np.mean(scores))

    def fit(self, params: Dict[str, Union[int, float, str, bool, List[str]]], predict_test: bool = True) -> Tuple[List[float], List[np.ndarray], np.ndarray]:
        """Train one model per fold of a 5-fold stratified CV.

        Parameters
        ----------
        params : dict
            Keyword arguments for ``LGBMClassifier``. A legacy
            ``'cat_features'`` key is accepted and dropped (it is not a
            LightGBM parameter); categorical columns are taken from
            ``self.categorical_feats`` instead.
        predict_test : bool, default True
            When True, store each fold's positive-class probabilities on
            ``self.test``. When False, ``self.test_predict_list`` is left
            empty.

        Returns
        -------
        scores : list of float
            The five per-fold MCC values followed by the overall
            out-of-fold MCC (six entries).
        test_predict_list : list of numpy.ndarray
            Per-fold test probabilities from this call only.
        oof_valid_preds : numpy.ndarray, shape (n_train, 1)
            Out-of-fold class predictions.
        """
        label_columns = [self.target]
        train_cols = [col for col in self.train.columns.to_list() if col not in label_columns]

        # 'cat_features' is the CatBoost spelling; LightGBM would only warn
        # about an unknown parameter, so strip it rather than forward it.
        params = {key: value for key, value in params.items() if key != 'cat_features'}
        # Restrict to columns that are actually used as features; fall back to
        # LightGBM's automatic detection when nothing is left.
        cat_feats = [col for col in self.categorical_feats if col in train_cols]
        categorical_feature = cat_feats if cat_feats else 'auto'

        features = self.train[train_cols]
        labels = self.train[self.target].to_numpy()

        # Start from a clean slate so results of earlier calls (e.g. previous
        # Optuna trials) never leak into this call's outputs.
        self.model_dict = {}
        self.test_predict_list = []

        scores = []
        oof_valid_preds = np.zeros((features.shape[0], len(label_columns)))
        # Fixed seed: every call is evaluated on the same folds, which keeps
        # scores comparable across trials and runs.
        mskf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = self.random_state)
        for fold, (train_idx, valid_idx) in enumerate(mskf.split(features, labels)):

            X_train, y_train = features.iloc[train_idx], labels[train_idx]
            X_valid, y_valid = features.iloc[valid_idx], labels[valid_idx]

            model = LGBMClassifier(**params)
            model.fit(
                X_train,
                y_train,
                eval_set = [(X_valid, y_valid)],
                categorical_feature = categorical_feature,
                callbacks = [early_stopping(250)],
            )

            valid_preds = model.predict(X_valid)
            oof_valid_preds[valid_idx] = valid_preds.reshape(-1, 1)

            if predict_test:
                test_predict = model.predict_proba(self.test[train_cols])[:, 1]
                self.test_predict_list.append(test_predict)

            mcc = matthews_corrcoef(y_valid, valid_preds)
            scores.append(mcc)
            self.model_dict[f'fold_{fold}'] = model

        # MCC over all training rows using the stitched out-of-fold predictions.
        oof_score = matthews_corrcoef(labels, oof_valid_preds.ravel())
        scores.append(oof_score)
        # Note: this mean covers the five fold scores plus the OOF score.
        print(f'The average Matthews Correlation Coefficient is {np.mean(scores)}')

        return scores, self.test_predict_list, oof_valid_preds

    def optimize(self, n_trials: int = 100, timeout: Union[float, None] = None) -> Dict[str, Union[int, float, str, bool]]:
        """Run an Optuna study that maximizes ``objective``.

        Parameters
        ----------
        n_trials : int, default 100
            Number of trials to run.
        timeout : float or None, default None
            Optional wall-clock budget in seconds; None means no time limit.

        Returns
        -------
        dict
            Sampled parameters of the best trial (``study.best_params``).
        """
        study = optuna.create_study(direction = 'maximize', sampler = TPESampler(seed = self.random_state))
        # n_trials must be passed as n_trials; passing it as `timeout` would
        # turn the trial count into a time limit in seconds.
        study.optimize(self.objective, n_trials = n_trials, timeout = timeout, show_progress_bar = True)

        print("Best trial:")
        trial = study.best_trial
        print(" Value:", trial.value)
        print(" Params:")
        for key, value in trial.params.items():
            print(f" {key}: {value}")
        return study.best_params