In [1]:
import pandas as pd
import optuna
from catboost import CatBoostClassifier
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef as mcc
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [4]:
train.set_index("id", drop=True, inplace=True)
train.head()

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.8,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a


In [5]:
y = train['class']
X = train.drop('class', axis = 1)

In [8]:
categorical_columns = X.select_dtypes(include=['object']).columns


In [13]:
category_mappings = {
     #'class': ['p', 'e'],
    'season': ['a', 'u', 'w', 's'],
    'cap-shape': ['x', 'f', 's', 'b', 'o'],
    'cap-surface': ['t', 's', 'y', 'h', 'g'],
    'cap-color': ['n', 'y', 'w', 'g', 'e'],
    'does-bruise-or-bleed': ['f', 't'],
    'gill-attachment': ['a', 'd', 'x', 'e', 's'],
    'gill-spacing': ['c', 'd', 'f'],
    'gill-color': ['w', 'n', 'y', 'p', 'g'],
    'stem-root': ['b', 's', 'r', 'c', 'f'],
    'stem-surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
    'stem-color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b'],
    'veil-type': ['u', 'w'],
    'veil-color': ['w', 'y', 'n', 'u', 'k', 'e'],
    'has-ring': ['f', 't'],
    'ring-type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
    'spore-print-color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
    'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
}

In [14]:
def clean_category(column, df, valid_category,threshold):
    df[column] = df[column].astype(str)
    counts = df[column].value_counts(normalize = True)


    def map_category(value):
        if value.replace('.','').isdigit():
            return 'Other'

        elif value.lower() in [cat.lower() for cat in valid_category]:
            return next(cat for cat in valid_category if cat.lower() == value.lower())
        elif counts.get(value,0) < threshold:
            return 'Other'
        else:
            return value
    df[column] = df[column].apply(map_category)

    return df
    

In [15]:
for column, valid in category_mappings.items():
    X  = clean_category(column,X,valid,0.001)

In [16]:
print("Total Train Features with NaN Values = " + str(X.columns[X.isnull().sum() != 0].size))

Total Train Features with NaN Values = 1


In [17]:
if (X.columns[X.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(X.columns[X.isnull().sum() != 0])))

Features with NaN => ['cap-diameter']


In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 object 
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), object(18)
memory usage: 523.2+ MB


In [19]:
def mcc_fast(y_true:pd.Series, y_pred:pd.Series):
    y_true = pd.Series(y_true)
    y_pred = pd.Series(y_pred)
    df = pd.concat([y_true, y_pred.reindex(y_true.index)], axis=1)
    df.columns = ['actual','pred']

    # 0=p(poison), 1=e(edible)
    n11f = ((df['actual']=='e') & (df['pred']=='e')).sum()
    n00f = ((df['actual']=='p') & (df['pred']=='p')).sum()
    n10f = ((df['actual']=='e') & (df['pred']=='p')).sum()
    n01f = ((df['actual']=='p') & (df['pred']=='e')).sum()
    n1_f = n11f+n10f
    n0_f = n01f+n00f
    n_0f = n10f+n00f
    n_1f = n11f+n01f
    n1_fs = np.sqrt(n11f+n10f)
    n0_fs = np.sqrt(n01f+n00f)
    n_0fs = np.sqrt(n10f+n00f)
    n_1fs = np.sqrt(n11f+n01f)
    mcc_fast = (n11f*n00f - n10f*n01f) / (n1_fs*n0_fs*n_0fs*n_1fs)
    return mcc_fast




In [20]:
from sklearn.metrics import matthews_corrcoef

def objective(trial, X, y):
    param_grid = param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100,10000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),} 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    scores = []
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = lgbm.LGBMClassifier(objective="binary", **param_grid,verbosity=-1,random_state=0)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[
                    lgbm.early_stopping(stopping_rounds=100),
                    lgbm.log_evaluation(period=0)
                ],eval_metric = 'mcc')
        preds = model.predict(X_test)
        mcc = matthews_corrcoef(y_test,preds)
        scores.append(mcc)
    print(np.mean(scores))
    return np.mean(scores)


In [21]:
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')
study = optuna.create_study(direction = 'maximize')
func = lambda trial: objective(trial,X,y)
study.optimize(func, n_trials = 1)
print('Best trial:', study.best_trial.params)

[I 2024-08-22 16:04:18,998] A new study created in memory with name: no-name-ae68de57-8e7c-4ae3-be9a-03e79330696f


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[858]	valid_0's binary_logloss: 0.048499
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[997]	valid_0's binary_logloss: 0.049696
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1101]	valid_0's binary_logloss: 0.046612
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[688]	valid_0's binary_logloss: 0.0488913
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1152]	valid_0's binary_logloss: 0.0482626


[I 2024-08-22 16:06:10,104] Trial 0 finished with value: 0.9795028528933882 and parameters: {'n_estimators': 5976, 'learning_rate': 0.1745161547317059, 'num_leaves': 1260, 'max_depth': 5, 'min_data_in_leaf': 5400, 'lambda_l1': 95, 'lambda_l2': 80, 'min_gain_to_split': 5.3177095187415215, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.9795028528933882.


0.9795028528933882
Best trial: {'n_estimators': 5976, 'learning_rate': 0.1745161547317059, 'num_leaves': 1260, 'max_depth': 5, 'min_data_in_leaf': 5400, 'lambda_l1': 95, 'lambda_l2': 80, 'min_gain_to_split': 5.3177095187415215, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}


In [None]:
#params from optuna
[I 2024-08-25 08:01:08,773] Trial 15 finished with value: 0.9844643695520991 and parameters: {'n_estimators': 6063, 'learning_rate': 0.12359835539214457, 'num_leaves': 2480, 'max_depth': 12, 'min_data_in_leaf': 8500, 'lambda_l1': 15, 'lambda_l2': 80, 'min_gain_to_split': 0.3736139231328375, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 15 with value: 0.9844643695520991.