In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, matthews_corrcoef
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
pd.set_option('display.max_columns', None)



In [2]:
def replace_non_alpha_with_nan(df, categories):
    """Replace every value outside a column's allow-list with NaN.

    Each categorical column has a fixed list of accepted single-letter codes
    (``features_dict`` below).  Any other value in that column, including
    values that are already missing, ends up as ``np.nan``.

    The allow-list is looked up by column name, with '-' normalised to '_',
    so the order of ``categories`` does not matter and a subset of columns
    may be passed.  Columns without an allow-list are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is modified in place.
    categories : iterable of str
        Names of the columns to validate, e.g. 'cap-shape' or 'cap_shape'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # * Customized feature engineering
    features_dict = {
        'cap_shape': ['x', 'f', 's', 'b', 'o', 'p', 'c'],
        'cap_surface': ['t', 's', 'y', 'h', 'g', 'd', 'k', 'e', 'i', 'w', 'l'],
        'cap_color': ['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'],
        'does_bruise_or_bleed': ['f', 't'],
        'gill_attachment': ['a', 'd', 'x', 'e', 's', 'p', 'f'],
        'gill_spacing': ['c', 'd', 'f'],
        'gill_color': ['w', 'n', 'y', 'p', 'g', 'o', 'k', 'f', 'r', 'e', 'b', 'u'],
        'stem_root': ['b', 's', 'r', 'c', 'f'],
        'stem_surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
        'stem_color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b', 'f'],
        'veil_type': ['u'],
        'veil_color': ['w', 'y', 'n', 'u', 'k', 'e'],
        'has_ring': ['f', 't'],
        'ring_type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
        'spore_print_color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
        'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
        'season': ['a', 'u', 'w', 's']
    }

    for col in categories:
        # Match by name rather than by position: pairing the dict keys with the
        # column list positionally applies the wrong allow-list as soon as the
        # caller's column order differs from the dict order.
        allowed = features_dict.get(str(col).replace('-', '_'))
        if allowed is None:
            continue
        df.loc[~df[col].isin(allowed), col] = np.nan

    return df

In [3]:
def encode_train_and_test_data(df_train, target, kaggle_test, num_cols, cat_cols, *variable):
    """Impute the train and Kaggle test frames with one preprocessor fitted on train.

    Numeric columns go through a 3-nearest-neighbour imputer.  Categorical
    columns have missing values replaced by the constant string 'missing'.
    The preprocessor is fitted on ``df_train`` only and then applied to
    ``kaggle_test``, so no statistics are learned from the test data.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features; must contain every column in ``num_cols`` and ``cat_cols``.
    target
        Currently unused; kept so existing calls keep working.
    kaggle_test : pandas.DataFrame
        Test features with the same columns as ``df_train``.
    num_cols, cat_cols : sequence of str
        Names of the numeric and categorical columns.
    *variable
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed train and test frames.  Columns carry the ColumnTransformer
        prefixes ('num__', 'cat__').  Both frames are built from the
        transformed arrays without an explicit index, so they get a fresh
        default index.
    """
    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    # Imputation only; an encoder step can be appended to this pipeline if a
    # model without native categorical support is used.
    categorical_transformer = Pipeline(steps = [
        ('SimpleImputer', SimpleImputer(strategy='constant', fill_value = 'missing')),
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ],
        remainder = 'passthrough'
    )

    # list() keeps this a concatenation even when a pandas Index is passed;
    # `Index + Index` would add the strings element-wise instead.
    all_columns = list(num_cols) + list(cat_cols)

    train_te = preprocessor.fit_transform(df_train[all_columns])
    feature_names_out = preprocessor.get_feature_names_out()
    df_train_final = pd.DataFrame(train_te, columns = feature_names_out)

    kaggle_test_te = preprocessor.transform(kaggle_test[all_columns])
    kaggle_test_final = pd.DataFrame(kaggle_test_te, columns = feature_names_out)

    # Mixing numeric and string columns yields object-typed frames, so the
    # numeric block is cast back to float64 explicitly.
    num_cols_to_convert = [col for col in df_train_final.columns if col.startswith('num__')]
    df_train_final[num_cols_to_convert] = df_train_final[num_cols_to_convert].astype('float64')
    kaggle_test_final[num_cols_to_convert] = kaggle_test_final[num_cols_to_convert].astype('float64')

    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() only appended a stray "None" line to the output.
    df_train_final.info()

    return df_train_final, kaggle_test_final

In [4]:
def get_cols(df):
    """Return ``(cat_cols, num_cols)``: the frame's categorical and numeric column names.

    Categorical means object or category dtype, with the 'class' column left
    out.  Numeric means any number dtype.  Both lists are also printed.
    """
    cat_cols = []
    for name in df.select_dtypes(['object', 'category']).columns:
        if name == 'class':
            continue
        cat_cols.append(name)

    num_cols = list(df.select_dtypes('number').columns)

    print(f'Categorical columns:\n {cat_cols}\n')
    print(f'Numeric columns:\n {num_cols}')

    return cat_cols, num_cols

In [5]:
def handle_missing_data(df):
    """Fill missing values in the categorical columns with the string 'missing'.

    Columns of category or object dtype are imputed with a constant.  Numeric
    columns are left as they are (* KNN Imputer on num_cols is used during the
    encoding function).  The per-column count of remaining nulls is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    cat_cols = df.select_dtypes(include = ['category', 'object']).columns

    # SimpleImputer rejects input with zero columns, so a frame without any
    # categorical column is passed through instead of raising.
    if len(cat_cols) > 0:
        imputer_cat = SimpleImputer(strategy='constant', fill_value = 'missing')
        df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    print(df.isnull().sum())
    return df

In [6]:
def convert_to_categoricals(df):
    """Cast every object-dtype column of ``df`` to pandas 'category', in place.

    Returns the same frame so the call can be used in an assignment.
    """
    object_columns = list(df.select_dtypes('object').columns)
    for name in object_columns:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df

In [7]:
# Read both competition files and drop the 'id' column right away.
train = pd.read_csv('train.csv').drop('id', axis = 1)
test = pd.read_csv('test.csv').drop('id', axis = 1)

# Split features and target on a copy, so `train` itself keeps the 'class' column.
X = train.copy()
y = X.pop('class')

In [8]:
# Column-name lists derived from the raw training frame; get_cols leaves the
# 'class' target out of the categorical list and prints both lists.
cat_cols, num_cols = get_cols(train);

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [9]:
# Blank out (NaN) categorical values that are not in the per-column allow-lists,
# for both the training features and the test frame.
X = replace_non_alpha_with_nan(X, cat_cols)
test = replace_non_alpha_with_nan(test, cat_cols)

In [10]:
# Encode the target as integer labels. `lbl_enc` is kept because the submission
# cell uses it to map predictions back to the original class labels.
lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(y)

In [11]:
# Impute train and test with a preprocessor fitted on the training features.
# The returned frames use prefixed column names ('num__...', 'cat__...').
X, test = encode_train_and_test_data(X, y, test, num_cols, cat_cols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 20 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   num__cap-diameter          float64
 1   num__stem-height           float64
 2   num__stem-width            float64
 3   cat__cap-shape             object 
 4   cat__cap-surface           object 
 5   cat__cap-color             object 
 6   cat__does-bruise-or-bleed  object 
 7   cat__gill-attachment       object 
 8   cat__gill-spacing          object 
 9   cat__gill-color            object 
 10  cat__stem-root             object 
 11  cat__stem-surface          object 
 12  cat__stem-color            object 
 13  cat__veil-type             object 
 14  cat__veil-color            object 
 15  cat__has-ring              object 
 16  cat__ring-type             object 
 17  cat__spore-print-color     object 
 18  cat__habitat               object 
 19  cat__season                object 
dtypes:

In [12]:
X = convert_to_categoricals(X)
test = convert_to_categoricals(test)

# Each frame builds its categories from the values it happens to contain, so the
# integer code behind a given letter can differ between train and test.
# Re-casting the test columns to the training dtype makes the codes identical;
# a test value never seen in training becomes NaN (treated as missing).
for col in X.select_dtypes('category').columns:
    test[col] = test[col].astype(X[col].dtype)


In [13]:
# Check the dtypes after the categorical conversion.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 20 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   num__cap-diameter          float64 
 1   num__stem-height           float64 
 2   num__stem-width            float64 
 3   cat__cap-shape             category
 4   cat__cap-surface           category
 5   cat__cap-color             category
 6   cat__does-bruise-or-bleed  category
 7   cat__gill-attachment       category
 8   cat__gill-spacing          category
 9   cat__gill-color            category
 10  cat__stem-root             category
 11  cat__stem-surface          category
 12  cat__stem-color            category
 13  cat__veil-type             category
 14  cat__veil-color            category
 15  cat__has-ring              category
 16  cat__ring-type             category
 17  cat__spore-print-color     category
 18  cat__habitat               category
 19  cat__season          

In [14]:
def model_report(estimator, X, y, cv = 5):
    """Fit ``estimator`` on one stratified split and print its F1 and MCC scores.

    Parameters
    ----------
    estimator
        Classifier exposing ``fit`` and ``predict``.  It is fitted in place.
    X, y
        Features and labels, split with ``train_test_split``.
    cv : int, default 5
        Controls the split: a fraction ``1/cv`` of the rows is used for
        fitting and the remainder for scoring.

    Returns
    -------
    None
        The scores are only printed.
    """
    print('=' * 80)
    print(f"    Model: {estimator.__class__.__name__}")
    print('='*80)

    # NOTE(review): `train_size = 1/cv` fits on the SMALLER share (20% for
    # cv=5) and scores on the rest. If a "one fold of k-fold" split was
    # intended this should be `test_size = 1/cv` -- confirm before changing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size = 1/cv, shuffle = True, stratify = y, random_state = 42
    )

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Both metrics are single scalars: calling .mean() on them added nothing
    # and fails if the metric comes back as a plain Python float.
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print(f"F1 Score : {f1}")
    print(f"MCC Score : {mcc}")

In [15]:
def model_trainer(model, X, y, n_splits = 5, random_state = 42, test_data = None):
    """Cross-validate ``model`` and collect per-fold probabilities for a held-out frame.

    For every stratified fold the model is refitted on the training part,
    scored with the Matthews correlation coefficient on the validation part,
    and then asked for class probabilities on ``test_data``.

    Parameters
    ----------
    model
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X : pandas.DataFrame
        Training features (indexed positionally).
    y : array-like
        Training labels, aligned with ``X`` by position.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling.
    test_data : pandas.DataFrame, optional
        Frame to predict on after each fold.  When omitted, the notebook-level
        ``test`` frame is used, which is what earlier calls relied on.

    Returns
    -------
    (list, list)
        Per-fold ``predict_proba`` outputs for ``test_data`` and per-fold MCC
        scores on the validation parts.
    """
    if test_data is None:
        # Backward compatibility with calls that depended on the global frame.
        test_data = test

    # Positional indexing below must not depend on y's index if it is a Series.
    y = np.asarray(y)

    skfold = StratifiedKFold(n_splits=n_splits, shuffle = True, random_state = random_state)
    fold_probs, fold_mccs = [], []
    print('='*80)
    print(f"Training {model.__class__.__name__}")
    print('='*80)

    for fold, (train_idx, valid_idx) in enumerate(skfold.split(X, y)):
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)

        # Conventional (y_true, y_pred) order; MCC is symmetric, so the value
        # is the same as before.
        mcc = matthews_corrcoef(y_valid, y_pred)
        fold_mccs.append(mcc)
        fold_probs.append(model.predict_proba(test_data))

        print(f"--- Fold {fold + 1} MCC Score: {mcc:.6f}")
    print(f"\n---> Mean MCC Score: {np.mean(fold_mccs):.6f} \xb1 {np.std(fold_mccs):.6f}\n\n")
    return fold_probs, fold_mccs



In [16]:
# Quick XGBoost baseline: histogram tree method on the GPU with XGBoost's own
# categorical handling enabled, scored by model_report on a single stratified split.
xgb_clf = XGBClassifier(
        device = 'cuda',
        colsample_bytree = 0.6,
        max_depth = 14,
        min_child_weight = 7,
        random_state = 42,
        n_estimators = 200,
        enable_categorical = True,
        tree_method = 'hist'
)

model_report(xgb_clf, X, y)

    Model: XGBClassifier


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1 Score : 0.9923326886135493
MCC Score : 0.9830889501318758


In [118]:
# Quick LightGBM baseline on the GPU; apart from device and verbosity the
# library defaults are used. Scored the same way as the XGBoost baseline.
lgb_clf = LGBMClassifier(device = 'gpu', verbosity = -1)

model_report(lgb_clf, X, y)

    Model: LGBMClassifier
F1 Score : 0.991324014274319
MCC Score : 0.9808721304083006


In [119]:
# Sanity check: print the train / validation sizes produced by the same
# 5-fold stratified split (same seed) that model_trainer uses by default.
skfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
    # print(test_idx)
    X_train, y_train = X.iloc[train_idx, :], y[train_idx]
    X_test, y_test = X.iloc[test_idx, :], y[test_idx]

    # Only the sizes are inspected here; the slices themselves are not used further.
    print(len(X_train))
    print(len(X_test))

2493556
623389
2493556
623389
2493556
623389
2493556
623389
2493556
623389


In [121]:
# Hyper-parameters for the final XGBoost model (GPU, histogram method, native
# categorical support). No 'random_state' is set in this dict.
xgb_params = {
    'n_estimators': 2407,
    'eta': 0.009462133032592785,
    'gamma': 0.2865859948765318,
    'max_depth': 31,
    'min_child_weight': 47,
    'subsample': 0.6956431754146083,
    'colsample_bytree': 0.3670732604094118,
    'grow_policy': 'lossguide',
    'max_leaves': 73,
    'enable_categorical': True,
    'n_jobs': -1,
    'device': 'cuda',
    'tree_method': 'hist'
} # 0.9844272567086021 -- presumably the score reached with these values; TODO confirm metric/run

# Hyper-parameters for a LightGBM model; not passed to any model in the cells shown here.
lgb_params = {
    'n_estimators': 2500,
    'random_state':42,
    'max_bin':1024,
    'colsample_bytree':0.6,
    'reg_lambda': 80,
    # 'device': 'gpu',
    'verbosity': -1
}

In [122]:
# Per-model store of the probability arrays returned by model_trainer.
# NOTE: despite the "oof" name these are the per-fold predict_proba outputs for
# the `test` frame, not out-of-fold predictions on the training rows.
oof_probs = {}

# The second return value (per-fold MCC scores) is discarded; the scores are printed.
oof_probs['xgb'], _ = model_trainer(XGBClassifier(**xgb_params), X, y, random_state = 42)

Training XGBClassifier
--- Fold 1 MCC Score: 0.984597
--- Fold 2 MCC Score: 0.984536
--- Fold 3 MCC Score: 0.984572
--- Fold 4 MCC Score: 0.984398
--- Fold 5 MCC Score: 0.984652

---> Mean MCC Score: 0.984551 ± 0.000085




In [127]:
# Final class per test row: average the per-fold probability arrays (axis 0 runs
# over the folds), then take the most probable class in each row (axis 1).
oof_preds = {}

oof_preds['xgb'] = np.argmax(np.mean(oof_probs['xgb'], axis = 0), axis = 1)

In [128]:
# Predictions used for the submission: the fold-averaged XGBoost classes.
preds = oof_preds['xgb']

In [129]:
# Map the integer predictions back to the original class labels and write the
# submission file.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test.csv -- confirm.
sub = pd.read_csv('sample_submission.csv')
sub['class'] = lbl_enc.inverse_transform(preds)
sub.to_csv('version11_submission.csv', index = False)
