In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')

import os

# Show every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/playground-series-s4e8/sample_submission.csv
/kaggle/input/playground-series-s4e8/train.csv
/kaggle/input/playground-series-s4e8/test.csv


In [2]:
# Read the competition's training and test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e8/train.csv",
        "/kaggle/input/playground-series-s4e8/test.csv",
    )
)

# Keep the test ids aside; they are needed again when the submission is built.
test_id = test['id']

# ***PREPROCESSING***

In [3]:
def preprocess(train, test):
    """Give the text columns of train and test one shared categorical dtype.

    The ``class`` column is set aside, the two frames are stacked, and every
    object-dtype column of the stacked frame has its missing values replaced
    by the literal ``'missing'`` before being cast to ``category``. The
    stacked frame is then separated again and ``class`` is re-attached to the
    training part.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain a ``class`` column.
    test : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_train, new_test)``. ``new_train`` carries ``class`` as its
        last column; neither input frame is modified.
    """
    labels = train['class']
    features = train.drop(columns='class')

    # Outer keys 'train' / 'test' let the two parts be pulled apart afterwards.
    stacked = pd.concat([features, test], keys=['train', 'test'])

    for name in stacked.select_dtypes(include='object').columns:
        filled = stacked[name].fillna('missing')
        # Casting on the stacked frame gives both parts the same category set.
        stacked[name] = filled.astype('category')

    train_part, test_part = (
        stacked.loc[key].copy() for key in ('train', 'test')
    )
    train_part['class'] = labels

    return train_part, test_part

# Rebind both frames to their preprocessed versions (text columns filled and cast to category).
train, test = preprocess(train, test)

In [4]:
# Feature matrix: everything except the target and the row identifier.
X = train.drop(columns=['class', 'id'])
# Target labels, still in their original (un-encoded) form.
y = train['class']

In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Encode the target labels as integers; `le` is kept so predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(y)

# Columns that were cast to category dtype are the ones to one-hot encode.
cat_features = X.select_dtypes(include='category').columns

encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Every column not listed in cat_features is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[('encoder', encoder, cat_features)],
    remainder='passthrough',
)

In [6]:
# Fit the encoder on the training features and encode them.
X = transformer.fit_transform(X)

# The training features had 'id' removed before fitting, so remove it from
# the test frame as well. Leaving it in only works when ColumnTransformer
# matches columns by name and ignores extras, which depends on the installed
# scikit-learn version; dropping it makes both inputs line up explicitly.
# The ids themselves were saved earlier in `test_id`.
test = transformer.transform(test.drop('id', axis=1))

# ***HYPERPARAMETER TUNING***

### ***XGBOOST***

In [7]:
from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Hyperparameter ranges explored by the Bayesian search. The learning rate,
# gamma and both regularisation terms are sampled on a log-uniform scale.
search_spaces = {
    'max_depth': Integer(1, 14),
    'colsample_bytree': Real(0.1, 1),
    'colsample_bylevel': Real(0.1, 1),
    'learning_rate': Real(0.01, 0.3, 'log-uniform'),
    'n_estimators': Integer(50, 1500),
    'gamma': Real(1e-05, 1, 'log-uniform'),
    'reg_lambda': Real(1e-05, 1e+03, 'log-uniform'),
    'reg_alpha': Real(1e-05, 1e+03, 'log-uniform'),
    'min_child_weight': Integer(1, 10),
    'subsample': Real(0.5, 1)
}

# The search space is an argument of BayesSearchCV only. `param_grid` is not
# an XGBClassifier parameter, so it is no longer passed to the estimator.
xgb = XGBClassifier(seed=42)

# random_state fixes the sequence of sampled candidates so a rerun of the
# search is reproducible.
bayes_search = BayesSearchCV(estimator=xgb, search_spaces=search_spaces,
                             scoring='matthews_corrcoef', cv=3, verbose=3,
                             return_train_score=True, random_state=42)

# The search itself is left disabled; uncomment to run it and inspect the
# best parameters.
# bayes_search.fit(X, y)
# bayes_search.best_params_

# ***MODEL PERFORMANCE***

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

# Number of cross-validation folds.
folds = 3 
# Stratified splitter built with that fold count; shuffle and random_state are left at their defaults.
skfolds = StratifiedKFold(n_splits=folds)

def CV(model, X, y):
    """Cross-validate ``model`` and return its mean Matthews correlation.

    The model is refit on the training part of every split produced by the
    module-level ``skfolds`` splitter and scored on the held-out part. Each
    fold's score is printed as it is computed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like
        Feature matrix; must support row selection with an integer index
        array via ``X[indices]``.
    y : array-like
        Target vector supporting the same kind of indexing.

    Returns
    -------
    float
        Mean of the per-fold Matthews correlation coefficients.
    """
    fold_scores = []

    for fold_number, (train_index, test_index) in enumerate(skfolds.split(X, y), start=1):
        X_train_folds, y_train_folds = X[train_index], y[train_index]
        X_test_fold, y_test_fold = X[test_index], y[test_index]

        model.fit(X_train_folds, y_train_folds)
        y_pred_fold = model.predict(X_test_fold)

        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        fold_scores.append(mcc)

        print(f"Fold: {fold_number}, mcc: {mcc}")

    # Average over the folds that were actually evaluated rather than over
    # the separate `folds` constant, so the mean stays correct even if the
    # splitter is configured with a different number of splits.
    return sum(fold_scores) / len(fold_scores)

In [9]:
# Fixed hyperparameters used for the final model.
params = dict(
    colsample_bytree=0.3316064481783387,
    gamma=0.00015261120814033454,
    learning_rate=0.031261807252523,
    max_depth=12,
    n_estimators=515,
    reg_lambda=0.029800874667296647,
)

# Seeded classifier; the parameters above are applied after construction.
xgboost = XGBClassifier(seed=42)
xgboost.set_params(**params)

In [10]:
# mcc_cv = CV(xgboost, X, y)
# mcc_cv

# ***SUBMISSION***

In [11]:
# Fit the configured classifier on the full encoded training set.
xgboost.fit(X, y)

In [12]:
# Predict encoded classes for the test set, then map them back to the
# original labels and pair them with the saved test ids.
y_pred = xgboost.predict(test)
submission = pd.DataFrame({
    'id': test_id,
    'class': le.inverse_transform(y_pred),
})

In [13]:
# Write the submission file to the Kaggle working directory, omitting the DataFrame index.
submission.to_csv('/kaggle/working/VER10.csv', index=False)