In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s4e8/sample_submission.csv
/kaggle/input/playground-series-s4e8/train.csv
/kaggle/input/playground-series-s4e8/test.csv


In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")

In [3]:
test = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv")

# ***PREPROCESSING***

In [4]:
features_to_drop = ['id']
train = train.drop(features_to_drop, axis=1)

In [5]:
test_sub = test.drop(features_to_drop, axis=1)

In [6]:
# Binarize the class labels
train['class'] = train['class'].apply(lambda x: 0 if x == 'e' else 1)

In [7]:
X = train.drop('class', axis=1)
y = train['class']

In [8]:
cat = X.select_dtypes(include='object').columns.tolist()
threshold = len(X) * 0.0005

for col in cat: 
    value_counts = X[col].value_counts()
    
    labels = set(value_counts[value_counts > threshold].index)
    
    def preprocessor(x):
        if x not in labels: 
            return 'other'
        if x is None: 
            return 'missing'
        
        return x 
    
    X[col] = X[col].apply(preprocessor)
    test_sub[col] = test_sub[col].apply(preprocessor)
    

In [9]:
X = pd.get_dummies(X, columns=cat, drop_first=True)
test_sub = pd.get_dummies(test_sub, columns=cat, drop_first=True)

# ***MODEL PERFORMANCE***

In [10]:
X = X.to_numpy()
y = y.to_numpy()
test_sub = test_sub.to_numpy()

In [11]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

model = XGBClassifier() 

folds = 3
skfolds = StratifiedKFold(n_splits=folds)

total_mcc = []

for train_index, test_index in skfolds.split(X, y):
    X_train_folds, y_train_folds = X[train_index], y[train_index]
    X_test_folds, y_test_folds = X[test_index], y[test_index]
    
    model.fit(X_train_folds, y_train_folds)
    y_pred_folds = model.predict(X_test_folds)
    
    mcc = matthews_corrcoef(y_test_folds, y_pred_folds)
    total_mcc.append(mcc)
    
mcc_cv = sum(total_mcc) / len(total_mcc)
mcc_cv

0.9820015260630216

# ***SUBMISSION***

In [12]:
model.fit(X, y)

In [13]:
pred_class = model.predict(test_sub)

test['class'] = pred_class
test['class'] = test['class'].apply(lambda x: 'e' if x == 0 else 'p')

submission = test[['id', 'class']]

In [14]:
submission.to_csv('/kaggle/working/VER4.csv', index=False)