In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = "kr-vs-kp.csv"
df = pd.read_csv(DATA_PATH)

### Feature Encoding (X / y Split)

In [3]:
def create_features(df):
    """Integer-encode the categorical columns and separate features from target.

    A fresh ``LabelEncoder`` is fitted per column on the frame passed in, so
    the resulting integer codes are only guaranteed to be comparable within
    that one frame. The input frame is not modified; a copy is encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``feature_columns`` below plus
        the ``"class"`` column.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` with the listed columns integer-encoded and ``"class"`` dropped.
    y : pandas.Series
        The integer-encoded ``"class"`` column.
    """
    feature_columns = [
        'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq',
        'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp',
        'dwipd', 'hdchk', 'katri', 'mulch', 'qxmsq', 'r2ar8',
        'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl',
        'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk',
        'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg',
    ]
    target_column = "class"

    # Work on a copy so the caller's frame keeps its original string values.
    encoded = df.copy()

    # The target is encoded the same way as the features, after them.
    for column in (*feature_columns, target_column):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded.drop(columns=target_column), encoded[target_column]

### Create Model

In [7]:
# Encode the full frame once, before splitting. create_features fits a fresh
# LabelEncoder per column on whatever frame it receives, so encoding the train
# and test folds separately could assign different integers to the same
# category whenever a fold happens to be missing one of a column's values.
# Encoding up front gives every fold one shared mapping. This is only a
# relabelling of category values, so no target statistics leak into training.
X, y = create_features(df=df)

# 3-fold cross-validation; shuffling with a fixed seed keeps folds reproducible.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
scores = []

for train_index, test_index in kf.split(X):
    # Slice the already-encoded features/target by positional fold indices.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Base learner: a depth-limited decision tree with a fixed seed.
    estimator = DecisionTreeClassifier(
        random_state=0,
        max_depth=15
    )

    # A new ensemble is built for every fold so no fitted state carries over.
    model = BaggingClassifier(
        n_estimators=100,
        estimator=estimator,
        random_state=0
    )

    model.fit(X_train, y_train)

    # For classifiers, score() is the mean accuracy on the held-out fold.
    score = model.score(X_test, y_test)
    scores.append(score)

### Test: Mean Cross-Validated Accuracy

In [8]:
# Unweighted mean of the per-fold accuracies collected in `scores`.
np.mean(scores)

0.9953069259836694