In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Load the raw dataset; the relative path is resolved against the notebook's
# working directory. create_features() below expects the attribute columns it
# lists plus a "class" column to be present in this frame.
df = pd.read_csv(filepath_or_buffer="kr-vs-kp.csv")

### Feature Encoding and Feature / Target Split

In [3]:
def create_features(df):
    """Label-encode the categorical columns and separate features from target.

    Every column named below, plus the ``"class"`` target, is replaced by
    integer codes produced by a fresh ``LabelEncoder`` fitted on that column
    of ``df``. Because each encoder is fitted on the frame passed in, the
    codes depend on which values are present in that frame. Columns of
    ``df`` that are not listed are carried through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all listed attribute columns and a ``"class"``
        column. It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        The encoded frame without the ``"class"`` column.
    y : pandas.Series
        The encoded ``"class"`` column.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    feature_columns = (
        'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq',
        'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp',
        'dwipd', 'hdchk', 'katri', 'mulch', 'qxmsq', 'r2ar8',
        'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl',
        'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk',
        'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg',
    )
    target_column = "class"

    # assign() builds a new frame, so the caller's data is left untouched;
    # each column gets its own independently fitted encoder.
    encoded = df.assign(**{
        name: LabelEncoder().fit_transform(df[name])
        for name in (*feature_columns, target_column)
    })

    y = encoded[target_column]
    X = encoded.drop(columns=target_column)
    return X, y

### Create Model (5-Fold Cross-Validation)

In [4]:
# Encode the whole frame ONCE, before splitting. create_features() fits a
# fresh LabelEncoder per column on whatever frame it is handed, so calling it
# separately on each train and test fold lets the integer codes drift apart
# whenever a category value is absent from one of the two frames (for
# instance, a column with values {b, n, w} in train but only {n, w} in test
# would code "n" as 1 in train and 0 in test). The encoding is a pure
# relabelling of category names, so fitting it on the full frame does not
# leak anything into the held-out evaluation. It also avoids re-encoding the
# same data ten times.
X, y = create_features(df=df)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []  # held-out accuracy of each fold, in fold order

for train_index, test_index in kf.split(X):
    # KFold yields positional row indices, hence .iloc.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A new tree per fold; after the loop `model` refers to the tree trained
    # on the final fold only.
    model = DecisionTreeClassifier(
        random_state=0,
        max_depth=15
    )

    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    scores.append(score)

### Evaluate

In [5]:
# Unweighted mean of the per-fold scores collected in the loop above.
np.mean(scores)

0.995619131455399

In [6]:
# `model` is the tree left over from the last cross-validation iteration, so
# these importances describe that single fold's fit, not an average over folds.
# The argmax is a position in X's column order (the frame's columns with
# "class" dropped), not a column name.
print(
    model.feature_importances_,
    np.argmax(model.feature_importances_),
    sep="\n",
)

[3.44682784e-02 4.57322059e-04 5.29878701e-03 7.26332639e-03
 0.00000000e+00 3.78396223e-02 7.29310723e-03 0.00000000e+00
 0.00000000e+00 2.49974001e-01 1.77385526e-03 1.07742372e-04
 1.25596904e-03 8.57429650e-03 2.77641109e-02 9.64296226e-03
 6.82302513e-03 7.48069705e-03 8.36246050e-04 6.00156393e-03
 1.99753973e-01 3.64852543e-03 5.12772039e-03 1.56796134e-03
 2.77621391e-03 1.17597101e-03 4.08364932e-03 1.52184483e-03
 0.00000000e+00 3.96460263e-03 9.28558926e-04 1.01540864e-01
 2.37584161e-01 7.37655549e-04 2.24533925e-02 2.79993097e-04]
9
