In [91]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [92]:
# Load the raw dataset; the path is relative, so the CSV is expected in the
# notebook's working directory.
DATA_PATH = "kr-vs-kp.csv"
df = pd.read_csv(DATA_PATH)

### Feature Encoding (X / y Separation)

In [93]:
def create_features(df):
    """Label-encode the dataset's columns and separate features from target.

    Operates on a copy, so the caller's frame is not modified. Every column
    named below is replaced by the integer codes of a freshly fitted
    ``LabelEncoder``. Because each encoder is fitted on the frame passed in,
    the resulting codes depend on the values present in that frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 36 attribute columns listed below and "class".
        Any other columns are passed through to ``X`` unencoded.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the encoded frame without the "class"
        column and ``y`` is the encoded "class" column.
    """
    encoded = df.copy()

    target_column = "class"
    attribute_columns = (
        'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq',
        'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp',
        'dwipd', 'hdchk', 'katri', 'mulch', 'qxmsq', 'r2ar8',
        'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl',
        'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk',
        'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg',
    )

    # Attributes first, target last: one independent encoder per column.
    for column in (*attribute_columns, target_column):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded.drop(target_column, axis=1), encoded[target_column]

### Create Model (K-Fold Cross-Validation)

In [105]:
# Encode the full dataset ONCE, before splitting. Fitting separate encoders on
# each train/test fold lets the same category string map to different integer
# codes whenever a value is absent from one of the folds, which silently
# corrupts the evaluation. A single encoding pass gives every fold one mapping.
X, y = create_features(df=df)

kf = KFold(n_splits=3, shuffle=True, random_state=0)
scores = []

for train_index, test_index in kf.split(X):
    # Positional indices from KFold -> slice the already-encoded data.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A fresh, seeded tree per fold so folds do not share fitted state.
    model = DecisionTreeClassifier(
        random_state=0,
        max_depth=15
    )
    model.fit(X_train, y_train)

    # Score on the held-out fold; `model` keeps the last fold's estimator
    # after the loop ends.
    score = model.score(X_test, y_test)
    scores.append(score)

### Evaluate

In [106]:
# Unweighted mean of the per-fold scores collected in `scores`.
np.mean(scores)

0.9943679588475192

In [97]:
# `model` here is whichever estimator the cross-validation loop fitted last,
# so these importances describe a single fold's tree only.
# NOTE(review): this cell's recorded execution count (97) is lower than the
# training cell's (105); re-run it after training so the output is not stale.
importances = model.feature_importances_
print(importances)
# Positional index of the most important feature column.
print(np.argmax(importances))

[3.64819588e-02 7.20808397e-04 4.73820870e-03 6.96184952e-03
 0.00000000e+00 3.82933985e-02 7.20518354e-03 0.00000000e+00
 0.00000000e+00 2.55207011e-01 2.65343179e-03 8.29683934e-05
 1.09774102e-03 7.88926666e-03 2.66594383e-02 8.29981250e-03
 7.29802804e-03 5.72009897e-03 0.00000000e+00 6.47728540e-03
 1.99599800e-01 3.32631586e-03 5.63343133e-03 2.38808861e-03
 2.09935333e-03 2.65343179e-04 4.27133814e-03 1.35822540e-03
 0.00000000e+00 5.44850236e-03 6.63357947e-04 1.01059269e-01
 2.35739347e-01 1.50888216e-03 2.07306403e-02 1.21615624e-04]
9
