In [15]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv("kr-vs-kp.csv")

### Train / Test Split

In [3]:
def create_features(df):
    df = df.copy()

    categories = [
        'bkblk','bknwy','bkon8','bkona','bkspr','bkxbq',
        'bkxcr','bkxwp','blxwp','bxqsq','cntxt','dsopp',
        'dwipd','hdchk','katri','mulch','qxmsq','r2ar8',
        'reskd','reskr','rimmx','rkxwp','rxmsq','simpl',
        'skach','skewr','skrxp','spcop','stlmt','thrsk',
        'wkcti','wkna8','wknck','wkovl','wkpos','wtoeg', 
        "class"
    ]

    for category in categories:
        le = LabelEncoder()
        df[category] = le.fit_transform(df[category])
    
    X = df.drop("class", axis=1)
    y = df["class"]

    return X, y

### Create Model

In [51]:
kf = KFold(n_splits=10, shuffle=True, random_state=0)
scores = []

for train_index, test_index in kf.split(df):
    train = df.iloc[train_index]
    test = df.iloc[test_index]

    X_train, y_train = create_features(df=train)
    X_test, y_test = create_features(df=test)

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        max_features="log2",
        random_state=0,
        oob_score=True,
        verbose=1
    )

    model.fit(X_train, y_train)
    
    score = model.score(X_test, y_test)
    print(score)
    scores.append(score)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.98125


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.99375
0.996875


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.996875


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.9625


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.9875
0.9843260188087775


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.9905956112852664


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


0.9937304075235109
0.9905956112852664


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


### Test

In [52]:
np.average(scores)

0.9877997648902822

### Size

In [36]:
import pickle
import sys

p = pickle.dumps(model)
print(f"{sys.getsizeof(p)} bytes")

276057 bytes
