In [6]:
import sys
from pathlib import Path

# Append the fourth ancestor directory of sys.argv[0] to the import path so the
# project-local ``src`` package imported below can be resolved.
# NOTE(review): inside a notebook kernel sys.argv[0] is usually the kernel
# launcher script rather than this notebook, so the directory this resolves to
# depends on where the environment is installed — confirm it is the repo root.
sys.path.append(str(Path(sys.argv[0]).absolute().parent.parent.parent.parent))
# add the entire folder to path
from src.utils import utils


In [45]:
import pandas as pd

# Read the hiscore snapshot from the data folder next to this notebook's
# directory, then display its (rows, columns) shape as the cell output.
df = pd.read_parquet(path="../data/2022-10-26_hiscore_data.parquet.gzip")
df.shape


(223558, 98)

# optimizing dataframe

In [46]:
import numpy as np

# Shrink the numeric hiscore columns to save memory.
#
# A fixed ``astype("int16")`` / ``astype("int32")`` can wrap around silently
# when a value lies outside the target range, which would corrupt the features
# without any error. ``pd.to_numeric(..., downcast="integer")`` instead picks,
# per column, the smallest integer dtype that can hold every value and leaves
# the column untouched when no narrower dtype fits.
df["total"] = pd.to_numeric(df["total"], downcast="integer")
for column_group in (utils.SKILLS, utils.MINIGAMES, utils.BOSSES):
    df[column_group] = df[column_group].apply(pd.to_numeric, downcast="integer")


In [47]:
# Show how many rows carry each label, most frequent first.
df["label"].value_counts().to_frame()


Unnamed: 0,label
Real_Player,66931
Unknown_bot,60888
PVM_Melee_bot,18978
Magic_bot,10628
Smithing_bot,10542
Fishing_bot,6732
Crafting_bot,6075
Zulrah_bot,4922
Mining_bot,4872
Fletching_bot,4679


In [48]:
# A label is kept only when it occurs more than this many times.
MIN_LABEL_COUNT = 200

# Filter the value_counts() Series directly rather than wrapping it in a
# DataFrame and querying a column called "label": from pandas 2.0 on, the
# counts column is named "count" and "label" becomes the (string) index name,
# so the old query no longer compares counts against the threshold.
label_counts = df["label"].value_counts()
common_labels = label_counts[label_counts > MIN_LABEL_COUNT].index.to_list()

# Drop the rows of rare labels; copy so later assignments do not touch a view.
mask = df["label"].isin(common_labels)
df = df[mask].copy()
df.shape


(223170, 98)

In [49]:
# Feature matrix (the hiscore columns) and target vector (the label column).
x, y = df[utils.HISCORE_COLUMNS], df["label"]


# model selection

# model tuning

In [76]:

# NOTE(review): cross_validate and Pipeline are not referenced in the cells
# visible here — confirm whether later cells use them before removing.
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
import pandas as pd

# Seed passed to both the CV splitter and the estimator so runs are repeatable.
RANDOM_STATE = 42


In [77]:
from sklearn.model_selection import GridSearchCV

def test_model(model, cv, param: dict, x, y):
    """Grid-search ``model`` over ``param`` and return the fitted search.

    Args:
        model: estimator instance to tune.
        cv: cross-validation splitter (or any value GridSearchCV accepts as ``cv``).
        param: hyper-parameter grid, mapping parameter name to candidate values.
        x: feature matrix.
        y: target labels.

    Returns:
        Whatever ``GridSearchCV.fit`` returns for the configured search.
    """
    # Candidates are ranked by macro-averaged F1; all CPU cores are used and
    # progress is logged for every fit.
    search_options = {
        "param_grid": param,
        "cv": cv,
        "scoring": "f1_macro",
        "n_jobs": -1,
        "verbose": 2,
    }
    searcher = GridSearchCV(model, **search_options)
    return searcher.fit(x, y)


In [84]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# Four stratified random splits, each holding out 30% of the rows for scoring.
# (The former bare ``cv.get_n_splits(x, y)`` call was removed: its return value
# was discarded and it does not alter the splitter.)
cv = StratifiedShuffleSplit(
    n_splits=4,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# Candidate estimators, each paired with the hyper-parameter grid to search.
# NOTE(review): the recorded grid results in this notebook are identical for
# all three max_depth values, so this range may be too high to influence the
# fitted trees — consider smaller depths if the search is rerun.
models = [
    {
        "model": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "param": {
            "max_depth": (100, 250, 500),
            "min_samples_leaf": (1, 50, 100),
        },
    }
]

In [87]:
# Run the grid search for every configured model, save the best candidates to
# disk and print a short summary of the top-ranked one.
for m in models:
    # Index directly so a missing key fails here instead of passing None on.
    model = m["model"]
    param = m["param"]
    model_name = model.__class__.__name__

    grid_search = test_model(model, cv, param, x, y)
    # sort_values returns a new frame, so the result has to be assigned; a
    # stable sort keeps tied candidates in their original order.
    grid_result = pd.DataFrame(grid_search.cv_results_).sort_values(
        by="rank_test_score", kind="mergesort"
    )

    # Several candidates can tie for rank 1: save all of them, report the first.
    mask = grid_result["rank_test_score"] == 1
    results_dir = Path("./test_results")
    results_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create it
    grid_result[mask].to_csv(results_dir / f"{model_name}_grid.csv")
    grid_best = grid_result[mask].to_dict(orient="records")[0]

    print(model_name)
    print(
        f"The mean fit time is: {grid_best['mean_fit_time']:.2f} seconds"
    )
    # The search is scored with f1_macro: higher is better, the value is a
    # fraction in [0, 1], and it is neither an error nor a percentage.
    print(
        f"The mean cross-validated f1_macro test score is: {grid_best['mean_test_score']:.4f}"
    )
    print(
        f"The standard deviation of the f1_macro test score is: {grid_best['std_test_score']:.4f}"
    )

Fitting 4 folds for each of 9 candidates, totalling 36 fits
DecisionTreeClassifier
The mean fit time is: 57.35 seconds
The mean cross-validated testing error is: 0.85 %
The standard deviation of the testing error is: 0.00


In [88]:
grid_result.sort_values(by="rank_test_score")


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,57.350133,3.850912,2.112504,0.508674,100,1,"{'max_depth': 100, 'min_samples_leaf': 1}",0.852631,0.85328,0.852694,0.853835,0.85311,0.000489,1
3,61.974783,3.043253,1.848095,0.439878,250,1,"{'max_depth': 250, 'min_samples_leaf': 1}",0.852631,0.85328,0.852694,0.853835,0.85311,0.000489,1
6,51.14392,1.428812,1.280467,0.254663,500,1,"{'max_depth': 500, 'min_samples_leaf': 1}",0.852631,0.85328,0.852694,0.853835,0.85311,0.000489,1
1,45.716262,3.52247,1.573004,0.508193,100,50,"{'max_depth': 100, 'min_samples_leaf': 50}",0.807001,0.816691,0.817353,0.815321,0.814091,0.004159,4
4,41.501022,2.652856,1.185925,0.148296,250,50,"{'max_depth': 250, 'min_samples_leaf': 50}",0.807001,0.816691,0.817353,0.815321,0.814091,0.004159,4
7,37.627114,2.856728,1.238924,0.13199,500,50,"{'max_depth': 500, 'min_samples_leaf': 50}",0.807001,0.816691,0.817353,0.815321,0.814091,0.004159,4
2,38.924428,2.919479,1.407775,0.332851,100,100,"{'max_depth': 100, 'min_samples_leaf': 100}",0.771456,0.78029,0.794478,0.787903,0.783532,0.008592,7
5,33.146182,3.399866,1.48344,0.505595,250,100,"{'max_depth': 250, 'min_samples_leaf': 100}",0.771456,0.78029,0.794478,0.787903,0.783532,0.008592,7
8,28.305685,0.704578,0.819779,0.088833,500,100,"{'max_depth': 500, 'min_samples_leaf': 100}",0.771456,0.78029,0.794478,0.787903,0.783532,0.008592,7


# model evaluation