In [108]:
import sys
from pathlib import Path

sys.path.append(str(Path(sys.argv[0]).absolute().parent.parent.parent.parent))
# add the entire folder to path
from src.utils import utils


In [109]:
import pandas as pd

df = pd.read_parquet("../data/2022-10-26_hiscore_data.parquet.gzip")
df.shape

(223558, 98)

# optimizing dataframe

In [110]:
import numpy as np

df["total"] = df["total"].astype("int32")
df[utils.SKILLS] = df[utils.SKILLS].astype("int32")
df[utils.MINIGAMES] = df[utils.MINIGAMES].astype("int16")
df[utils.BOSSES] = df[utils.BOSSES].astype("int16")


In [111]:
pd.DataFrame(df.label.value_counts())


Unnamed: 0,label
Real_Player,66931
Unknown_bot,60888
PVM_Melee_bot,18978
Magic_bot,10628
Smithing_bot,10542
Fishing_bot,6732
Crafting_bot,6075
Zulrah_bot,4922
Mining_bot,4872
Fletching_bot,4679


In [112]:
common_labels = (
    pd.DataFrame(df.label.value_counts()).query("label > 200").index.to_list()
)
mask = df.label.isin(common_labels)
df = df[mask].copy()
df.shape


(223170, 98)

In [113]:
hiscore_ratio = utils.get_ratio(df, COLUMNS=utils.SKILLS, total_column='total')
df[hiscore_ratio.columns] = hiscore_ratio

boss_ratio = utils.get_ratio(df, COLUMNS=utils.BOSSES, total_column='boss_total')
df[boss_ratio.columns] = boss_ratio

minigame_ratio = utils.get_ratio(df, COLUMNS=utils.MINIGAMES, total_column='minigame_total')
df[minigame_ratio.columns] = minigame_ratio

df.fillna(0, inplace=True)

model_columns = utils.HISCORE_COLUMNS + hiscore_ratio.columns.to_list() + boss_ratio.columns.to_list() + minigame_ratio.columns.to_list()

df.head()

Unnamed: 0,name,created_at,updated_at,possible_ban,confirmed_ban,label_id,label,account_status,id,timestamp,...,cs_all_ratio,cs_beginner_ratio,cs_easy_ratio,cs_medium_ratio,cs_hard_ratio,cs_elite_ratio,cs_master_ratio,lms_rank_ratio,soul_wars_zeal_ratio,minigame_total
0,3BA604236FB0319D5937E31388B0C64C,2021-03-14 20:22:45,2022-10-26 01:01:21,0,0,1,Real_Player,not banned,59568395,2022-10-26 01:04:13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,5A02B5A7F38AD2623A9C5E68DF01EC2F,2021-03-14 20:42:37,2022-10-26 08:11:05,0,0,1,Real_Player,not banned,59622273,2022-10-26 08:16:02,...,0.077682,0.0,0.002466,0.07275,0.001233,0.001233,0.0,0.32984,0.514797,1622
2,E666957B20A95519E6306D75FEC4DE19,2021-03-14 22:17:40,2022-10-26 04:14:28,1,0,1,Real_Player,not banned,59615490,2022-07-27 06:41:21,...,0.5,0.02322,0.065015,0.083591,0.094427,0.196594,0.037152,0.0,0.0,646
3,02726CE8822537806154B747927223D0,2021-03-14 22:18:40,2022-10-26 01:34:01,0,0,1,Real_Player,not banned,58905027,2022-10-26 01:34:05,...,0.5,0.117647,0.088235,0.117647,0.147059,0.029412,0.0,0.0,0.0,34
4,A5ECC15B4DFBFCAEF1522D4D78150146,2021-03-14 22:19:32,2022-10-26 15:53:38,0,0,1,Real_Player,not banned,59215937,2022-10-26 15:53:45,...,0.5,0.0,0.0,0.0,0.425,0.0,0.075,0.0,0.0,40


In [114]:
# TODO: fix bugged min total
df.describe()

Unnamed: 0,possible_ban,confirmed_ban,label_id,id,Player_id,total,attack,defence,strength,hitpoints,...,cs_all_ratio,cs_beginner_ratio,cs_easy_ratio,cs_medium_ratio,cs_hard_ratio,cs_elite_ratio,cs_master_ratio,lms_rank_ratio,soul_wars_zeal_ratio,minigame_total
count,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,...,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0,223170.0
mean,0.718753,0.700591,36.180239,126013500.0,8003295.0,56970300.0,3701433.0,3229084.0,5573647.0,7167439.0,...,0.109763,0.037154,0.026998,0.033353,0.030415,0.006735,0.004059,0.064733,0.037351,400.124013
std,0.449609,0.458,37.236914,130987500.0,13903550.0,130773600.0,9104107.0,7791447.0,14820200.0,17033020.0,...,0.35698,0.142911,0.178232,0.112109,0.103167,0.036872,0.02587,0.223635,0.604855,1894.178755
min,0.0,0.0,1.0,12773220.0,1.0,-2146505000.0,0.0,0.0,0.0,0.0,...,-29.020721,-0.41209,-0.596424,-0.421817,-0.308216,-0.085143,-0.104025,-1.584653,-273.883929,-60668.0
25%,0.0,0.0,1.0,58959410.0,779457.2,1641676.0,4472.0,934.5,4164.0,8033.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,17.0,59787960.0,4062978.0,8209836.0,131298.0,121449.0,186432.5,374554.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,89.0,157600800.0,5425230.0,42838580.0,2159245.0,2075609.0,4161669.0,5207320.0,...,0.063403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0
max,1.0,1.0,91.0,861219300.0,116068100.0,2122036000.0,200000000.0,200000000.0,200000000.0,200000000.0,...,137.357143,6.741071,68.883929,22.232143,24.571429,9.035714,5.892857,1.064021,4.630481,65168.0


In [115]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, stratify=df["label"])
df_train.shape

(178536, 185)

In [116]:
x = df_train[model_columns]
y = df_train["label"]


# model selection

In [117]:
from sklearn.ensemble import RandomForestClassifier
# lets test a model
model = RandomForestClassifier(
    random_state=42, verbose=1, n_jobs=-1, n_estimators=10
)
model.fit(x,y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.8s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.9s finished


In [118]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 1,
 'warm_start': False}

In [119]:
from sklearn.metrics import classification_report

x = df_test[model_columns]
y = df_test["label"]

print(
    classification_report(
        y_true=y, 
        y_pred=model.predict(x)
    )
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.1s finished


                      precision    recall  f1-score   support

         Agility_bot       0.98      0.99      0.99       684
         Barrows_bot       0.95      0.80      0.87       249
      Blast_mine_bot       0.80      0.57      0.67       239
         Cooking_bot       0.96      0.96      0.96       242
        Crafting_bot       0.99      1.00      0.99      1215
         Fishing_bot       0.99      0.99      0.99      1346
       Fletching_bot       0.99      0.99      0.99       936
        Herblore_bot       0.99      0.99      0.99       351
          Hunter_bot       0.99      1.00      0.99       864
             LMS_bot       0.93      0.82      0.87       407
           Magic_bot       0.97      0.99      0.98      2126
          Mining_bot       0.94      0.95      0.94       974
       PVM_Melee_bot       0.97      0.97      0.97      3796
PVM_Ranged_Magic_bot       0.85      0.61      0.71        56
      PVM_Ranged_bot       0.88      0.59      0.70        87
       

# model tuning

In [120]:
import pandas as pd

RANDOM_STATE = 42


In [121]:
from sklearn.model_selection import GridSearchCV

def test_model(model, cv, param: dict, x, y):
    model_grid_search = GridSearchCV(
        model, param_grid=param, cv=cv, scoring="f1_macro", n_jobs=-1, verbose=2
    )
    return model_grid_search.fit(x, y)


In [123]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=4, test_size=0.3, random_state=RANDOM_STATE)

cv.get_n_splits(x, y)

models = [
    {
        "model": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "param": {
            "max_depth": (100, 250, 500),
        },
    },
    {
        "model": RandomForestClassifier(
            random_state=RANDOM_STATE, verbose=1, n_jobs=-1
        ),
        "param": {"n_estimators": (10, 100, 200)},
    },
    {
        "model": AdaBoostClassifier(
            random_state=RANDOM_STATE,
            base_estimator=DecisionTreeClassifier(
                max_depth=3
            ),
        ),
        "param": {
            "n_estimators": (10, 20, 50),
        },
    },
    {
        "model": GradientBoostingClassifier(
            random_state=RANDOM_STATE, 
            verbose=1, 
        ),
        "param": {"n_estimators": (100, 250, 500)},
    },
]


In [124]:
results = None
searches = dict()
for m in models:
    model = m.get("model")
    param = m.get("param")

    print(model.__class__.__name__)

    grid_search = test_model(model, cv, param, x, y)

    grid_result = pd.DataFrame(grid_search.cv_results_)
    grid_result.sort_values(by="rank_test_score")
    grid_result['model'] = model.__class__.__name__
    results = pd.concat([results, grid_result])
 
    mask = grid_result["rank_test_score"] == 1
    grid_result[mask].to_csv(f"./test_results/{model.__class__.__name__}_grid.csv")
    grid_best = grid_result[mask].to_dict(orient="records")[0]

    
    print(f"The mean fit time is: {grid_best['mean_fit_time']:.2f} seconds")
    print(
        f"The mean cross-validated testing score is: {grid_best['mean_test_score']:.2f} %"
    )
    print(
        f"The standard deviation of the testing score is: {grid_best['std_test_score']:.2f}"
    )
    searches[model.__class__.__name__] = {
        "model": grid_search.best_estimator_,
        "params": grid_search.best_params_,
        "score": grid_search.best_score_
    }

DecisionTreeClassifier
Fitting 4 folds for each of 3 candidates, totalling 12 fits
The mean fit time is: 20.90 seconds
The mean cross-validated testing score is: 0.87 %
The standard deviation of the testing score is: 0.00
RandomForestClassifier
Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   31.6s finished


The mean fit time is: 157.91 seconds
The mean cross-validated testing score is: 0.89 %
The standard deviation of the testing score is: 0.01
AdaBoostClassifier
Fitting 4 folds for each of 3 candidates, totalling 12 fits
The mean fit time is: 1063.94 seconds
The mean cross-validated testing score is: 0.32 %
The standard deviation of the testing score is: 0.06
GradientBoostingClassifier
Fitting 4 folds for each of 3 candidates, totalling 12 fits


In [None]:
grid_result.sort_values(by="rank_test_score")


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,model
2,1164.315218,4.878989,9.524726,0.481169,500,{'n_estimators': 500},0.852254,0.858869,0.875125,0.866579,0.863207,0.008547,1,AdaBoostClassifier
1,760.11218,2.968938,13.422978,1.010765,250,{'n_estimators': 250},0.846994,0.844905,0.835003,0.844638,0.842885,0.004641,2,AdaBoostClassifier
0,278.997896,1.033443,4.043494,0.286727,100,{'n_estimators': 100},0.791951,0.819838,0.798671,0.824001,0.808615,0.013595,3,AdaBoostClassifier


In [None]:
results.sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,model,param_n_estimators
2,1164.315218,4.878989,9.524726,0.481169,,{'n_estimators': 500},0.852254,0.858869,0.875125,0.866579,0.863207,0.008547,1,AdaBoostClassifier,500.0
1,760.11218,2.968938,13.422978,1.010765,,{'n_estimators': 250},0.846994,0.844905,0.835003,0.844638,0.842885,0.004641,2,AdaBoostClassifier,250.0
2,127.159224,0.998245,3.872351,0.817372,,{'n_estimators': 500},0.834468,0.829888,0.842031,0.843225,0.837403,0.005486,1,RandomForestClassifier,500.0
0,34.652198,2.575229,6.171009,0.669631,,{'n_estimators': 100},0.838346,0.829875,0.846699,0.834105,0.837256,0.00622,2,RandomForestClassifier,100.0
1,89.201466,0.70265,8.341485,0.616288,,{'n_estimators': 250},0.830868,0.829417,0.846726,0.840012,0.836756,0.007045,3,RandomForestClassifier,250.0
0,278.997896,1.033443,4.043494,0.286727,,{'n_estimators': 100},0.791951,0.819838,0.798671,0.824001,0.808615,0.013595,3,AdaBoostClassifier,100.0
0,7.487934,0.579557,0.25059,0.01652,100.0,{'max_depth': 100},0.798826,0.804484,0.809376,0.817342,0.807507,0.006796,1,DecisionTreeClassifier,
1,7.277703,0.419626,0.275345,0.019618,250.0,{'max_depth': 250},0.798826,0.804484,0.809376,0.817342,0.807507,0.006796,1,DecisionTreeClassifier,
2,6.671303,0.114542,0.318535,0.066479,500.0,{'max_depth': 500},0.798826,0.804484,0.809376,0.817342,0.807507,0.006796,1,DecisionTreeClassifier,


# model evaluation

In [None]:
from pprint import pprint
pprint(searches)

{'AdaBoostClassifier': {'model': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10,
                                                         random_state=42),
                   n_estimators=500, random_state=42),
                        'params': {'n_estimators': 500},
                        'score': 0.8632069536381353},
 'DecisionTreeClassifier': {'model': DecisionTreeClassifier(max_depth=100, random_state=42),
                            'params': {'max_depth': 100},
                            'score': 0.8075069090439844},
 'RandomForestClassifier': {'model': RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42, verbose=2),
                            'params': {'n_estimators': 500},
                            'score': 0.8374030842871327}}


In [None]:
model = searches.get('RandomForestClassifier').get('model')
model = model.fit(x,y)

building tree 488 of 500
building tree 489 of 500
building tree 490 of 500
building tree 491 of 500
building tree 492 of 500
building tree 493 of 500
building tree 494 of 500
building tree 495 of 500
building tree 496 of 500
building tree 497 of 500
building tree 498 of 500
building tree 499 of 500
building tree 500 of 500


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.1min finished


In [None]:
x = df_test[model_columns]
y = df_test["label"]

print(
    classification_report(
        y_true=y, 
        y_pred=model.predict(x)
    )
)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.3s finished


                      precision    recall  f1-score   support

         Unknown_bot       0.99      0.98      0.98       684
         Real_Player       0.97      0.62      0.76       249
         Agility_bot       0.84      0.58      0.69       239
         Zalcano_bot       0.97      0.88      0.92       242
          Hunter_bot       0.99      0.99      0.99      1215
       Fletching_bot       0.98      0.98      0.98      1346
       Soul_Wars_bot       0.98      0.97      0.98       936
             LMS_bot       1.00      0.98      0.99       351
         Fishing_bot       0.99      0.98      0.98       864
        Thieving_bot       0.94      0.77      0.85       407
PVM_Ranged_Magic_bot       0.98      0.97      0.97      2126
       PVM_Melee_bot       0.94      0.92      0.93       974
        Crafting_bot       0.97      0.96      0.96      3796
        Smithing_bot       0.98      0.71      0.82        56
          Mining_bot       0.97      0.32      0.48        87
       