In [None]:
import sys
import os
from pathlib import Path
# print(os.path.realpath(__file__))
# Make the project's `src` package importable from this notebook.
# Each candidate is only appended when it is missing, so re-running this cell
# does not pile duplicate entries onto sys.path.
# NOTE(review): under a Jupyter kernel sys.argv[0] is typically the kernel
# launcher rather than this notebook -- confirm this first entry is still needed.
launcher_root = str(Path(sys.argv[0]).absolute().parent.parent.parent.parent)
if launcher_root not in sys.path:
    sys.path.append(launcher_root)
# The repository root is taken to be two directories above the working directory.
base_repo = os.path.realpath(os.path.join(os.getcwd(), "../../"))
print(base_repo)
# add the entire folder to path
if base_repo not in sys.path:
    sys.path.append(base_repo)
print(sys.path)
print(os.getcwd())
from src.utils import utils
from src.discovery import data
from src.discovery import classifier

import collections
import os.path

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree

import graphviz 

In [None]:
# Configuration: where generated files go and where the raw hiscore CSV lives.
# Both paths are resolved against the current working directory.
OUTPUT_DIR = os.path.realpath('./output/')
PATH_TO_CSV = os.path.realpath('../data/raw/2022-10-26_hiscore_data.csv')

# Fail early with an actionable message; the None check covers a manual
# `PATH_TO_CSV = None` edit, for which os.path.exists would raise TypeError.
if PATH_TO_CSV is None or not os.path.exists(PATH_TO_CSV):
    raise ValueError(
        f'CSV not found at {PATH_TO_CSV!r}: '
        'set PATH_TO_CSV=/path/to/2022-10-26_hiscore_data.csv and run all'
    )

df = pd.read_csv(PATH_TO_CSV)

In [None]:
# Create the output directory if it does not exist yet, then make it the
# working directory so the relative filenames written by later cells
# (label .txt/.csv files, the tree export, rendered graphs) land inside it.
# NOTE(review): chdir is process-wide state. OUTPUT_DIR and PATH_TO_CSV are built
# from relative paths, so re-running the configuration cell after this one
# resolves them against the new working directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.chdir(OUTPUT_DIR)

In [None]:
# Build the binary training set (0 = 'Real_Player', 1 = anything else).
# This logic is roughly equivalent to app.py's train function.
hiscoredata = data.hiscoreData(df, False)
features = hiscoredata.features()
# No playerData()-related join is needed: that data is already in `features`.
features_labeled = features

binary_classifier = classifier.classifier("binaryClassifier")
dt_binary_classifier = classifier.DTclassifier("DTbinaryClassifier")

# Logic should be the same as app.py line 179: train the model
x = features_labeled.copy()
y = x['label'].apply(lambda label: 0 if label == 'Real_Player' else 1)
# Strip the target and the bookkeeping columns so only model inputs remain.
x = x.drop(
    columns=[
        'label',
        'label_id',
        'name',
        'created_at',
        'updated_at',
        'account_status',
        'possible_ban',
        'confirmed_ban',
    ]
)

print('x columns:\n', x.columns)
print('x head:\n', x.head())
print('y head:\n', y.head())
x.describe()


In [None]:
def fit_score(classifier, x, y, class_names=None, rpt=True):
    """Fit `classifier` on a stratified 80/20 split of (x, y) and return its accuracy.

    Roughly equivalent to app.py's train function.

    Args:
        classifier: object providing fit(x, y), score(y, x, class_names) and
            predict(x); score() must return an (accuracy, roc_auc) pair.
        x: feature table.
        y: target values aligned with `x`; also used to stratify the split.
        class_names: forwarded unchanged to classifier.score().
        rpt: when True, print the accuracy/ROC-AUC line and a classification
            report computed on the held-out split.

    Returns:
        The accuracy value reported by classifier.score().
    """
    split = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
    train_x, test_x, train_y, test_y = split
    classifier.fit(train_x, train_y)

    # Note the argument order of score(): labels first, then features.
    metrics = classifier.score(test_y, test_x, class_names)
    accuracy, roc_auc = metrics

    if not rpt:
        return accuracy

    print(f'accuracy: {accuracy}, roc_auc: {roc_auc}')
    # A previously recorded run produced: (0.9992126580557206, 0.9992126580557205)
    report = classification_report(
        y_true=test_y,
        y_pred=classifier.predict(test_x)
    )
    print(report)
    return accuracy

# To inspect binary classifier random forest and decision tree performance:
# fit_score(binary_classifier, x, y)
# fit_score(dt_binary_classifier, x, y)

In [None]:
# Output of block is x_multi, y_multi. Fiddle around with data here.
multi_classifier = classifier.classifier("multiClassifier")
dt_multi_classifier = classifier.DTclassifier("DTmultiClassifier")

# Only keep labels that have more than 500 samples.
label_counts = features_labeled['label'].value_counts()
y_labels = [label for label, count in label_counts.items() if count > 500]
# To inspect just bots:
# y_labels.remove('Real_Player')

# Take an explicit copy of the filtered rows so the later column drops act on
# an independent frame (an in-place drop on a filtered slice can trigger
# pandas' SettingWithCopyWarning).
x_multi = features_labeled[features_labeled['label'].isin(y_labels)].copy()
label_to_id = {label: label_id for label_id, label in enumerate(x_multi['label'].unique())}
# y_multi keeps the string labels; label_to_id holds an integer encoding if one is wanted.
y_multi = x_multi['label']
x_multi = x_multi.drop(
    columns=[
        'label',
        'label_id',
        'name',
        'created_at',
        'updated_at',
        'account_status',
        'possible_ban',
        'confirmed_ban',
    ]
)

# Label <-> position lookups, in order of first appearance in y_multi.
enum_label = list(enumerate(y_multi.unique()))
class_names = [label for _, label in enum_label]
label_to_enum = {label: position for position, label in enum_label}

# Remove every column whose name matches the regex '/total'.
x_multi = x_multi.drop(columns=list(x_multi.filter(regex='/total')))

print('x_multi columns:\n', x_multi.columns)
print('x_multi head:\n', x_multi.head())
print('y_multi head:\n', y_multi.head())
# To inspect multiclassifier random forest and decision tree performance:
#fit_score(multi_classifier, x_multi, y_multi)
#fit_score(dt_multi_classifier, x_multi, y_multi)


In [None]:
# Train a decision tree and output info about the set of samples that end up in each leaf node.
# Loosely based on: https://stackoverflow.com/a/66299085/5024503
def retrieve_branches(number_nodes, children_left_list, children_right_list):
    """Yield the root-to-leaf node path of every leaf in a binary decision tree.

    Args:
        number_nodes: total number of nodes in the tree.
        children_left_list: children_left_list[i] is the left child id of node i.
        children_right_list: children_right_list[i] is the right child id of node i.
            A node whose left and right child ids are equal is treated as a leaf.

    Yields:
        One list of node ids per leaf, starting at the root (node 0) and ending
        at that leaf, in increasing order of leaf id. A tree consisting of a
        single leaf node yields nothing.

    Note:
        Nodes are visited in id order, so a parent must have a smaller id than
        its children for its path to be extended before the children are seen.
    """
    # A node is a leaf when both child pointers are the same value.
    is_leaves_list = [cl == cr for cl, cr in zip(children_left_list, children_right_list)]

    # Partial paths, each ending at a node that has not been processed yet.
    paths = []

    for i in range(number_nodes):
        if is_leaves_list[i]:
            # A path ending at this leaf is complete: hand it out and forget it.
            end_node = [path[-1] for path in paths]
            if i in end_node:
                yield paths.pop(end_node.index(i))
        else:
            origin, end_l, end_r = i, children_left_list[i], children_right_list[i]

            # Fork every path that currently ends at this node: the existing
            # entry continues to the left child and a new entry goes right.
            # Appending while iterating is safe because the appended path ends
            # at end_r, which never equals origin.
            for index, path in enumerate(paths):
                if origin == path[-1]:
                    paths[index] = path + [end_l]
                    paths.append(path + [end_r])

            # Seed the two initial paths from the root. This must read the
            # function's own arguments, not same-named module-level globals.
            if i == 0:
                paths.append([i, children_left_list[i]])
                paths.append([i, children_right_list[i]])

def print_node_info(label, node_info, feature, threshold, y_multi, col_names, x_multi, samp_to_leaf, label_i):
    """Write a summary (.txt) and the member samples (.csv) for one leaf node.

    Files are created in the current working directory and are named
    `<label>-<label_i>.txt` and `<label>-<label_i>.csv`.

    Args:
        label: class assigned to this leaf; used in the file names.
        node_info: tuple of (leaf_index, branch_idx, branch, impurity, nsamples,
            ntotal, value), where `branch` is the list of node ids from the root
            to the leaf and `value` holds one weight per class.
        feature: per-node index of the column each node splits on.
        threshold: per-node split threshold.
        y_multi: label Series; its sorted unique values are paired with `value`.
        col_names: column names of `x_multi`, in column order.
        x_multi: feature DataFrame whose row positions match `samp_to_leaf`.
        samp_to_leaf: Series mapping each row position of `x_multi` to the id of
            the leaf it lands in.
        label_i: rank of this leaf among the leaves sharing `label`; used in the
            file names.
    """
    labels = sorted(y_multi.unique())
    leaf_index, branch_idx, branch, impurity, nsamples, ntotal, value = node_info
    indent = '\t\t\t'
    pct_samples = 100.0 * float(nsamples)/float(ntotal)

    file_stem = f'{label}-{label_i}'
    with open(f'{file_stem}.txt', 'w') as f:
        f.write("---------------------------------------------------------------------------------------\n")
        f.write(f'{indent}Branch: {branch_idx}, Path: {branch}\n')
        f.write(f'{indent}{nsamples} in node ({pct_samples}%)\n')
        f.write(f'{indent}Gin {impurity} at leaf node {branch[-1]}\n')
        # Class weights at this leaf, heaviest class first.
        label_vals = sorted(zip(labels, value), key=lambda tup: tup[1], reverse=True)
        weights_str = ' '.join(f'{l}:{int(v)}' for l, v in label_vals)
        f.write(f'{indent}Value: {weights_str}\n')

        # Rows that end up in this leaf; they are dumped next to the summary.
        samp_idxs_in_leaf = samp_to_leaf.index[samp_to_leaf == leaf_index]
        # Use a feature vector that falls in this leaf to determine '<=' or '>'
        samp_in_leaf = x_multi.iloc[samp_idxs_in_leaf[0]]
        out_df = x_multi.iloc[samp_idxs_in_leaf].copy()
        out_df.to_csv(f'{file_stem}.csv')

        rules = []
        # The last entry of `branch` is the leaf itself. A leaf performs no
        # split (scikit-learn stores a placeholder feature/threshold there), so
        # only the internal nodes before it contribute a decision rule.
        for elem in branch[:-1]:
            feat_idx = feature[elem]
            # Positional lookup: `feature` holds column positions, not labels.
            op = '<=' if samp_in_leaf.iloc[feat_idx] <= threshold[elem] else '>'
            # For skill columns, also show the level the XP threshold maps to.
            lvl = '' if col_names[feat_idx].lower() not in utils.SKILLS else (f'(lvl {utils.XPTable.exp_to_level(threshold[elem])})')
            rules.append(f'{col_names[feat_idx]} {op} {threshold[elem]} {lvl}')
        f.write(f"{indent}Decision Rules: " + ', '.join(rules) + '\n')

        if pct_samples > 0.5 and label == 'Unknown_bot':
            # Compute some stats for this label's common leaf nodes for convenience.
            #   You can do this after the .csv files are output, too.
            f.write(f"{indent}{out_df.describe()}\n")
        f.write("---------------------------------------------------------------------------------------\n")

# A tree deeper than ~10 levels is too large to graph, so emit per-leaf
# text/CSV reports instead.
TREE_DEPTH=19
for banner_line in (
    '##############################################################',
    f'### Training decision tree with depth {TREE_DEPTH}.#####################',
    '### Generating a label-id.csv file for each leaf node, where #',
    "### label is the tree's classification at that node, and #####",
    '### id is unique for each label. id 0 is the leaf containing #',
    '### the most samples, 1 the next most, etc... ################',
    '### Also generating corresponding label-id.txt files, ########',
    '### containing descriptive info about the .csv file ##########',
    '##############################################################',
):
    print(banner_line)

clf = classifier.DTclassifier(f"DTmultiClassifier{TREE_DEPTH}", max_depth=TREE_DEPTH)
score = fit_score(clf, x_multi, y_multi, rpt=True)
col_names = x_multi.columns.tolist()

# Pull the fitted tree's per-node arrays out once.
fitted_tree = clf.tree_
n_nodes = fitted_tree.node_count
children_left = fitted_tree.children_left
children_right = fitted_tree.children_right
feature = fitted_tree.feature
threshold = fitted_tree.threshold
impurity = fitted_tree.impurity
value = fitted_tree.value

all_branches = list(retrieve_branches(n_nodes, children_left, children_right))

# Leaf id reached by every row of x_multi, and how many rows each leaf holds.
samp_to_leaf = pd.Series(clf.apply(x_multi))
leaf_idxs = samp_to_leaf.value_counts()
ntotal = leaf_idxs.sum()

# Group the leaves by the class the tree predicts at that leaf.
label_to_node_info = collections.defaultdict(list)
for index, branch in enumerate(all_branches):
    leaf_index = branch[-1]
    nsamples = leaf_idxs[leaf_index]
    leaf_value = value[leaf_index]
    label = clf.classes_[np.argmax(leaf_value)]
    node_info = (leaf_index, index, branch, impurity[leaf_index], nsamples, ntotal, leaf_value[0])
    label_to_node_info[label].append(node_info)
labels = list(label_to_node_info.keys())

# Within each label, rank the leaves by sample count (largest first) and write
# one .txt/.csv pair per leaf.
for label, v in label_to_node_info.items():
    print('###################################################')
    print('### Generating label files for: ', label, '########')
    print('###################################################')
    ranked_leaves = sorted(v, key=lambda info: info[4], reverse=True)
    for i, node_info in enumerate(ranked_leaves):
        print_node_info(label, node_info, feature, threshold, y_multi, col_names, x_multi, samp_to_leaf, i)



In [None]:
# Dump the fitted tree as indented text, with per-class weights, for manual inspection.
s = tree.export_text(
    clf,
    feature_names=x_multi.columns.tolist(),
    max_depth=100,
    show_weights=True,
    spacing=1,
)
with open('inspect_dt-tree.txt', 'w') as tree_file:
    tree_file.write(s)

In [None]:
# Graph some decision trees with lower depths.
# Set PLOT_TREES to True to train one tree per depth in 2..9 and render each to a PDF.
PLOT_TREES = False
if PLOT_TREES:
    print('Generating small decision tree images (large ones dont display well)')
    scores = []  # accuracy per depth, aligned with i_vals
    i_vals = []  # the depths that were trained
    for i in range(2, 10):
        dt_multi_classifier = classifier.DTclassifier(f"DTmultiClassifier{i}", max_depth=i)
        score = fit_score(dt_multi_classifier, x_multi, y_multi)
        scores.append(score)
        # Record the depth itself (previously the list was appended to itself).
        i_vals.append(i)
        out_file = f'DTmultiClassifer_{i}'
        dot_data = tree.export_graphviz(
            dt_multi_classifier,
            rotate=True,
            impurity=False,
            precision=1,
            feature_names=x_multi.columns,
            class_names=sorted(y_multi.unique()),
        )
        graph = graphviz.Source(dot_data)
        graph.render(filename=out_file, format='pdf')

    
