In [None]:
import sys
import os
from pathlib import Path

# Extend the import search path so the project's `src` package can be imported.
# NOTE(review): under a Jupyter kernel sys.argv[0] is typically the kernel
# launcher script rather than this notebook -- confirm this entry is still needed.
sys.path.append(
    str(Path(sys.argv[0]).absolute().parent.parent.parent.parent)
)

# The repository root is taken to be two directories above the working directory.
base_repo = os.path.realpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
print(f"{base_repo=}")
sys.path.append(base_repo)

# Echo the resulting search path and working directory for troubleshooting.
print(f"{sys.path=}")
print(f"{os.getcwd()=}")

In [None]:
from src.utils import utils
from src.utils import data
from src.utils import classifier

import collections
import os.path

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree

import graphviz

In [None]:
# Hiscore snapshot, located relative to the notebook's working directory.
PATH_TO_FILE = "../data/raw/2022-10-26_hiscore_data.parquet.gzip"

# Read the parquet file and turn its index into ordinary column(s).
df = pd.read_parquet(PATH_TO_FILE).reset_index()
df.head()

In [None]:
# All files produced further down are written with bare relative names, so the
# working directory is switched to the output folder here.
# NOTE: "../data/output" is resolved against the *current* working directory.
# Run this cell once per kernel session and only after the parquet file has
# been loaded; after the chdir the same relative paths point somewhere else.
OUTPUT_DIR = os.path.realpath("../data/output")
# Create the folder when it is missing so that os.chdir cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.chdir(OUTPUT_DIR)

In [None]:
# This logic is roughly equivalent to app.py's train function
# Wrap the raw hiscore frame in the project's hiscoreData helper and derive the
# `features` table from which both the binary and the multi-class inputs are built.
# NOTE(review): the meaning of the positional `False` flag is not visible in
# this notebook -- confirm against data.hiscoreData.
hiscoredata = data.hiscoreData(df, False)
features = hiscoredata.features()

In [None]:
# Model objects for the binary task. They are only fitted if the commented-out
# fit_score(...) calls further down are enabled.
# NOTE(review): the string argument looks like a model name -- confirm in
# src/utils/classifier.py.
binary_classifier = classifier.classifier("binaryClassifier")
dt_binary_classifier = classifier.DTclassifier("DTbinaryClassifier")

In [None]:
# Feature matrix for the binary task: a copy of `features` without the
# label, identity and bookkeeping columns listed below.
x = features.copy().drop(
    columns=[
        "label",
        "label_id",
        "name",
        "created_at",
        "updated_at",
        "account_status",
        "possible_ban",
        "confirmed_ban",
    ]
)
print("x columns:\n", x.columns)
x.head()

In [None]:
# Summary statistics of the binary-task feature matrix.
x.describe()

In [None]:
# Binary target: 0 for rows labelled "Real_Player", 1 for every other label.
y = features["label"].apply(lambda label: int(label != "Real_Player"))
y.head()

In [None]:
def fit_score(classifier, x, y, class_names=None, rpt=True):
    """Fit ``classifier`` on a stratified 80/20 split and return its test accuracy.

    This logic is roughly equivalent to app.py's train function.

    Args:
        classifier: model object providing ``fit``, ``predict`` and a
            ``score(test_y, test_x, class_names)`` method that returns an
            ``(accuracy, roc_auc)`` pair.
        x: feature matrix.
        y: target values; also used to stratify the split.
        class_names: passed through unchanged to ``classifier.score``.
        rpt: when true, print both scores and a classification report.

    Returns:
        The accuracy value returned by ``classifier.score`` for the held-out
        20% of the data.
    """
    # Fixed seed so repeated runs evaluate on the same held-out rows.
    split = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
    train_x, test_x, train_y, test_y = split
    classifier.fit(train_x, train_y)

    accuracy, roc_auc = classifier.score(test_y, test_x, class_names)
    if not rpt:
        return accuracy

    print(f"accuracy: {accuracy}, roc_auc: {roc_auc}")
    # Sample output recorded by the original author:
    # (0.9992126580557206, 0.9992126580557205)
    print(classification_report(y_true=test_y, y_pred=classifier.predict(test_x)))
    return accuracy


# To inspect binary classifier random forest and decision tree performance:
# fit_score(binary_classifier, x, y)
# fit_score(dt_binary_classifier, x, y)

In [None]:
# Output of block is x_multi, y_multi. Fiddle around with data here.
# Model objects for the multi-class task. In the cells visible here neither is
# fitted as created: multi_classifier is not referenced again and
# dt_multi_classifier is re-created inside the PLOT_TREES loop.
multi_classifier = classifier.classifier("multiClassifier")
dt_multi_classifier = classifier.DTclassifier("DTmultiClassifier")

In [None]:
# Labels that occur on more than 500 rows of `features`.
y_labels = [
    label
    for label, n_rows in features["label"].value_counts().items()
    if n_rows > 500
]
y_labels

In [None]:
# Keep only the rows whose label passed the frequency cut in y_labels.
x_multi = features.copy()
x_multi = x_multi.loc[x_multi["label"].isin(y_labels)]
x_multi.head()

In [None]:
# One integer id per label, numbered in the order returned by unique().
label_to_id = {
    label: label_id for label_id, label in enumerate(x_multi["label"].unique())
}
label_to_id

In [None]:
# Multi-class target: the string labels themselves. The integer encoding via
# label_to_id is left disabled in the trailing comment.
y_multi = x_multi["label"]  # .apply(lambda x: label_to_id[x])
y_multi

In [None]:
# (position, label) pairs in the order returned by unique(), plus the label
# list and the inverse label -> position lookup derived from them.
enum_label = list(enumerate(y_multi.unique()))
class_names = [label for _, label in enum_label]
label_to_enum = {label: position for position, label in enum_label}
label_to_enum

In [None]:
# Strip the label/identity/bookkeeping columns, then every column whose name
# matches the regex "/total".
# NOTE: this cell rebinds x_multi from itself; running it a second time raises
# because the listed columns are already gone.
x_multi = x_multi.drop(
    columns=[
        "label",
        "label_id",
        "name",
        "created_at",
        "updated_at",
        "account_status",
        "possible_ban",
        "confirmed_ban",
    ]
).pipe(
    lambda frame: frame[frame.columns.drop(list(frame.filter(regex="/total")))]
)
x_multi

In [None]:
# List the columns that remain as model inputs for the multi-class task.
print("x_multi columns:\n", x_multi.columns)

In [None]:
# Train a decision tree and output info about the set of samples that end up in each leaf node.
# Loosely based on: https://stackoverflow.com/a/66299085/5024503
def retrieve_branches(number_nodes, children_left_list, children_right_list):
    """Yield the root-to-leaf path of every leaf in a binary tree.

    Args:
        number_nodes: number of nodes; node ids are ``0 .. number_nodes - 1``
            and node 0 is the root.
        children_left_list: indexable where entry ``i`` is the id of node
            ``i``'s left child.
        children_right_list: indexable where entry ``i`` is the id of node
            ``i``'s right child. A node whose left and right entries are equal
            is treated as a leaf.

    Yields:
        One list of node ids per leaf, running from the root to that leaf
        (both inclusive), in increasing order of leaf id.

    Notes:
        Paths are grown while scanning node ids in increasing order, so a node
        is only reached when its parent has a smaller id. A tree that consists
        of a single leaf yields nothing.
    """
    # Paths still being grown, keyed by the node id they currently end at.
    open_paths = {}

    for node in range(number_nodes):
        # Only the arguments are consulted here, never notebook-level globals.
        left, right = children_left_list[node], children_right_list[node]

        if left == right:
            # Leaf: the path that reached it (if any) is complete.
            if node in open_paths:
                yield open_paths.pop(node)
            continue

        if node == 0:
            prefix = [node]
        elif node in open_paths:
            prefix = open_paths.pop(node)
        else:
            # No path from the root has reached this node; nothing to extend.
            continue

        open_paths[left] = prefix + [left]
        open_paths[right] = prefix + [right]

In [None]:
from src.utils.utils import XPTable

# Display the `df` attribute of a freshly constructed XPTable.
# NOTE(review): presumably the experience-to-level lookup table -- confirm in
# src/utils/utils.py.
XPTable().df

In [None]:
from src.utils.utils import XPTable

# Example call of exp_to_level, the same method print_node_info uses to
# annotate split thresholds with a level.
XPTable().exp_to_level(5000)

In [None]:
from src.utils.utils import XPTable

def print_node_info(
    label,
    node_info,
    feature,
    threshold,
    y_multi,
    col_names,
    x_multi: pd.DataFrame,
    samp_to_leaf,
    label_i,
    min_samples=100,
):
    """Write a CSV of one leaf's samples and a TXT summary describing the leaf.

    Nothing is written when the leaf holds fewer than ``min_samples`` samples.
    Files are created in the current working directory as
    ``{label}-{label_i}-#{n_samples}.csv`` and ``{label}-{label_i}.txt``.

    Args:
        label: label assigned to this leaf; used in the file names and to
            decide whether summary statistics are appended ("Unknown_bot").
        node_info: tuple ``(leaf_index, branch_idx, branch, impurity,
            n_samples, ntotal, value)`` where ``branch`` is the list of node
            ids from the root to the leaf and ``value`` holds one number per
            class.
        feature: per-node index of the column used for the split.
        threshold: per-node split threshold.
        y_multi: target labels; their sorted unique values are paired
            positionally with ``value``.
        col_names: column names, indexable by the entries of ``feature``.
        x_multi: feature frame; rows are selected positionally.
        samp_to_leaf: Series mapping row position to leaf id.
        label_i: rank of this leaf among the leaves sharing ``label``.
        min_samples: minimum leaf size required to produce any output.

    Returns:
        None.
    """
    # Unpack node information
    leaf_index, branch_idx, branch, impurity, n_samples, ntotal, value = node_info

    if n_samples < min_samples:
        return

    # Unique target labels, paired positionally with the per-class values.
    class_labels = sorted(y_multi.unique())

    # Share of all samples that end up in this leaf.
    pct_samples = 100.0 * float(n_samples) / float(ntotal)

    file_stem = f"{label}-{label_i}"
    filename = f"{file_stem}.txt"
    delimiter_text = "-" * 50 + "\n"

    # Largest class value first, for printing.
    label_vals = sorted(
        zip(class_labels, value), key=lambda tup: tup[1], reverse=True
    )

    # Rows that fall into this leaf; the first one is used as a representative
    # to work out which side of every split the leaf lies on.
    samp_idxs_in_leaf = samp_to_leaf.index[samp_to_leaf == leaf_index]
    out_df: pd.DataFrame = x_multi.iloc[samp_idxs_in_leaf].copy()
    samp_in_leaf = out_df.iloc[0]

    # Export the samples of this leaf.
    out_df.to_csv(f"{file_stem}-#{n_samples}.csv")

    xp_table = XPTable()

    # One rule per split on the way down. The leaf itself performs no split,
    # so its feature/threshold entries are skipped instead of being reported.
    decision_rules = []
    for node in branch:
        if node == leaf_index:
            continue

        feature_idx = feature[node]
        split_value = threshold[node]
        column = col_names[feature_idx]

        # Explicit positional access; integer keys on a label-indexed Series
        # are not reliable.
        op = "<=" if samp_in_leaf.iloc[feature_idx] <= split_value else ">"

        if column.lower() in utils.SKILLS:
            level = xp_table.exp_to_level(split_value) if split_value > 0 else 0
            lvl = f"(lvl {level})"
        else:
            lvl = ""
        decision_rules.append(f"{column} {op} {split_value} {lvl}")

    # Write information to the output file
    with open(filename, "w") as f:
        f.write(delimiter_text)
        f.write(f"Branch: {branch_idx}, Path: {branch}\n")
        f.write(f"\t{n_samples} in node ({pct_samples}%) (from training + test data)\n")
        f.write(f"\tGin {impurity} at leaf node {branch[-1]}\n")

        # Per-class values, skipping classes with a value of 0. Distinct loop
        # names keep the `label` argument intact for the check further down.
        f.write("\tValue (from training data):\n")
        for class_label, class_value in label_vals:
            if class_value == 0:
                continue
            f.write("\t\t" + f"{class_label}: {class_value}" + "\n")

        # Write decision rules for the branch
        f.write("\tDecision Rules:\n")
        for rule in decision_rules:
            f.write("\t\t" + str(rule) + "\n")

        # Leaves labelled 'Unknown_bot' additionally get summary statistics.
        if label == "Unknown_bot":
            print(f"For {filename}: label == 'Unknown_bot'")
            print(f"Printing df.describe() in {filename}")
            f.write(f"\t{out_df.describe()}\n")
        f.write(delimiter_text)
    return

In [None]:
TREE_DEPTH = 20

# Banner describing what this cell and the following ones generate.
print(
    "##############################################################",
    f"### Training decision tree with depth {TREE_DEPTH}.#####################",
    "### Generating a label-id.csv file for each leaf node, where #",
    "### label is the tree's classification at that node, and #####",
    "### id is unique for each label. id 0 is the leaf containing #",
    "### the most samples, 1 the next most, etc... ################",
    "### Also generating corresponding label-id.txt files, ########",
    "### containing descriptive info about the .csv file ##########",
    "##############################################################",
    sep="\n",
)

# Fit a depth-limited tree on the multi-class data and report its scores.
clf = classifier.DTclassifier(
    f"DTmultiClassifier{TREE_DEPTH}", max_depth=TREE_DEPTH
)
score = fit_score(clf, x_multi, y_multi, rpt=True)

In [None]:
# above ~10 is too deep to graph, so print helpful output instead.
# NOTE(review): assumes DTclassifier exposes the scikit-learn decision-tree
# API (tree_, apply, classes_) -- confirm in src/utils/classifier.py.
col_names = x_multi.columns.tolist()

# Per-node arrays of the fitted tree.
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
impurity = clf.tree_.impurity
value = clf.tree_.value

# Every root-to-leaf path of the tree.
all_branches = list(retrieve_branches(n_nodes, children_left, children_right))

# Result of clf.apply for every row of x_multi, the number of rows per distinct
# result, and the overall row count.
samp_to_leaf = pd.Series(clf.apply(x_multi))
leaf_idxs = samp_to_leaf.value_counts()
ntotal = leaf_idxs.sum()


def get_label_node_info(index, branch, leaf_idxs, impurity, ntotal, value, clf_classes):
    """Describe the leaf at the end of ``branch`` and name its majority class.

    Args:
        index: position of ``branch`` in the list of all branches.
        branch: node ids from the root to the leaf; the last entry is the leaf.
        leaf_idxs: mapping from leaf id to the number of samples in that leaf.
        impurity: per-node impurity, indexable by node id.
        ntotal: total number of samples.
        value: per-node class values, indexable by node id; the first row of
            an entry holds one number per class.
        clf_classes: class labels, indexable by class position.

    Returns:
        ``(label, node_info)`` where ``label`` is the class with the largest
        value at the leaf and ``node_info`` is the tuple ``(leaf_index, index,
        branch, impurity, n_samples, ntotal, class_values)``.
    """
    leaf = branch[-1]
    leaf_value = value[leaf]
    predicted_label = clf_classes[np.argmax(leaf_value)]
    details = (
        leaf,
        index,
        branch,
        impurity[leaf],
        leaf_idxs[leaf],
        ntotal,
        leaf_value[0],
    )
    return predicted_label, details


def get_label_to_node_info_dict(all_branches):
    """Group the leaf descriptions of all branches by their majority label.

    Reads the notebook-level ``leaf_idxs``, ``impurity``, ``ntotal``, ``value``
    and ``clf`` in addition to its argument.

    Args:
        all_branches: iterable of root-to-leaf node-id lists.

    Returns:
        A ``defaultdict(list)`` mapping each label to the node-info tuples of
        the leaves assigned that label, in branch order.
    """
    grouped = collections.defaultdict(list)
    for position, path in enumerate(all_branches):
        predicted, info = get_label_node_info(
            position, path, leaf_idxs, impurity, ntotal, value, clf.classes_
        )
        grouped[predicted].append(info)
    return grouped


# Leaf descriptions grouped by label, and the labels in insertion order.
label_to_node_info = get_label_to_node_info_dict(all_branches)
labels = list(label_to_node_info)


def generate_label_file(
    label, node_info, feature, threshold, y_multi, col_names, x_multi, samp_to_leaf
):
    """Emit the per-leaf files for every leaf assigned to ``label``.

    Args:
        label: label shared by all leaves in ``node_info``.
        node_info: list of node-info tuples; element 4 of each tuple is the
            leaf's sample count.
        feature, threshold, y_multi, col_names, x_multi, samp_to_leaf:
            forwarded unchanged to ``print_node_info``.

    Leaves are processed from the largest sample count to the smallest and the
    resulting rank (0, 1, ...) is passed on as the leaf's id.
    """
    print("###################################################")
    print("### Generating label files for: ", label, "########")
    print("###################################################")

    ranked = sorted(node_info, key=lambda info: info[4], reverse=True)
    for rank, leaf_info in enumerate(ranked):
        print_node_info(
            label,
            leaf_info,
            feature,
            threshold,
            y_multi,
            col_names,
            x_multi,
            samp_to_leaf,
            rank,
        )


def generate_label_files(
    label_to_node_info, feature, threshold, y_multi, col_names, x_multi, samp_to_leaf
):
    """Run ``generate_label_file`` once for every label in the mapping.

    Args:
        label_to_node_info: mapping from label to the list of node-info tuples
            of the leaves assigned that label.
        feature, threshold, y_multi, col_names, x_multi, samp_to_leaf:
            forwarded unchanged to ``generate_label_file``.
    """
    shared_args = (feature, threshold, y_multi, col_names, x_multi, samp_to_leaf)
    for current_label, leaves in label_to_node_info.items():
        generate_label_file(current_label, leaves, *shared_args)


# Write the per-leaf CSV/TXT files for every label into the working directory.
generate_label_files(
    label_to_node_info=label_to_node_info,
    feature=feature,
    threshold=threshold,
    y_multi=y_multi,
    col_names=col_names,
    x_multi=x_multi,
    samp_to_leaf=samp_to_leaf,
)

In [None]:
# Render the fitted tree as indented text rules, including the per-class
# weights at the leaves, for offline inspection.
s = tree.export_text(
    clf,
    feature_names=x_multi.columns.tolist(),
    max_depth=100,
    show_weights=True,
    spacing=1,
)
# The file name is relative, so it lands in the current working directory.
with open("inspect_dt-tree.txt", "w") as f:
    f.write(s)

In [None]:
# Graph some decision trees with lower depths.
# Disabled by default; set PLOT_TREES to True to fit one tree per depth in
# 2..9 and render each of them to a PDF in the current working directory.
PLOT_TREES = False
if PLOT_TREES:
    print("Generating small decision tree images (large ones dont display well)")
    # Depths tried and the accuracy obtained at each depth, kept in step.
    scores = []
    i_vals = []
    for i in range(2, 10):
        dt_multi_classifier = classifier.DTclassifier(
            f"DTmultiClassifier{i}", max_depth=i
        )
        score = fit_score(dt_multi_classifier, x_multi, y_multi)
        scores.append(score)
        # Record the depth itself so i_vals lines up with scores.
        i_vals.append(i)
        out_file = f"DTmultiClassifer_{i}"
        dot_data = tree.export_graphviz(
            dt_multi_classifier,
            rotate=True,
            impurity=False,
            precision=1,
            feature_names=x_multi.columns,
            class_names=sorted(y_multi.unique()),
        )
        graph = graphviz.Source(dot_data)
        graph.render(filename=out_file, format="pdf")

In [None]:
# Fraction of the multi-class rows that carries each label.
y_multi.value_counts() / len(y_multi)