In [None]:
# Smoke test for the RAPIDS environment: import the GPU libraries and print a
# tiny cuDF Series so a broken install surfaces in the very first cell.
import cudf
import cuml

print(cudf.Series([1, 2, 3]))

In [None]:
import gc
import sys
import os
import os.path
from pathlib import Path
def add_to_sys_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Keeps this cell idempotent: re-running it does not pile up duplicate
    entries on the import path.

    Args:
        directory: path (``str`` or ``pathlib.Path``) to make importable.
    """
    directory = str(directory)
    if directory not in sys.path:
        sys.path.append(directory)


# NOTE(review): inside a notebook kernel sys.argv[0] is presumably the kernel
# launcher rather than this notebook, so this entry may not point into the
# repository at all — confirm it is still needed.
add_to_sys_path(Path(sys.argv[0]).absolute().parent.parent.parent.parent.parent)

# The repository root is taken to be three directories above the working
# directory; the `src.` package imports in the next cell rely on it.
base_repo = os.path.realpath(os.path.join(os.getcwd(), "../../../"))
print(base_repo)
add_to_sys_path(base_repo)

# Show the resulting import path and working directory for debugging.
print(sys.path)
print(os.getcwd())


In [None]:
from src.discovery.rapids import data
from src.discovery.rapids import classifier


In [None]:

# Locations are resolved relative to the notebook's working directory.
OUTPUT_DIR = os.path.realpath('./output/')
PATH_TO_CSV = os.path.realpath('../../data/raw/2022-10-26_hiscore_data.csv')

# os.path.realpath always returns a string, so only existence needs checking.
# The hint travels with the exception so it is visible even when stdout is not
# (e.g. headless execution).
if not os.path.exists(PATH_TO_CSV):
    raise ValueError(
        'set PATH_TO_CSV=/path/to/2022-10-26_hiscore_data.csv and run all'
        f' (nothing found at {PATH_TO_CSV})'
    )

# Load the raw hiscore dump as a cuDF DataFrame.
df = cudf.read_csv(PATH_TO_CSV)

In [None]:
# This logic is roughly equivalent to app.py's train function.
# Wrap the raw frame in the project's hiscoreData helper.
# NOTE(review): the meaning of the positional `False` is not visible from this
# notebook — confirm against data.hiscoreData's signature.
hiscoredata = data.hiscoreData(df, False)
# Drop the notebook-level name for the raw frame (presumably to free memory —
# confirm hiscoreData does not keep its own reference). After this, the cell
# cannot be re-run on its own without re-running the CSV load cell first.
del df


In [None]:

# Build the feature frame from the wrapped hiscore data. Later cells read
# 'label', 'label_id' and 'account_status' from it and drop those (plus other
# metadata columns) before training.
features = hiscoredata.features()


In [None]:

# No playerData()-related join is needed: that information is already part of
# `features`, so the frame is used directly as the labelled data set.
binary_classifier = classifier.classifier("binaryClassifier")

# Logic should be the same as app.py line 179 (train the model): the target is
# 0 for 'Real_Player' rows and 1 for every other label.
y = features['label'].apply(lambda label: 0 if label == 'Real_Player' else 1)

# TODO: one-hot encode account status
print('unique account_status values:', features['account_status'].unique())

# TODO: using created_at, updated_at epoch times slightly improve binary classifier
# x['created_at'] = pd.to_datetime(x['created_at'], format="%Y-%m-%d %H:%M:%S").apply(lambda x: (x - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s"))
# x['updated_at'] = pd.to_datetime(x['updated_at'], format="%Y-%m-%d %H:%M:%S").apply(lambda x: (x - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s"))
# print(x['created_at'].head())

# Model inputs: everything except the label, identifier, timestamp and
# account/ban status columns. `features` itself is left untouched.
x = features.drop(
    columns=[
        'label',
        'label_id',
        'name',
        'created_at',
        'updated_at',
        'account_status',
        'possible_ban',
        'confirmed_ban',
    ]
)

print('x columns:\n', x.columns)
print('x head:\n', x.head())
print('y head:\n', y.head())
x.describe()


In [None]:
# This logic is roughly equivalent to app.py's train function
def fit_score(classifier, x, y, rpt=True):
    """Train ``classifier`` on a stratified split of ``x``/``y`` and score it.

    Twenty percent of the rows are held out (``random_state=42``, stratified
    on ``y``); the classifier is fitted on the remainder and scored on the
    held-out part.

    Args:
        classifier: object providing ``fit(x, y)``, ``predict(x)`` and
            ``score(test_y, test_x)``; ``score`` must return an
            ``(accuracy, roc_auc)`` pair. Note the labels-first argument
            order. Inside this function the name shadows the imported
            ``classifier`` module.
        x: feature table accepted by ``cuml.model_selection.train_test_split``.
        y: labels aligned with ``x``.
        rpt: when true, print the two scores and the confusion matrix for
            the held-out rows.

    Returns:
        The accuracy reported by ``classifier.score``.
    """
    train_x, test_x, train_y, test_y = cuml.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42, stratify=y
    )
    # Only this function's own references are dropped here; whatever the
    # caller still holds keeps the full data alive.
    del x, y
    gc.collect()

    classifier.fit(train_x, train_y)
    accuracy, roc_auc = classifier.score(test_y, test_x)
    if not rpt:
        return accuracy

    print(f'accuracy: {accuracy}, roc_auc: {roc_auc}')
    # A run recorded by the original author produced
    # (0.9992126580557206, 0.9992126580557205) for these two scores.
    held_out_predictions = classifier.predict(test_x)
    print(
        cuml.metrics.confusion_matrix(
            y_true=test_y,
            y_pred=held_out_predictions,
            convert_dtype=True,
        )
    )
    return accuracy


In [None]:

# Train and evaluate the binary model, then print the summary text followed by
# the detailed text of the fitted model stored on `.rfc`.
fit_score(binary_classifier, x, y)
print(
    binary_classifier.rfc.get_summary_text(),
    binary_classifier.rfc.get_detailed_text(),
    sep='\n',
)


In [None]:

# Release the binary model before the multi-class run: drop the fitted
# estimator first, then its wrapper, and force a garbage-collection pass.
del binary_classifier.rfc, binary_classifier
gc.collect()


In [None]:
# Output of block is x_multi, y_multi. Fiddle around with data here.
multi_classifier = classifier.classifier("multiClassifier")
# dt_multi_classifier = classifier.DTclassifier("DTmultiClassifier")

# A label id must occur more than this many times to be used for training.
MIN_SAMPLES_PER_LABEL = 500

# Per-label row counts, reduced to the sufficiently frequent label ids.
y_labels = features['label_id'].value_counts()
y_labels = y_labels[y_labels.values > MIN_SAMPLES_PER_LABEL]
# To inspect just bots, additionally exclude the 'Real_Player' label here.

# Selecting rows with a boolean mask already produces a new frame, so no
# up-front copy of `features` is taken. The value_counts() index holds each
# label id once, so it can be passed to isin() as is.
frequent_rows = features[features['label_id'].isin(y_labels.index)]

# Target: the label id itself (no remapping to consecutive ids).
y_multi = frequent_rows['label_id']

# Model inputs: everything except the label, identifier, timestamp and
# account/ban status columns. `features` itself is left untouched.
x_multi = frequent_rows.drop(
    columns=[
        'label',
        'label_id',
        'name',
        'created_at',
        'updated_at',
        'account_status',
        'possible_ban',
        'confirmed_ban',
    ]
)
# The intermediate selection is no longer needed once x_multi/y_multi exist.
del frequent_rows

print('x_multi columns:\n', x_multi.columns)
print('x_multi head:\n', x_multi.head())
print('y_multi head:\n', y_multi.head())


In [None]:

# Train and evaluate the multi-class model, then print the summary text
# followed by the detailed text of the fitted model stored on `.rfc`.
fit_score(multi_classifier, x_multi, y_multi)
print(
    multi_classifier.rfc.get_summary_text(),
    multi_classifier.rfc.get_detailed_text(),
    sep='\n',
)


In [None]:
# Drop the multi-class model and its training data now that scoring is done.
del multi_classifier, x_multi, y_multi

In [None]:
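# NOTE: disabled SHAP experiment. It cannot simply be uncommented here:
# multi_classifier is deleted by the cleanup cell above, and train_x_multi /
# test_x_multi are not defined in any of the cells shown in this notebook.
# explainer = cuml.explainer.KernelExplainer(model=multi_classifier.rfc.predict, data=train_x_multi.iloc[:10], random_state=42)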
# shap_values = explainer.shap_values(test_x_multi.iloc[:10])
# shap_values
# # explainer = cuml.explainer.TreeExplainer(model=multi_classifier.rfc)
# # shap_values = explainer.shap_values(test_x_multi.iloc[:100])
# # print(shap_values)