In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that
# files stored on Drive can be accessed through ordinary paths.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
import sys
import os
# Root of the mounted Drive: it is added to the module search path and made
# the current working directory.
path = '/content/drive/My Drive'
sys.path.append(path)
os.chdir(path)
# Step into the project folder; relative file names used afterwards resolve
# against this directory.
%cd BDA 21

/content/drive/My Drive/BDA 21


load_data

In [None]:
import pandas as pd


def load_dataset(path):
    """Read a CSV file and split it into a feature matrix and a label vector.

    Parameters
    ----------
    path : str, path-like or file-like
        Handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` contains the values of the columns at positions
        3 to 8 (inclusive), ``y`` contains the ``"label"`` column cast to
        ``int``.
    """
    frame = pd.read_csv(path)
    # Features are selected by position, the target by column name.
    features = frame.iloc[:, 3:9].values
    labels = frame["label"].values.astype(int)
    return features, labels

resample

In [None]:
# Install the third-party packages the notebook imports later on
# (pytorch_tabnet and imblearn).
# NOTE(review): no versions are pinned, so a re-run may install different
# releases than the ones the saved outputs were produced with — consider pinning.
# NOTE(review): the `imblearn` import is, as far as I know, distributed on PyPI
# under the name `imbalanced-learn` — confirm which name should be installed.
!pip install pytorch-tabnet
!pip install imblearn

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [None]:
from imblearn.over_sampling import * 
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize
import numpy as np


def resampling(X, y):
    """Oversample ``(X, y)`` with KMeans-SMOTE.

    The sampler is built with ``random_state=42`` and
    ``cluster_balance_threshold=0.05``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` exactly as returned by ``fit_resample``.

    Notes
    -----
    An earlier, disabled variant also ran ``SVMSMOTE`` and stacked both
    resampled sets; only the KMeans-SMOTE output is returned.
    """
    sampler = KMeansSMOTE(cluster_balance_threshold=0.05, random_state=42)
    features_resampled, labels_resampled = sampler.fit_resample(X, y)
    return features_resampled, labels_resampled


def data_preprocess(X):
    """Rescale ``X`` with scikit-learn's ``normalize`` along ``axis=0``.

    With ``axis=0`` scikit-learn documents that each feature (column) is
    normalised, rather than each sample (row). The scaling is therefore
    derived from whatever rows are present in ``X`` itself.

    Parameters
    ----------
    X : array-like
        Feature matrix.

    Returns
    -------
    The normalised matrix produced by ``normalize``.
    """
    scaled = normalize(X, axis=0)
    return scaled


def cross_validation(X_train, Y_train):
    """Split the training data into five shuffled train/eval folds.

    A ``KFold`` splitter with ``n_splits=5``, ``shuffle=True`` and
    ``random_state=42`` generates the row indices of every fold.

    Parameters
    ----------
    X_train : indexable array
        Feature matrix; indexed with the index arrays yielded by the splitter.
    Y_train : indexable array
        Labels aligned with ``X_train``.

    Returns
    -------
    list
        One entry per fold, each of the form
        ``[(x_train, y_train), (x_eval, y_eval)]``.
    """
    splitter = KFold(n_splits=5, random_state=42, shuffle=True)
    return [
        [
            (X_train[fit_rows], Y_train[fit_rows]),
            (X_train[held_rows], Y_train[held_rows]),
        ]
        for fit_rows, held_rows in splitter.split(X_train)
    ]

use tabnet with pretraining

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score


def train(five_fold_data, X_pretrain):
    """Pretrain a TabNet model, then fit one TabNet classifier per fold.

    A ``TabNetPretrainer`` is fitted on ``X_pretrain`` (the same array is
    also passed as its evaluation set). Afterwards a separate
    ``TabNetClassifier`` is created and fitted for every fold, each one
    initialised from the pretrained model through ``from_unsupervised``.

    Parameters
    ----------
    five_fold_data : iterable
        Folds of the form ``[(x_train, y_train), (x_eval, y_eval)]``.
    X_pretrain : array-like
        Feature matrix used for the unsupervised pretraining step.

    Returns
    -------
    list
        The fitted classifiers, one distinct instance per fold, in fold order.
    """
    unsupervised_model = TabNetPretrainer(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        mask_type='entmax',  # "sparsemax"
    )
    unsupervised_model.fit(
        X_train=X_pretrain,
        eval_set=[X_pretrain],
        pretraining_ratio=0.8,
    )

    model_sets = list()
    for (x_train, y_train), (x_eval, y_eval) in five_fold_data:
        # Build a new classifier for every fold. Re-fitting one shared
        # instance and appending it each time would leave the list holding
        # several references to the same (last-fitted) object.
        clf = TabNetClassifier(
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            scheduler_params={"step_size": 10,  # how to use learning rate scheduler
                              "gamma": 0.9},
            scheduler_fn=torch.optim.lr_scheduler.StepLR,
            mask_type='entmax',  # This will be overwritten if using pretrain model
        )
        clf.fit(
            X_train=x_train, y_train=y_train,
            eval_set=[(x_train, y_train), (x_eval, y_eval)],
            eval_name=['train', 'valid'],
            eval_metric=['auc', 'accuracy'],
            from_unsupervised=unsupervised_model,
        )
        model_sets.append(clf)
    return model_sets


def tabnet_test(model_sets, X_test, y_test, threshold=0.85):
    """Score an ensemble of fitted classifiers on the test set.

    The positive-class probabilities (column 1 of ``predict_proba``) of all
    models are averaged, and a sample is predicted positive when that mean
    is strictly greater than ``threshold``.

    Parameters
    ----------
    model_sets : sequence
        Fitted models exposing ``predict_proba``; any non-zero number of
        models is accepted.
    X_test : array-like
        Test features.
    y_test : array-like
        True test labels.
    threshold : float, optional
        Cut-off applied to the averaged probability (default ``0.85``).

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, recall, pred)`` where ``pred`` is the integer
        array of thresholded predictions.

    Raises
    ------
    ValueError
        If ``model_sets`` is empty.
    """
    models = list(model_sets)
    if not models:
        raise ValueError("model_sets must contain at least one fitted model")

    # Size the buffer from the actual number of models: a fixed row count
    # would either leave all-zero rows that drag the mean down or overflow.
    results = np.zeros((len(models), len(X_test)))
    for i, model in enumerate(models):
        results[i] = model.predict_proba(X_test)[:, 1]

    score = results.mean(axis=0)
    pred = (score > threshold).astype(int)
    recall = recall_score(y_true=y_test, y_pred=pred)
    return accuracy_score(y_pred=pred, y_true=y_test), roc_auc_score(y_test, score), recall, pred

In [None]:
from sklearn.ensemble import AdaBoostClassifier


def tree_decider(X_train, y_train, X_test, y_test):
    """Fit an AdaBoost classifier and evaluate it on the test split.

    The classifier uses 100 estimators and ``random_state=0``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Test features and true labels.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, recall, pred)`` where ``pred`` holds the
        predicted labels for ``X_test``. The ROC AUC is computed from the
        probability of the class in column 1 of ``predict_proba``.
    """
    booster = AdaBoostClassifier(random_state=0, n_estimators=100)
    booster.fit(X_train, y_train)

    hard_labels = booster.predict(X_test)
    positive_proba = booster.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_true=y_test, y_pred=hard_labels)
    area_under_curve = roc_auc_score(y_test, positive_proba, average="samples")
    sensitivity = recall_score(y_test, hard_labels)
    return accuracy, area_under_curve, sensitivity, hard_labels

In [None]:
from sklearn.linear_model import LogisticRegressionCV


def linear_model(train_X, train_y, test_X, test_y):
    """Fit a cross-validated L2 logistic regression and evaluate it.

    The model is a ``LogisticRegressionCV`` with 5 internal folds, the
    ``liblinear`` solver and ``random_state=42``.

    Parameters
    ----------
    train_X, train_y : array-like
        Training features and labels.
    test_X, test_y : array-like
        Test features and true labels.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, recall, pred)`` where ``pred`` holds the
        predicted labels for ``test_X``. The ROC AUC is computed from the
        probability of the class in column 1 of ``predict_proba``.
    """
    # `multi_class="ovr"` is intentionally not passed: the argument is
    # deprecated in recent scikit-learn releases, and with the liblinear
    # solver one-vs-rest is already what the default setting selects.
    clf = LogisticRegressionCV(cv=5, penalty="l2", random_state=42, solver="liblinear")
    clf.fit(train_X, train_y)
    pred = clf.predict(test_X)
    prob = clf.predict_proba(test_X)[:, 1]
    acc = accuracy_score(y_pred=pred, y_true=test_y)
    auc = roc_auc_score(test_y, prob, average="samples")
    recall = recall_score(test_y, pred)
    return acc, auc, recall, pred

ensemble

In [None]:
def ensemble(results_tabnet, results_linear, results_tree, y_test, random_state=None):
    """Combine three prediction vectors by picking one model at random per sample.

    For every sample, the final prediction is drawn uniformly from the three
    models' predictions for that sample.

    Parameters
    ----------
    results_tabnet, results_linear, results_tree : sequence
        Per-sample predictions of the three models; indexed up to
        ``len(results_tabnet)``.
    y_test : array-like
        True labels.
    random_state : int or None, optional
        Seed for the random picks. With ``None`` (the default) NumPy's global
        random state is used, exactly as before; pass an integer to make the
        result reproducible.

    Returns
    -------
    tuple
        ``(accuracy, recall)`` of the combined predictions.
    """
    # A private RandomState keeps seeded runs reproducible without touching
    # NumPy's global generator; the unseeded path is unchanged.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_samples = len(results_tabnet)
    final_results = np.empty(n_samples)
    for i in range(n_samples):
        final_results[i] = rng.choice([results_tabnet[i], results_linear[i], results_tree[i]])

    acc = accuracy_score(y_pred=final_results, y_true=y_test)
    recall = recall_score(y_test, final_results)
    return acc, recall

In [None]:
def full_train():
    """Run the whole experiment: load data, fit all three models, report metrics.

    Steps:

    1. Load the 2019-2020 file as training data and the 2020-2021 file as
       test data.
    2. Fit and evaluate the logistic-regression and AdaBoost models on
       normalised features.
    3. Oversample the raw training data, split it into five folds, train the
       TabNet models (pretrained on the raw training features) and evaluate
       them on the raw test features.

    Metrics of every model are printed.

    Returns
    -------
    tuple
        ``(results_tabnet, results_linear, results_tree, test_y)``: the three
        models' test-set predictions and the true test labels.
    """
    train_X, train_y = load_dataset("player_stats_2019_2020.csv")
    test_X, test_y = load_dataset("player_stats_2020_2021.csv")

    train_X_norm = data_preprocess(train_X)
    # Scale the test features with the *training* column norms.
    # data_preprocess() divides every column by that column's own L2 norm, so
    # calling it separately on the test set would put the test features on a
    # different scale from the one the models were fitted on.
    column_norms = np.linalg.norm(np.asarray(train_X, dtype=float), axis=0)
    column_norms[column_norms == 0] = 1.0  # leave all-zero columns untouched
    test_X_norm = np.asarray(test_X, dtype=float) / column_norms

    acc, auc, recall, results_linear = linear_model(train_X_norm, train_y, test_X_norm, test_y)
    print("linear model: acc is {:.5f}, auc is {:.5f} and recall is {:.5f}".format(acc, auc, recall))
    acc, auc, recall, results_tree = tree_decider(train_X_norm, train_y, test_X_norm, test_y)
    print("tree model: acc is {:.5f}, auc is {:.5f} and recall is {:.5f}".format(acc, auc, recall))

    # The TabNet branch works on the raw (un-normalised) features throughout.
    train_X_re, train_y_re = resampling(train_X, train_y)

    five_fold_data = cross_validation(train_X_re, train_y_re)
    models = train(five_fold_data, train_X)
    acc_score, auc_score, the_recall_score, results_tabnet = tabnet_test(models, test_X, test_y)
    print("accuracy score is {:.5f}, roc auc score is {:.5f} and recall score is {:.5f}".format(acc_score, auc_score, the_recall_score))

    return results_tabnet, results_linear, results_tree, test_y

In [None]:
# Run the complete pipeline once and keep each model's test-set predictions
# together with the true test labels.
results_tabnet, results_linear, results_tree, test_y = full_train()

linear model: acc is 0.97593, auc is 0.97321 and recall is 0.33333
tree model: acc is 0.97593, auc is 0.93517 and recall is 0.33333
Device used : cuda
epoch 0  | loss: 7.01152 | val_0_unsup_loss: 431.34476|  0:00:00s
epoch 1  | loss: 5.52149 | val_0_unsup_loss: 126.33997|  0:00:00s
epoch 2  | loss: 3.8534  | val_0_unsup_loss: 48.93094|  0:00:00s
epoch 3  | loss: 3.3239  | val_0_unsup_loss: 25.75055|  0:00:00s
epoch 4  | loss: 2.89857 | val_0_unsup_loss: 23.25002|  0:00:00s
epoch 5  | loss: 2.76928 | val_0_unsup_loss: 17.08224|  0:00:00s
epoch 6  | loss: 2.85343 | val_0_unsup_loss: 8.42017 |  0:00:00s
epoch 7  | loss: 2.41335 | val_0_unsup_loss: 4.22153 |  0:00:00s
epoch 8  | loss: 2.36818 | val_0_unsup_loss: 2.89264 |  0:00:00s
epoch 9  | loss: 2.18703 | val_0_unsup_loss: 2.54389 |  0:00:00s
epoch 10 | loss: 2.02314 | val_0_unsup_loss: 2.36979 |  0:00:00s
epoch 11 | loss: 1.94401 | val_0_unsup_loss: 2.16618 |  0:00:00s
epoch 12 | loss: 1.82779 | val_0_unsup_loss: 2.06929 |  0:00:00s
ep

In [None]:
# Combine the three models' predictions and report the accuracy and recall of
# the combined prediction against the true test labels.
final_acc, final_recall = ensemble(results_tabnet, results_linear, results_tree, test_y)
print("final accuracy score is {:.5f}, recall score is {:.5f}".format(final_acc, final_recall))

final accuracy score is 0.82778, recall score is 0.80000
