In [6]:
import numpy as np
from scipy import stats
from sklearn.metrics import f1_score

# Shared, fixed-seed generator used by the simulation below. Every draw advances
# this one stream, so the simulated numbers depend on the order cells are run in.
rng = np.random.RandomState(seed=1415)

In [7]:
def gen_set(*num_classX):
    """Build a flat array of integer class labels from per-class sample counts.

    Each positional argument is the number of samples for one class; the
    class label is the argument's position (0, 1, 2, ...). Labels are laid
    out in class order, e.g. ``gen_set(2, 3)`` gives ``[0, 0, 1, 1, 1]``.

    Parameters
    ----------
    *num_classX : int
        Sample count for each class, in label order.

    Returns
    -------
    numpy.ndarray
        1-D array holding ``sum(num_classX)`` labels.
    """
    # One constant-valued run per class, concatenated in label order.
    label_runs = [np.full(count, label) for label, count in enumerate(num_classX)]
    return np.hstack(label_runs)

In [8]:
def sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts):
    """Simulate a classifier that guesses labels at random from the training labels.

    Label arrays for both sets are built with ``gen_set``. One random guess
    is drawn per test sample from the training labels and compared against
    a shuffled copy of the test labels.

    All randomness comes from the module-level ``rng``, so repeated calls
    advance the same random stream.

    Parameters
    ----------
    trn_set_class_amounts : sequence of int
        Number of training samples per class, in label order.
    tst_set_class_amounts : sequence of int
        Number of testing samples per class, in label order.

    Returns
    -------
    accuracy : float
        Fraction of test samples whose random guess matched the true label.
    f1 : float
        Macro-averaged F1 score of the random guesses.
    """
    trn_set = gen_set(*trn_set_class_amounts)
    tst_set = gen_set(*tst_set_class_amounts)

    nTrn = trn_set.size
    nTst = tst_set.size

    if nTrn >= nTst:
        # Shuffle every training label and keep the first nTst as the guesses.
        predictions = rng.choice(trn_set, size=nTrn, replace=False)[:nTst]
    else:
        # There are fewer training labels than test samples, so a shuffle
        # cannot supply one guess per test sample. Draw with replacement from
        # the training labels instead so the two arrays have equal length.
        predictions = rng.choice(trn_set, size=nTst, replace=True)
    predicts_on = rng.choice(tst_set, size=nTst, replace=False)

    # f1_score expects (y_true, y_pred): true test labels first, guesses second.
    f1 = f1_score(predicts_on, predictions, average="macro")

    correct_predictions = (predicts_on == predictions).sum()
    accuracy = correct_predictions / nTst
    return accuracy, f1

In [33]:
def analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts):
    """Compute the expected accuracy of random guessing in closed form.

    Each list of class counts is normalised to class proportions, and the
    result is the dot product of the training and testing proportions.

    Parameters
    ----------
    trn_set_class_amounts : sequence of int
        Number of training samples per class, in label order.
    tst_set_class_amounts : sequence of int
        Number of testing samples per class, in the same label order.

    Returns
    -------
    float
        Dot product of the training and testing class proportions.
    """
    trn_counts = np.array(trn_set_class_amounts)
    tst_counts = np.array(tst_set_class_amounts)

    # Normalise raw counts into per-class proportions.
    trn_fractions = trn_counts / trn_counts.sum()
    tst_fractions = tst_counts / tst_counts.sum()

    return np.dot(trn_fractions, tst_fractions)

# Simulating random guess accuracy for the following dataset:

    Training Set:
        * 1000 Class 1
        * 1000 Class 2
    Testing Set:
        * 1000 Class 1
        * 1000 Class 2

In [34]:
# Balanced two-class scenario: 1000 samples per class in both sets.
trn_set_class_amounts = [1000, 1000]
tst_set_class_amounts = [1000, 1000]

# The analytic value uses no random numbers, so computing it first does not
# affect the simulated result.
ana_Acc = analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts)
sim_Acc, sim_F1 = sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts)

print(
    f"Simulated accuracy: {sim_Acc:.4f}",
    f"Simulated F1-score: {sim_F1:.4f}",
    f"Analytic accuracy: {ana_Acc:.4f}",
    sep="\n",
)

Simulated accuracy: 0.5080
Simulated F1-score: 0.5080
Analytic accuracy: 0.5000


# Simulating random guess accuracy for the following dataset:

    Training Set:
        * 1000 Class 1
        * 1000 Class 2
        * 1000 Class 3
    Testing Set:
        * 100 Class 1
        * 100 Class 2
        * 100 Class 3

In [35]:
# Balanced three-class scenario: 1000 training and 100 testing samples per class.
trn_set_class_amounts = [1000, 1000, 1000]
tst_set_class_amounts = [100, 100, 100]

# The analytic value uses no random numbers, so computing it first does not
# affect the simulated result.
ana_Acc = analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts)
sim_Acc, sim_F1 = sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts)

print(
    f"Simulated accuracy: {sim_Acc:.4f}",
    f"Simulated F1-score: {sim_F1:.4f}",
    f"Analytic accuracy: {ana_Acc:.4f}",
    sep="\n",
)

Simulated accuracy: 0.3300
Simulated F1-score: 0.3281
Analytic accuracy: 0.3333


In [38]:
# Imbalanced six-class scenario: the two sets differ only in the first class count.
trn_set_class_amounts = [231, 231, 1231, 312, 123, 12]
tst_set_class_amounts = [100, 231, 1231, 312, 123, 12]

sim_Acc, sim_F1 = sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts)
ana_Acc = analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts)

print(f"Simulated accuracy: {sim_Acc:.4f}")
print(f"Simulated F1-score: {sim_F1:.4f}")
# Four decimals, matching the two lines above and every other scenario cell.
print(f"Analytic accuracy: {ana_Acc:.4f}")

Simulated accuracy: 0.4022
Simulated F1-score: 0.1700
Analytic accuracy: 0.3964493889646125


# Simulated random guess accuracy for our SN dataset **WITH** data augmentation.

In [54]:
# Per-class sample counts (16 classes) for the SN dataset with data augmentation.
trn_set_class_amounts = [
    1058, 1141, 1111, 1068, 1064, 1106, 1089, 1062,
    1120, 1060, 1120, 1062, 1062, 1102, 1060, 1064,
]
tst_set_class_amounts = [
    1056, 185, 131, 4, 34, 32, 112, 18,
    93, 7, 94, 111, 25, 46, 6, 3,
]

# The analytic value uses no random numbers, so computing it first does not
# affect the simulated result.
ana_Acc = analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts)
sim_Acc, sim_F1 = sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts)

print(
    f"Simulated accuracy: {sim_Acc:.4f}",
    f"Simulated F1-score: {sim_F1:.4f}",
    f"Analytic accuracy: {ana_Acc:.4f}",
    sep="\n",
)

Simulated accuracy: 0.0603
Simulated F1-score: 0.0387
Analytic accuracy: 0.0622


# Simulated random guess accuracy for our SN dataset **WITHOUT** data augmentation.

In [56]:
# Per-class sample counts (16 classes) for the SN dataset without data augmentation.
trn_set_class_amounts = [
    1058, 163, 101, 12, 28, 79, 99, 9,
    140, 5, 112, 118, 6, 58, 4, 19,
]
tst_set_class_amounts = [
    1056, 185, 131, 4, 34, 32, 112, 18,
    93, 7, 94, 111, 25, 46, 6, 3,
]

# The analytic value uses no random numbers, so computing it first does not
# affect the simulated result.
ana_Acc = analytic_random_acc(trn_set_class_amounts, tst_set_class_amounts)
sim_Acc, sim_F1 = sim_random_accuracy(trn_set_class_amounts, tst_set_class_amounts)

print(
    f"Simulated accuracy: {sim_Acc:.4f}",
    f"Simulated F1-score: {sim_F1:.4f}",
    f"Analytic accuracy: {ana_Acc:.4f}",
    sep="\n",
)

Simulated accuracy: 0.3163
Simulated F1-score: 0.0621
Analytic accuracy: 0.3087
