In [25]:
import numpy as np
import scipy.stats as st

In [2]:
### Constantes ###
DATASET_NAME = 'AIDS'
CENTRALITY_NAME = 'pagerank'

In [3]:
def get_data(dataset_name, percentage, centrality_name):
    filename = f'../results/h_knn/{dataset_name}/prediction_{percentage}_{centrality_name}.npy'
    with open(filename, 'rb') as f:
        lbls_test = np.load(f)
        predictions = np.load(f)
    
    return lbls_test, predictions


In [4]:
def calc_accuracy(lbls_test, predictions):
    correctly_classified = np.sum(lbls_test == predictions)
    accuracy = 100 * (correctly_classified / len(lbls_test))
    
    return round(accuracy, 2)

In [20]:
def get_X(lbls_test, predictions_100, predictions_to_test):
    X_t = []
    for ground_truth, pred_100, pred_to_test in zip(lbls_test, predictions_100, predictions_to_test):
        is_100_correct = pred_100 == ground_truth
        is_pred_to_test_correct = pred_to_test == ground_truth
        val = 1 * (is_100_correct and not is_pred_to_test_correct) + \
            (-1) * (not is_100_correct and is_pred_to_test_correct)
        X_t.append(val)
            
    return np.array(X_t)

def calc_Z_score(lbls_test, predictions_100, predictions_to_test):
    X_t = get_X(lbls_test, predictions_100, predictions_to_test)
    
    mu_x = np.mean(X_t)
    var_x = np.var(X_t)
    
    Z = mu_x / (np.sqrt(var_x / len(lbls_test)))
    
    return Z

In [24]:
lbls_test, predictions_100 = get_data(DATASET_NAME, '100', CENTRALITY_NAME)

percentages = [100, 80, 60, 40, 20]
for percentage in percentages:
    _, predictions = get_data(DATASET_NAME, str(percentage), CENTRALITY_NAME)
    
    accuracy = calc_accuracy(lbls_test, predictions)
    z_score = calc_Z_score(lbls_test, predictions_100, predictions)
    p_value = st.norm.cdf(z_score)
    
    
    print(f'Percentage: {percentage}')
    print(f'Accuracy {accuracy}%')
    print(f'Z score {z_score}')
    print(f'P-value {p_value}')
    print('-------')

Percentage: 100
Accuracy 98.87%
Z score nan
P-value nan
-------
Percentage: 80
Accuracy 99.13%
Z score -1.1552140810056537
P-value 0.12400138493007745
-------
Percentage: 60
Accuracy 98.67%
Z score 0.7747516350666294
P-value 0.7807567833855883
-------
Percentage: 40
Accuracy 99.27%
Z score -1.2799026173366892
P-value 0.10028969351284045
-------
Percentage: 20
Accuracy 95.73%
Z score 5.557274394191939
P-value 0.9999999862990092
-------


  Z = mu_x / (np.sqrt(var_x / len(lbls_test)))


In [None]:
def calc_precision()