In [1]:
import numpy as np

Load data

In [2]:
import pandas as pd
df_test = pd.read_csv('data/test1.csv', sep=',')
df_train = pd.read_csv('data/train1.csv', sep=',')

Binarize data

In [3]:
import copy

def dummy_encode_categorical_columns(data):
    result_data = copy.deepcopy(data)
    for column in data.columns.values:
        result_data = pd.concat(
            [result_data, pd.get_dummies(result_data[column], prefix = column, prefix_sep = ': ')],
            axis = 1
        )
        del result_data[column]
    return result_data

In [4]:
def parse_file(name):
    df = pd.read_csv(name, sep=',')
    df = df.replace(to_replace='positive', value=1)
    df = df.replace(to_replace='negative', value=0)
    y = np.array(df['V10'])
    del df['V10']
    bin_df = dummy_encode_categorical_columns(df)
    return np.array(bin_df).astype(int), y

In [5]:
X_train, y_train = parse_file('data/train1.csv')
X_test, y_test = parse_file('data/test1.csv')

Split to train and test

In [6]:
X_train_pos = X_train[y_train == 1]
X_train_neg = X_train[y_train == 0]

Make predictions

In [7]:
y_pred = []
for test_obj in X_test:
    pos = np.sum(test_obj == X_train_pos) / float(len(X_train_pos))
    neg = np.sum(test_obj == X_train_neg) / float(len(X_train_neg))
    if (pos > neg):
        y_pred.append(1)
    else:
        y_pred.append(0)

y_pred = np.array(y_pred)

Evaluate predictions

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [9]:
TP = np.sum(y_test * y_pred)
TN = np.sum(y_test + y_pred == 0)
FP = np.sum((y_test  == 0) * (y_pred == 1))
FN = np.sum((y_test  == 1) * (y_pred == 0))
TPR = float(TP) / np.sum(y_test == 1)
TNR = float(TN) / np.sum(y_test == 0)
FPR = float(FP) / (TP + FN)
NPV = float(TN) / (TN + FN)
FDR = float(FP) / (TP + FP)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

In [10]:
measure_names = [
    'True Positive', 'True Negative', 'False Positive', 'False Negative',
    'True Positive Rate', 'True Negative Rate', 'Negative Predictive Value', 'False Positive Rate', 'False Discovery Rate',
    'Accuracy', 'Precision', 'Recall'
]
measure_vals = [TP, TN, FP, FN, TPR, TNR, NPV, FPR, FDR, acc, prec, rec]

for name, v in zip(measure_names, measure_vals):
    v_verb = f"{v:.0f}" if int(v)==v else f"{v:.4f}"
    print(f"{name}: {v_verb}")

True Positive: 34
True Negative: 24
False Positive: 8
False Negative: 27
True Positive Rate: 0.5574
True Negative Rate: 0.7500
Negative Predictive Value: 0.4706
False Positive Rate: 0.1311
False Discovery Rate: 0.1905
Accuracy: 0.6237
Precision: 0.8095
Recall: 0.5574
