In [1]:
import sys
import time

import math
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

Random_state = 3006

In [2]:
iris_data = load_iris()
ionosphere_data = np.genfromtxt("ionosphere.txt", delimiter=',', names=True, dtype=None)

In [3]:
iris_data

iris_X = iris_data['data']
iris_y = iris_data['target']

In [4]:
ionosphere_data
list_data = [list(x) for x in ionosphere_data]
iono_X = []
iono_y = []
for i,d in enumerate(list_data):
    iono_X.append([x for j,x in enumerate(d) if j!=len(d)-1])
    iono_y.append([x for j,x in enumerate(d) if j==len(d)-1])

iono_X = np.array(iono_X)
iono_y = np.hstack(iono_y)
iono_y = np.where(iono_y == 1, 1, 0) #converting to 0 and 1 for simplicity

In [5]:
def sort_it(list_to_sort):    
    try:
        for i in range(len(list_to_sort)):
            for j in range(len(list_to_sort) - 1):
                if list_to_sort[j] > list_to_sort[j+1]:
                    list_to_sort[j], list_to_sort[j + 1] = list_to_sort[j + 1], list_to_sort[j]
                
        return list_to_sort
    except:
        print(sys.exc_info())

def run_prediction(X, y, test_size=0.3, train_size=0.7, random_state=Random_state, k=1):
    
    if(test_size+train_size > 1 or test_size+train_size < 0):
        print("The sum of test_size and train_size should be between 0 and 1")
        return
    
    prediction = {"predicted_list" : [], "accuracy" : 0}
    
    try:
        # splitting test train data
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, train_size=train_size, random_state=Random_state)

        predicted_list = []
    
        # training on X_train, y_train by calculating Euclidean distances
        for d in range(len(X_test)):
            eucl_distances = []
            for i in range(len(X_train)):
                #eucl_distances.append([np.sqrt(np.sum(np.power(X_test[d,:] - X_train[i,:], 2))), i])
                eucl_distances.append([np.linalg.norm(X_test[d,:] - X_train[i,:]), i]) #used linalg for faster calculation time
                
            eucl_distances = sort_it(eucl_distances)
            #print(eucl_distances)
    
            targets = [y_train[eucl_distances[i][1]] for i in range(k)]
            #print(targets)
        
            predicted_list.append(max(targets, key=targets.count))

        #print(predicted_list)
        prediction['predicted_list'] = predicted_list
        prediction['accuracy'] = np.mean(prediction['predicted_list'] == y_test)
        return prediction
    
    except:
        print(sys.exc_info())

In [6]:
try :
    nn = run_prediction(iris_X, iris_y, test_size=0.3,train_size=0.7, random_state=Random_state, k=1)
    print(nn)
except:
    print(sys.exc_info())

{'predicted_list': [0, 1, 2, 0, 0, 1, 2, 1, 2, 0, 1, 2, 2, 1, 2, 0, 0, 2, 0, 2, 1, 1, 2, 1, 1, 0, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0], 'accuracy': 0.9555555555555556}


In [21]:
conformal(iris_X, iris_y, "iris dataset", test_size=0.3,train_size=0.7, random_state=107)


For iris dataset, The average false p-value : 0.028616352201257873 
The accuracy of prediction : 1.0 
The test error rate is : 0.0


In [22]:
conformal(iono_X, iono_y, "ionosphere dataset", test_size=0.3,train_size=0.7, random_state=107)

For ionosphere dataset, The average false p-value : 0.030845481049562678 
The accuracy of prediction : 0.9142857142857143 
The test error rate is : 0.08571428571428574


In [24]:
# function definition along with parameters
def conformity_score_Common(X_train,Y_train,X_test,Y_test,i,j,labels):
    
    import math
    # concatenating (X,y) of training set
    x_train_concat=np.concatenate((X_train,y_train[:,None]),axis=1)
    # concatenating (X,y) of one test set per function
    x_test_add=np.concatenate((X_test[i:i+1,],labels[j:j+1,None]),axis=1)
    # concatenating (X,y) of all training set and one test set
    x_train_test_concat=np.concatenate((x_train_concat,x_test_add))
    # concatenating y of training set and one test set
    y_concat=np.concatenate((y_train,labels[j:j+1]))
    # assign size of CF_score array
    CF_score=np.zeros(x_train_test_concat.shape[0])
    # iterate through each (x,y) of set and find distance
    for k in range(x_train_test_concat.shape[0]):
        arr_same=[]
        arr_diff=[]
        for l in range(x_train_test_concat.shape[0]):
            if k!=l:
                # find distance between sample of same class
                if x_train_test_concat[k,-1]==y_concat[l]:
                    arr_same.append(la.norm(x_train_test_concat[k,:-1]-x_train_test_concat[l,:-1]))
                # find distance between sample of different class
                else:
                    arr_diff.append(la.norm(x_train_test_concat[k,:-1]-x_train_test_concat[l,:-1]))
    # exception handling for ZeroDivisionError
    # using formula -(distance between nearest sample of same class)/(distance between nearest sample of diff class)
        try:
            CF_score[k]=(min(arr_diff)/min(arr_same)) 
        except ZeroDivisionError:
            CF_score[k]=np.inf
    return CF_score

In [25]:
# calculating p value of each CF_score array
def p_value(CF_score):
    count=0
    for m in range(CF_score.shape[0]):
    
        if CF_score[m]<=CF_score[-1]:
            count+=1
    p_value=(count/CF_score.shape[0])
    return p_value
import numpy as np
p_value(np.array([1,2,3,6,5]))

0.8

In [29]:
# calculation of p_value of each test sample for every label
def p_value_array_for_each(X_train,X_test,y_train,y_test,labels):
    
    labels_array=np.zeros((X_test.shape[0],labels.shape[0]))
    for ii in range(X_test.shape[0]):
        for jj in range(labels.shape[0]):
            labels_array[ii,jj]=p_value(conformity_score_Common(X_train,y_train,X_test,y_test,ii,jj,labels))
    return labels_array 

p_value_array_for_each(X_train,X_test,y_train,y_test,labels)

NameError: name 'X_train' is not defined

In [27]:

# calculation of average false p_value
def average_false_pvalue_CF(X_train,X_test,y_train,y_test):
    
    labels=np.unique(y_train)
    p_value_array_FP=p_value_array_for_each(X_train,X_test,y_train,y_test,labels)
    sum=0
    for ii in range(p_value_array_FP.shape[0]):
        for jj in range(labels.shape[0]):
            # summation of false_pvalues 
            if labels[jj]!=y_test[ii]:
                sum+=p_value_array_FP[ii,jj]
    # average of false_p_value
    return (sum/(X_test.shape[0]*(labels.shape[0]-1)))