###### Loading the Data

In [1]:
import numpy as np
import time

In [2]:
def loaddata(a, b):
    """Load images and labels from two HDF5 files.

    Parameters
    ----------
    a : str
        Path (without the ``.h5`` suffix) of the file holding the ``'data'`` dataset.
    b : str
        Path (without the ``.h5`` suffix) of the file holding the ``'label'`` dataset.

    Returns
    -------
    X : ndarray
        The ``'data'`` dataset flattened to one column per sample (first axis of
        the stored array is treated as the sample axis) and divided by 255.
    Y : ndarray
        The ``'label'`` dataset reshaped to a column of shape ``(n, 1)``.
    """
    import h5py

    with h5py.File(a + '.h5', 'r') as image_file:
        images = np.copy(image_file['data'])
    with h5py.File(b + '.h5', 'r') as label_file:
        labels = np.copy(label_file['label'])

    # One row per sample first, then transpose so that samples become columns.
    per_sample_rows = images.reshape(images.shape[0], -1)
    per_sample_cols = per_sample_rows.T
    Y = labels.reshape(labels.shape[0], 1)
    X = per_sample_cols / 255
    return X, Y

###### Validation with 20% data

In [3]:
def Validation_dataset(X, Y, percent):
    """Randomly split column-wise samples into training and validation parts.

    Parameters
    ----------
    X : ndarray, shape (n_features, n_samples)
        Data matrix with one sample per column.
    Y : ndarray, shape (n_samples,) or (n_samples, 1)
        Labels, one per column of ``X``.
    percent : float
        Percentage of the samples to hold out for validation.

    Returns
    -------
    Train_Data_mat, Train_labels, Val_Data_mat, Val_labels : ndarray
        float64 arrays; data matrices keep one sample per column and labels are
        1-D. Within each part the samples keep their original (ascending) order.

    Notes
    -----
    Draws from NumPy's global random state via ``np.random.permutation``.
    """
    n_samples = X.shape[1]
    n_val = int((percent / 100) * n_samples)

    # Pick the held-out columns at random, then restore ascending order.
    val_idx = np.sort(np.random.permutation(n_samples)[0:n_val])
    is_train = np.ones(n_samples, dtype=bool)
    is_train[val_idx] = False
    train_idx = np.flatnonzero(is_train)

    # Flatten the labels so both (n,) and (n, 1) inputs index to scalars;
    # this avoids assigning size-1 arrays into scalar slots.
    labels = np.ravel(Y)
    data = np.asarray(X)

    Train_Data_mat = data[:, train_idx].astype(float)
    Train_labels = labels[train_idx].astype(float)
    Val_Data_mat = data[:, val_idx].astype(float)
    Val_labels = labels[val_idx].astype(float)

    return Train_Data_mat, Train_labels, Val_Data_mat, Val_labels

###### Data Preprocessing:  Normalisation & SVD for rank reduction

In [4]:
def SVD(X, var_desired):
    """Low-rank approximation of ``X`` that retains a target share of its energy.

    The rank ``k`` is the smallest number of leading singular values whose
    squared sum, divided by the total squared sum, is strictly greater than
    ``var_desired``. If no prefix exceeds the target (e.g. ``var_desired >= 1``)
    all singular values are kept instead of silently falling back to rank 1.

    Parameters
    ----------
    X : ndarray, shape (m, n)
        Matrix to approximate.
    var_desired : float
        Target fraction of the squared singular-value mass to retain.

    Returns
    -------
    X_hat_reconst : ndarray, shape (m, n)
        Rank-``k`` reconstruction of ``X``.
    k : int
        Number of singular values kept.
    comp_ratio : float
        ``(n*k + k + m*k) / (n*m)``: size of the truncated factors relative to
        the size of ``X``.
    """
    U, s, Vt = np.linalg.svd(X, full_matrices=False)

    cum_energy = np.cumsum(np.square(s))
    total_energy = cum_energy[-1]
    if total_energy > 0:
        reached = (cum_energy / total_energy) > var_desired
        # argmax of an all-False mask is 0, so that case is handled explicitly.
        k = int(np.argmax(reached)) + 1 if reached.any() else len(s)
    else:
        # All-zero matrix: every truncation reconstructs it exactly.
        k = 1

    # Scale the kept columns of U by the singular values instead of building a
    # dense diagonal matrix.
    X_hat_reconst = (U[:, 0:k] * s[0:k]).dot(Vt[0:k, :])
    comp_ratio = (X.shape[1] * k + k + X.shape[0] * k) / (X.shape[1] * X.shape[0])
    return X_hat_reconst, k, comp_ratio

###### Logistic Regression Classifier using Gradient Descent 

In [5]:
# Binary logistic regression classifier
def binry_classifier(X, Y, iterations, Eta, lamda):
    """Train a binary linear classifier by batch gradient descent.

    A bias row of ones is prepended to ``X`` and the weights start at zero.
    Each iteration predicts 1 where ``w . x >= 0`` (equivalent to
    ``exp(-w . x) <= 1``) and 0 otherwise, then steps along
    ``X (pred - Y) / n + lamda * w``.

    NOTE(review): the update uses thresholded 0/1 predictions rather than
    sigmoid probabilities; that rule is kept unchanged here.

    Parameters
    ----------
    X : ndarray, shape (n_features, n_samples)
        One sample per column.
    Y : array-like, shape (n_samples,) or (n_samples, 1)
        Binary targets (0 or 1).
    iterations : int
        Maximum number of gradient steps; 0 returns the zero weight vector.
    Eta : float
        Step size.
    lamda : float
        L2 penalty coefficient applied to all weights, including the bias.

    Returns
    -------
    Wk1 : ndarray, shape (n_features + 1,)
        Final weights, bias first.
    labels_pred : ndarray of int, shape (n_samples,)
        0/1 predictions on ``X`` made with the returned weights.
    """
    row_all_one = np.ones((1, X.shape[1]))     # bias feature
    X = np.concatenate((row_all_one, X), 0)
    Y = np.ravel(Y)                            # accept (n,) and (n, 1) targets
    n_samples = X.shape[1]

    Wk1 = np.zeros(X.shape[0])
    print (row_all_one.shape, X.shape, Wk1.shape)

    for _ in range(iterations):
        Wk = Wk1
        # Thresholding the score directly avoids overflow in exp(-score).
        current_pred = (Wk.dot(X) >= 0).astype(int)
        Grad = X.dot(current_pred - Y) / n_samples + lamda * Wk
        Wk1 = Wk - Eta * Grad
        # Converged when the step itself is negligible; comparing only the two
        # norms would also stop on a large step that happens to preserve length.
        diff = np.linalg.norm(Wk1 - Wk)
        if diff < 1e-12:
            print("The value of norm is", diff)
            break

    # Report predictions that match the weights actually returned.
    labels_pred = (Wk1.dot(X) >= 0).astype(int)
    return Wk1, labels_pred

In [6]:
def Confusn_mat(labels_pred, Y):
    """Count the four outcomes of a binary prediction.

    Parameters
    ----------
    labels_pred : sequence
        Predicted labels.
    Y : sequence
        True labels, indexed in step with ``labels_pred``.

    Returns
    -------
    ndarray
        ``[true_pos, false_posit, false_negat, true_neg]``. Any pair that is not
        (1, 1), (1, 0) or (0, 1) is counted as a true negative.
    """
    tp = fp = fn = tn = 0
    for idx, predicted in enumerate(labels_pred):
        if predicted == 1:
            if Y[idx] == 1:
                tp += 1
            elif Y[idx] == 0:
                fp += 1
            else:
                tn += 1
        elif predicted == 0 and Y[idx] == 1:
            fn += 1
        else:
            tn += 1
    return np.asarray([tp, fp, fn, tn])

In [17]:
# Performance Metrics
def Parameters(true_pos, false_posit, false_negat, true_neg):
    """Compute binary classification metrics from confusion-matrix counts.

    The four counts are printed, then each metric is rounded to two decimals.
    A metric whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of raising or yielding NaN.

    Parameters
    ----------
    true_pos, false_posit, false_negat, true_neg : number
        Confusion-matrix counts.

    Returns
    -------
    ndarray, shape (6,)
        ``[Accuracy (percent), Precision, Recall, F_measure, TPR, FPR]``.
    """
    print(true_pos, false_posit, false_negat, true_neg)

    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator != 0 else 0.0

    total = true_pos + true_neg + false_posit + false_negat
    Accuracy = np.round(_ratio(true_pos + true_neg, total) * 100, 2)
    Precision = np.round(_ratio(true_pos, true_pos + false_posit), 2)
    Recall = np.round(_ratio(true_pos, true_pos + false_negat), 2)
    F_measure = np.round(_ratio(2 * true_pos, 2 * true_pos + false_negat + false_posit), 2)
    TPR = Recall  # true-positive rate is the same quantity as recall
    FPR = np.round(_ratio(false_posit, false_posit + true_neg), 2)

    return np.asarray([Accuracy, Precision, Recall, F_measure, TPR, FPR])

In [8]:
# Learning One Vs All implementation
def one_vs_All_train(X, Y, iterations=200, Eta=1e-5, lamda=np.exp(-75)):
    """Train one binary classifier per class (one-vs-all).

    For every distinct value in ``Y`` a 0/1 target vector is built (1 where the
    label equals that value) and passed to ``binry_classifier``. The confusion
    counts of each binary problem on the training data are printed.

    Parameters
    ----------
    X : ndarray, shape (n_features, n_samples)
        One sample per column.
    Y : array-like, shape (n_samples,) or (n_samples, 1)
        Class labels.
    iterations, Eta, lamda : optional
        Forwarded to ``binry_classifier``; the defaults are the values that were
        previously hard-coded.

    Returns
    -------
    opt_W : ndarray, shape (n_classes, n_features + 1)
        Row ``j`` holds the weights for the ``j``-th smallest label value.
    labels_pred : ndarray, shape (n_classes, n_samples)
        Row ``j`` holds the binary predictions returned for that class.
    """
    Y = np.ravel(Y)
    unique = np.unique(Y)
    opt_W = np.zeros((len(unique), X.shape[0] + 1))
    labels_pred = np.zeros((len(unique), len(Y)))
    for j, class_value in enumerate(unique):
        # Compare against the actual label value, not the loop index, so label
        # sets other than 0..K-1 are binarised correctly.
        Y_bin = (Y == class_value).astype(float)
        opt_W[j], labels_pred[j] = binry_classifier(X, Y_bin, iterations, Eta, lamda)
        lst = Confusn_mat(labels_pred[j], Y_bin)
        print("The confmat for class label {} is {}" .format(j,lst))
    return opt_W, labels_pred

###### Training the Classifier One Vs All

In [9]:
# Train on several random 80/20 splits and average the one-vs-all weights.
start_time = time.time()
X, Y = loaddata("images_training", "labels_training")
variance = 0.96
sample_times = 3
unique, counts = np.unique(Y, return_counts=True)
opt_W_Avg = np.zeros((len(unique), X.shape[0] + 1))
for i in range(sample_times):
    X_post_Val_data, Y_post_Val_labels, Val_Data, Val_labels = Validation_dataset(X, Y, 20)
    # Only the training part is replaced by its low-rank reconstruction;
    # the validation part is used as drawn.
    X_post_Val_data, n_components, comp_ratio = SVD(X_post_Val_data, variance)

    opt_W, labels_pred = one_vs_All_train(X_post_Val_data, Y_post_Val_labels)
    opt_W_Avg += opt_W
opt_W_Avg /= sample_times
elapsed_minutes = int((time.time() - start_time) / 60)
print("Running time: " + str(elapsed_minutes) + " minutes")

(1, 24000) (785, 24000) (785,)
The confmat for class label 0 is [ 1652   308   741 21299]
(1, 24000) (785, 24000) (785,)
The confmat for class label 1 is [ 2270   149   111 21470]
(1, 24000) (785, 24000) (785,)
The confmat for class label 2 is [ 1808  1183   590 20419]
(1, 24000) (785, 24000) (785,)
The confmat for class label 3 is [ 1450    97   945 21508]
(1, 24000) (785, 24000) (785,)
The confmat for class label 4 is [    8     1  2401 21590]
(1, 24000) (785, 24000) (785,)
The confmat for class label 5 is [ 2228   325   203 21244]
(1, 24000) (785, 24000) (785,)
The confmat for class label 6 is [    0     0  2362 21638]
(1, 24000) (785, 24000) (785,)
The confmat for class label 7 is [ 1693   103   664 21540]
(1, 24000) (785, 24000) (785,)
The confmat for class label 8 is [ 2180   211   214 21395]
(1, 24000) (785, 24000) (785,)
The confmat for class label 9 is [ 2229   242   251 21278]
(1, 24000) (785, 24000) (785,)
The confmat for class label 0 is [ 1822   433   587 21158]
(1, 24000)

###### Testing Procedure

In [18]:
# Evaluate the averaged weights on the training part of the last split
pred_labels = testing_All(X_post_Val_data, opt_W_Avg)
# Count positions where the true label equals the predicted class index.
acc = sum(
    1
    for i in range(len(Y_post_Val_labels))
    if Y_post_Val_labels[i] == pred_labels[i]
)
print("The accuracy is {} %".format((acc/(len(Y_post_Val_labels)))*100))
Conf_mat4All_train, Param_mat4All_train = FindConfMat(pred_labels, Y_post_Val_labels)
#print (Conf_mat4All_train, np.round(Param_mat4All_train,2))

THe accuracy is 77.53750000000001 %
1827.0 499.0 592.0 21082.0
2195.0 127.0 141.0 21537.0
1610.0 669.0 800.0 20921.0
2227.0 1347.0 181.0 20245.0
1970.0 1459.0 435.0 20136.0
2113.0 103.0 338.0 21446.0
20.0 0.0 2389.0 21591.0
2168.0 431.0 154.0 21247.0
2279.0 594.0 119.0 21008.0
2200.0 162.0 242.0 21396.0


In [19]:
# Evaluate the averaged weights on the validation part of the last split
pred_labels = testing_All(Val_Data, opt_W_Avg)
# Count positions where the true label equals the predicted class index.
acc = sum(
    1
    for i in range(len(Val_labels))
    if Val_labels[i] == pred_labels[i]
)
print("The accuracy is {} %".format((acc/(len(Val_labels)))*100))
Conf_mat4All_valid, Param_mat4All_valid = FindConfMat(pred_labels, Val_labels)
#print (np.round(Conf_mat4All_valid,2), np.round(Param_mat4All_valid,2))

THe accuracy is 78.53333333333333 %
449.0 120.0 143.0 5288.0
580.0 28.0 40.0 5352.0
415.0 159.0 195.0 5231.0
551.0 328.0 43.0 5078.0
509.0 347.0 115.0 5029.0
500.0 29.0 77.0 5394.0
7.0 0.0 551.0 5442.0
542.0 101.0 31.0 5326.0
579.0 135.0 25.0 5261.0
580.0 41.0 68.0 5311.0


In [20]:
# Load the test set and replace it by its low-rank reconstruction
X_test, Y_test = loaddata("images_testing", "labels_testing_2000")
X_test, n_components_Test, comp_ratio_Test = SVD(X_test, variance)

# Evaluate the averaged weights on the test data
pred_labels = testing_All(X_test, opt_W_Avg)
# Element-wise comparison: Y_test is a column vector, so each Y_test[i] is a
# one-element array compared against a single predicted class index.
acc = sum(
    1
    for i in range(len(Y_test))
    if Y_test[i] == pred_labels[i]
)
print("THe accuracy for test is {} %".format((acc/(len(Y_test)))*100))
Conf_mat4All_test, Param_mat4All_test = FindConfMat(pred_labels, Y_test)
#print (np.round(Conf_mat4All_test,2), np.round(Param_mat4All_test,2))

THe accuracy for test is 76.95 %
133.0 41.0 45.0 1781.0
181.0 10.0 10.0 1799.0
134.0 49.0 76.0 1741.0
179.0 115.0 12.0 1694.0
166.0 135.0 46.0 1653.0
174.0 6.0 40.0 1780.0
3.0 1.0 197.0 1799.0
188.0 30.0 10.0 1772.0
207.0 49.0 12.0 1732.0
174.0 25.0 13.0 1788.0


In [10]:
#Testing One Vs All
def one_vs_All_test(X_test, Y_test, opt_W):
    """Evaluate each one-vs-all binary classifier separately.

    For class index ``j`` the targets are 1 where ``Y_test == j`` and the
    classifier ``opt_W[j]`` is applied with ``testing``. Confusion counts and
    the derived metrics are collected per class, and the metrics are printed.

    NOTE(review): like the original code this assumes class labels are the
    integers 0..K-1, so that label ``j`` is scored by row ``j`` of ``opt_W``.

    Parameters
    ----------
    X_test : ndarray, shape (n_features, n_samples)
        One sample per column.
    Y_test : array-like, shape (n_samples,) or (n_samples, 1)
        True class labels.
    opt_W : ndarray, shape (n_classes, n_features + 1)
        One weight vector per class, bias first.

    Returns
    -------
    labels_pred : ndarray, shape (n_classes_in_Y_test, n_samples)
        Binary predictions of each classifier.
    Conf_mat4All : ndarray, shape (n_classes_in_Y_test, 4)
        Rows as returned by ``Confusn_mat``.
    Param_mat4All : ndarray, shape (n_classes_in_Y_test, 6)
        Rows as returned by ``Parameters``.
    """
    Y_test = np.ravel(Y_test)
    unique = np.unique(Y_test)
    # Local result buffer; the misspelled name previously left the function
    # writing into (and returning) whatever global `labels_pred` existed.
    labels_pred = np.zeros((len(unique), len(Y_test)))
    Conf_mat4All = np.zeros((len(unique), 4))
    Param_mat4All = np.zeros((len(unique), 6))
    for j in range(len(unique)):
        Y_bin = (Y_test == j).astype(float)
        labels_pred[j] = testing(X_test, Y_bin, opt_W[j])
        Conf_mat4All[j] = Confusn_mat(labels_pred[j], Y_bin)
        Param_mat4All[j] = Parameters(Conf_mat4All[j][0], Conf_mat4All[j][1], Conf_mat4All[j][2], Conf_mat4All[j][3])
        print (Param_mat4All[j] )
        print()
    return labels_pred, Conf_mat4All, Param_mat4All

In [11]:
def testing(X, Y, W):
    row_all_one=np.ones((1,len(X.T)))
    X=np.concatenate((row_all_one, X), 0)
    pred_label=[]
    for i in range(len(Y)):
        exp_part=np.dot((X[:,i]), W)
        #print(W)
        exp_part=-1*exp_part
        exp_part=np.exp(exp_part)
        if(exp_part<=1):
            pred_label.append(1)
        else:
            pred_label.append(0)
       
    return pred_label

In [12]:
def testing_All(X, W):
    row_all_one=np.ones((1,len(X.T)))
    X=np.concatenate((row_all_one, X), 0)
    probabilities=np.dot(W, X)
    pred_labels=np.argmax(probabilities, axis=0)
    return pred_labels

In [13]:
def FindConfMat(pred_labels, Y_test):
    """Per-class confusion counts and metrics for multi-class predictions.

    For every distinct value in ``Y_test`` both the predictions and the true
    labels are binarised (1 where equal to that value) and passed to
    ``Confusn_mat`` and ``Parameters``.

    Parameters
    ----------
    pred_labels : array-like, shape (n_samples,)
        Predicted class labels.
    Y_test : array-like, shape (n_samples,) or (n_samples, 1)
        True class labels.

    Returns
    -------
    Conf_mat4All : ndarray, shape (n_classes, 4)
        Row ``j`` is the ``Confusn_mat`` result for the ``j``-th smallest label
        present in ``Y_test``.
    Param_mat4All : ndarray, shape (n_classes, 6)
        Row ``j`` is the matching ``Parameters`` result.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    predicted = np.ravel(pred_labels)
    actual = np.ravel(Y_test)
    if len(predicted) != len(actual):
        raise ValueError(
            "pred_labels has {} entries but Y_test has {}".format(len(predicted), len(actual))
        )

    unique = np.unique(actual)
    Conf_mat4All = np.zeros((len(unique), 4))
    Param_mat4All = np.zeros((len(unique), 6))
    for j, class_value in enumerate(unique):
        # Binarise against the label value itself rather than the loop index,
        # so a class missing from Y_test does not shift the remaining rows.
        pred_labels_bin = (predicted == class_value).astype(int)
        Y_test_bin = (actual == class_value).astype(int)
        Conf_mat4All[j] = Confusn_mat(pred_labels_bin, Y_test_bin)
        Param_mat4All[j] = Parameters(Conf_mat4All[j][0], Conf_mat4All[j][1], Conf_mat4All[j][2], Conf_mat4All[j][3])
    return Conf_mat4All, Param_mat4All