In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import scipy.io as sio
from sklearn import metrics

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [2]:
def Norm( list_x ):
    n = len(list_x)
    mx = 0
    for i in range(n):
        mx = max( mx, list_x[i].shape[0] )
    
    for i in range(n):
        dif = abs( mx - list_x[i].shape[0] )
        list_x[i] = np.r_[ list_x[i], np.zeros( ( dif,list_x[i].shape[1] ) ) ]

    return list_x

In [3]:
def GetFea( _mat ):
    
    _mat_fea = []
    n = len(_mat)
    sift = cv2.xfeatures2d.SIFT_create(200)
    for i in range(n):
        img_rgb = _mat[i].reshape(3,32,32)
        r = img_rgb[0].reshape(1024,1)
        g = img_rgb[1].reshape(1024,1)
        b = img_rgb[2].reshape(1024,1)
        img_rgb = np.hstack( (r,g,b) ).reshape(32,32,3)
        img_gray = cv2.cvtColor( img_rgb, cv2.COLOR_RGB2GRAY )
        kps, des = sift.detectAndCompute( img_gray, None )
        if des is None:
            des = np.zeros( (14,128) )
        _mat_fea.append( des )
    _mat_fea = Norm( _mat_fea )
    #print("The type of _mat_fea:", type(_mat_fea))
    return _mat_fea

In [4]:
#load the dataset from the local url
def loaddata( dataurl, __gets, __result ):
    
    initdata = sio.loadmat( dataurl )
    keys = list( initdata.keys() )
    values = list( initdata.values() )
    
    dataset = initdata[__gets]
    dataset_fea = GetFea( dataset )
    
    label = initdata[__result]
    set_k = np.max( label )
    
    return initdata, dataset, set_k, label, dataset_fea

In [5]:
#calculate the dist btw matrix_A and matrix_B
def calcdist( Mat_A, Mat_B, i, j ):
    
    #print(i,j)
    na, ma = Mat_A.shape
    nb, mb = Mat_B.shape
    mx = max( na, nb )
    difa = abs( na - mx )
    difb = abs( nb - mx )
    now_A = np.r_[ Mat_A, np.zeros((difa,ma)) ]
    now_B = np.r_[ Mat_B, np.zeros((difb,mb)) ]
    
    dis = np.sqrt( np.sum( np.power( now_A-now_B, 2 ) ) )
    return dis

In [6]:
def randcents( dataset, k, label ):
    _M = []
    initcent = []
    cents_fea = []
    n = dataset.shape[0]
    for i in range(n):
        nw = label[i]
        flg = 0
        for j in range(len(_M)):
            if _M[j] == nw:
                flg = 1
        if flg == 0:
            initcent.append(dataset[i])
            _M.append(nw)
    cents_fea = GetFea( initcent )
    return cents_fea, initcent

In [7]:
def K_Means( dataset, k, result_label, mx_it_tme = 15 ):
    
    cents_fea, cents = randcents( dataset, k, result_label )
    #print("The typed of cents_fea:", type(cents_fea))
    init_cents_fea = cents_fea.copy()
    dataset_fea = GetFea( dataset )
    print("The len of dataset_fea", len(dataset_fea))
    '''
    print("The init cents")
    print(type(cents))
    for i in cents:
        print(i)
    print("End of check")
    '''
    
    rows, cols = dataset.shape
    modify = True
    #iter time
    it_count = 0
    #the init result [0...][0...][...]..[]
    result_mat = np.mat( np.zeros((rows,2)) )
    result_mat = np.full( result_mat.shape, -1 )
    
    while modify and it_count < mx_it_tme:
        it_count += 1
        modify = False
        
        for i in range(rows):
            
            mndis = np.inf
            mnind = 0
            
            for j in range(k):
                dis = calcdist( dataset_fea[i], cents_fea[j], i, j )
                if dis < mndis:
                    mnind = j
                    mndis = dis
            
            if result_mat[i,0] != mnind:
                modify = True
            result_mat[i, :] = mnind, mndis**2
            
        for i in range(k):
            sm = np.full( dataset_fea[0].shape, 0 )
            flg = 0
            for j in range(rows):
                if result_mat[j,0] == i:
                    sm = sm + dataset_fea[j]
                    flg += 1
            if flg != 0:
                cents_fea[i] = sm / flg;
    return cents, init_cents_fea, cents_fea, result_mat

In [8]:
def main( filename, __gets, __result ):
    
    initdata, dataset, set_k, label, dataset_fea = loaddata( filename, __gets, __result )
    
    rows, cols = dataset.shape
    print(rows)
    cents, init_cents_fea, cents_fea, result_mat = K_Means( dataset, set_k, label )
    
    #sbhow the result of i_th cluster
    #draw the total clusters

    clusters = []
    '''
    clusters.append(-1)
    for i in range(rows):
        bel = result_mat[i,0]
        flg = 1
        for j in clusters:
            if bel == j:
                flg = 0
        if flg:
            clusters.append(bel)
            img = dataset[i].reshape(32,32).T
            plt.subplot( 4, 5, len(clusters)-1 )
            plt.imshow(img)
            plt.axis('off')
    plt.show()
    print(len(clusters)-1)
    for i in clusters:
        print(i, end=" ")
    print()
    '''
    return cents, result_mat, clusters, dataset_fea, cents_fea

In [9]:
#result test
dataURL = ['data/COIL20.mat', 'data/Yale_32x32.mat', 'data/data_batch_1.mat']
_gets = [ 'fea', 'data' ]
_result = [ 'gnd', 'labels' ]
initdata, dataset, set_k, label, dataset_fea = loaddata( dataURL[2], _gets[1], _result[1] )
cents, result_mat, clusters, dataset_fea, cents_fea = main( dataURL[2], _gets[1], _result[1] )

2000
The len of dataset_fea 2000


In [10]:
def brute_force_evalue( label_true, label_pred ):
    n = len(label_true)
    print(n)
    ct = 0
    for i in range(n):
        if label_true[i] == label_pred[i]+1:
            ct += 1
    print(ct)
    ratio = ct / n
    return ratio

In [12]:
label_true = label
label_pred = []
n = len(result_mat)
for i in range(n):
    label_pred.append( result_mat[i,0] )
ratio = brute_force_evalue( label_true,label_pred )
print(ratio)

2000
212
0.106


In [81]:
def E_value_RI( init_labels, lst_labels ):
    TP = TN = FP = FN = 0
    n = len(init_labels)
    for i in range(n):
        for j in range(i+1,n):
            if init_labels[i]==init_labels[j] and lst_labels[i,0]==lst_labels[j,0]:
                TP += 1
            if init_labels[i]!=init_labels[j] and lst_labels[i,0]!=lst_labels[j,0]:
                TN += 1
            if init_labels[i]!=init_labels[j] and lst_labels[i,0]==lst_labels[j,0]:
                FP += 1
            if init_labels[i]==init_labels[j] and lst_labels[i,0]!=lst_labels[j,0]:
                FN += 1
        
    ratio = ( TP + TN ) / ( TP + TN + FP + FN )
    return ratio

In [82]:
#RI_E_value
precent = E_value_RI( label, result_mat )
print(precent)

0.7906438219109555


In [83]:
#metric ecalue
label_true = label.ravel()
label_pred = []
for i in range(len(result_mat)):
    label_pred.append( result_mat[i,0] )
ratio = metrics.adjusted_rand_score( label_true, label_pred )
print(ratio)

0.02535855873385356
