In [117]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances
from sklearn.cluster import KMeans
import sklearn.metrics as metrics

In [118]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [119]:
import configparser
import os.path as path

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list here so a wrong working
# directory is reported as such, not as an IndexError on `sections[0]` below.
if not config.read('./metadata.cfg'):
    raise FileNotFoundError("Could not read './metadata.cfg'")

sections = config.sections()
if len(sections) < 2:
    raise ValueError(
        "'./metadata.cfg' must contain at least two sections, found: "
        + repr(sections)
    )

# Sections are picked by their position in the file, not by name: the first
# one supplies the dataset settings (datadir / dataset / normalize are read
# from it later), the second one is kept as the clustering settings.
data = config[sections[0]]
clustering = config[sections[1]]

In [120]:
# Registry of dataset names keyed by a 1-based integer id (1..20); the list
# order below therefore defines the id of each dataset.
datasets = dict(enumerate(
    [
        'dermatology',
        'glass',
        'haberman',
        'ionosphere',
        'iris',
        'parkinsons',
        'pendigit',
        'seeds',
        'vehiclesilhouettes',
        'wine',
        'rice1',
        'rice2',
        'rice3',
        'rice4',
        'ant1.7',
        'zuzel',
        'cm1',
        'kc1',
        'kc2',
        'pc1',
    ],
    start=1,
))

In [121]:
# Read the dataset settings straight from the parsed config instead of going
# through the global `data`: that name is rebound to the loaded DataFrame (and
# later to an ndarray) further down, which made this cell fail on a re-run.
dataset_cfg = config[sections[0]]

datadir = dataset_cfg['datadir']
dataset = int(dataset_cfg['dataset'])

# The config section maps the numeric dataset id (as a string key) to the
# file stem shared by the feature file and its label file.
dataset_stem = dataset_cfg[str(dataset)]
filepath = path.join(datadir, dataset_stem + '.csv')
labelfilepath = path.join(datadir, dataset_stem + '_labels.csv')
normalize = dataset_cfg['normalize']

In [122]:
# Show the resolved feature-file path so the configured dataset can be checked.
print(filepath)

datasets\pc1.csv


In [123]:
# Feature matrix: the CSV is read without a header row and without an index column.
data = pd.read_csv(filepath, header=None, index_col=False)

# Labels: read the same way, then take row 0 of the transposed frame as a
# flat NumPy array.
label_frame = pd.read_csv(labelfilepath, header=None, index_col=False)
data_labels = label_frame.T.iloc[0].to_numpy()

# NumPy snapshot of the values exactly as loaded, taken from a copy of the frame.
actual_data = data.copy().to_numpy()

In [124]:
# 'MMN' (min-max scaling of every column to [0, 1]) is the only normalisation
# mode this cell implements; any other configured value is rejected.
if normalize == 'MMN':
    data = pd.DataFrame(MinMaxScaler().fit_transform(data))
else:
    # Build a single formatted message: passing two positional arguments to the
    # exception made it render as the tuple "('<value>', ' is not correct')".
    raise TypeError(f"normalize={normalize!r} is not correct (expected 'MMN')")

In [125]:
# np.asarray converts the DataFrame to an ndarray and is a no-op when `data`
# already is one, so this cell can be re-run safely (ndarray has no
# `.to_numpy()`, which made a second execution raise AttributeError).
data = np.asarray(data)

In [129]:
# eps is Euler's number when every value in `data` is strictly positive;
# otherwise it is increased by the magnitude of the smallest value.
min_val = np.min(data)
eps = np.e if min_val > 0 else abs(min_val) + (np.e)
   


In [130]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    The difference ``x - y`` is squared element-wise, summed over all
    elements, and the square root of that sum is returned.
    """
    delta = x - y
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)

In [132]:
# Pairwise Euclidean distances between all rows of `data`.
# NOTE(review): this assignment rebinds `euclidean_distances`, hiding the
# function of the same name imported from sklearn.metrics.pairwise at the top
# of the notebook.
euclidean_distances= pairwise_distances(data, metric='euclidean')
print('Eucli')
# Second pairwise matrix, computed with the callable `s_distance` as metric.
# NOTE(review): `s_distance` is not defined in the cells visible here —
# confirm the cell that defines it runs before this one.
s_distances= pairwise_distances(data, metric=s_distance)
print('S')
# Blend both matrices element-wise: sqrt(w1 * E**2 + (1 - w1) * S**2).
# With w1 = 0.5 the two squared distances are weighted equally.
w1= 0.5
distances= np.sqrt(w1*(euclidean_distances**2)+ (1-w1)*(s_distances**2))

Eucli
S


In [134]:
def plot_clusters(dataset, labels, centroids, title):
    """Scatter-plot ``dataset`` in its first two principal components.

    A 2-component PCA is fitted on ``dataset``; the points are coloured by
    ``labels``. When ``centroids`` is non-empty it is projected with the same
    fitted PCA and drawn as a second scatter layer. The current cell output is
    cleared (waiting for the new output) before the figure is shown.
    """
    projector = PCA(n_components=2)
    points_2d = projector.fit_transform(dataset)

    has_centroids = len(centroids) > 0
    centroids_2d = projector.transform(centroids) if has_centroids else None

    clear_output(wait=True)
    plt.title(title)
    xs, ys = points_2d[:, 0], points_2d[:, 1]
    plt.scatter(x=xs, y=ys, c=labels)
    if has_centroids:
        plt.scatter(x=centroids_2d[:, 0], y=centroids_2d[:, 1])
    plt.show()

In [136]:
def density(ind, threshold, distances):
    """Count the samples closer than ``threshold`` to sample ``ind``.

    Returns a ``(count, indices)`` pair where ``indices`` is the list of
    positions ``j`` with ``distances[ind][j] < threshold`` (strict comparison)
    and ``count`` is the length of that list.
    """
    row = distances[ind]
    neighbours = np.where(row < threshold)[0].tolist()
    return len(neighbours), neighbours

In [137]:
def avg_cluster_distance(samples_array, p_i, p, distances):
    """Mean pairwise distance among the samples listed in ``samples_array``.

    Parameters
    ----------
    samples_array : sequence of int
        Row/column indices into ``distances`` of the samples to average over.
    p_i : number
        Number of samples in ``samples_array`` (callers pass the density
        value returned alongside the list).
    p : unused
        Accepted for signature compatibility; not read by this function.
    distances : 2-D ndarray
        Square pairwise distance matrix.

    Returns
    -------
    ``0`` when ``p_i < 2`` (there is no pair to average), otherwise
    ``2 * sum_{a < b} d(a, b) / (p_i * (p_i - 1))``.
    """
    if p_i < 2:
        return 0
    fil_dists = distances[samples_array][:, samples_array]
    # k=1 keeps only the strict upper triangle. The divisor counts the
    # p_i * (p_i - 1) / 2 distinct pairs, so self-distances on the diagonal
    # must not contribute to the sum (they are zero for a true metric, but a
    # custom metric may leave non-zero values there).
    tdis = np.triu(fil_dists, k=1).sum()

    return 2*tdis/(p_i*(p_i-1))

In [140]:
def loop_density_canopy(data, distances, min_density_ratio=0.03, assign_radius_factor=1.5):
    """Pick cluster centres one canopy at a time from a pairwise distance matrix.

    The first centre is the sample with the highest density (number of samples
    closer than the mean distance), ties broken by the smallest average
    intra-canopy distance. Its canopy is removed from the pool and the process
    repeats on the remaining samples, this time choosing the sample with the
    largest ``weight_product`` score. The loop stops when nothing is left, or
    when the best score is 0 / the best density drops below
    ``min_density_ratio * n``; in the latter case each leftover sample is
    attached to its nearest centre if it lies within ``assign_radius_factor``
    times that centre's radius, and is reported as removed otherwise.

    Relies on ``mean_dis``, ``density``, ``avg_cluster_distance``,
    ``cluster_distance`` and ``weight_product`` being defined in the notebook.

    Parameters
    ----------
    data : unused
        Kept for signature compatibility; only ``distances`` is read.
    distances : 2-D ndarray
        Square pairwise distance matrix over all samples.
    min_density_ratio : float, default 0.03
        Fraction of the sample count below which the best remaining density
        ends the search.
    assign_radius_factor : float, default 1.5
        Multiplier on a centre's stored radius used when attaching leftovers.

    Returns
    -------
    cluster_centers : ndarray of int
        Sample indices chosen as centres, in order of selection.
    clusters : list of ndarray
        Sample indices belonging to each centre's canopy.
    removed : ndarray of int
        Sample indices that were attached to no cluster (empty if none).
    """
    # Take the sample count from the matrix itself instead of depending on a
    # notebook-level global `n`.
    n = len(distances)
    remain = np.arange(0, n)

    mdis = mean_dis(distances)
    print("Mean Distance: ", mdis)
    cluster_centers = np.array([], dtype=int)
    clusters = []
    mdists = np.array([])
    # Bound before the loop: it used to be created only inside the `while`
    # body, so the final `return` raised UnboundLocalError whenever the first
    # canopy already absorbed every sample.
    removed = np.array([], dtype=int)

    # ---- First centre: densest sample, ties broken by tightest canopy ----
    p = np.zeros(n)
    samples_arrays = []
    for i in range(n):
        p_i, samples_array = density(i, mdis, distances)
        samples_arrays.append(samples_array)
        p[i] = p_i

    max_ps = np.where(p == max(p))[0]
    a = np.array([
        avg_cluster_distance(samples_arrays[i], p[i], p, distances)
        for i in max_ps
    ])
    ind = max_ps[np.argmin(a)]
    # The canopy of `ind` was already computed in the density pass above.
    cluster_samples = samples_arrays[ind]
    cluster_centers = np.append(cluster_centers, remain[ind])
    clusters.append(remain[cluster_samples])
    remain = np.delete(remain, cluster_samples)
    # NOTE: `mdis` is never recomputed below, so every stored radius is equal.
    mdists = np.append(mdists, mdis)

    print("No.of remaining elements: ", len(remain))
    print('\n')

    min_p = min_density_ratio * n

    # ---- Remaining centres: highest weight product among what is left ----
    while len(remain) > 0:

        # Distance sub-matrix restricted to the still-unassigned samples;
        # positions in `dists` map back to sample ids through `remain`.
        dists = distances[remain][:, remain]

        print("Mean Distance: ", mdis)

        r = len(remain)
        p = np.zeros((r))
        a = np.zeros((r))
        s = np.zeros((r))
        w = np.zeros((r))
        for i in range(r):
            p_i, samples_array = density(i, mdis, dists)
            a_i = avg_cluster_distance(samples_array, p_i, p, dists)
            p[i] = p_i
            a[i] = a_i

        # Needs the complete density vector, hence the second pass.
        for i in range(r):
            s_i = cluster_distance(i, p[i], p, dists)
            s[i] = s_i
            w[i] = weight_product(p[i], a[i], s_i)

        max_w = max(w)

        print("Maximum Density: ", max(p))
        if max_w == 0 or max(p) < min_p:
            print("No.of elements remaining: ", len(remain))
            print(p)
            # No further centre is worth creating: attach each leftover to its
            # nearest existing centre when close enough, else mark it removed.
            for i in remain:
                cen_dists = distances[i, cluster_centers]
                min_cen = np.argmin(cen_dists)
                if cen_dists[min_cen] < assign_radius_factor * mdists[min_cen]:
                    clusters[min_cen] = np.append(clusters[min_cen], i)
                else:
                    removed = np.append(removed, i)
            break

        # Ties on the weight are broken by the smallest average distance.
        max_ws = np.where(w == max_w)[0]
        ind = max_ws[np.argmin(a[max_ws])]
        cluster_centers = np.append(cluster_centers, remain[ind])
        _, cluster_samples = density(ind, mdis, dists)
        clusters.append(remain[cluster_samples])
        remain = np.delete(remain, cluster_samples)
        mdists = np.append(mdists, mdis)

        print("No.of elements remaining: ", len(remain))
        print('\n')

    return cluster_centers, clusters, removed

In [142]:
# Every sample starts with the sentinel label -1; samples listed in `removed`
# keep it because only the retained indices are overwritten below.
init_labels = -1*np.ones(n).astype(np.int64)

# Retained samples get the index of their nearest canopy centre.
incl = np.delete(np.arange(n), removed)
incl_labels = np.argmin(distances[:, cluster_centers][incl], axis=1)
init_labels[incl] = incl_labels

# Initial centroid of each cluster = mean of its members. This used to be
# computed twice (once from `incl_labels`, once from `init_labels`) over the
# same member sets, with the second pass overwriting the first; one pass gives
# the same `init_centroids`.
init_centroids = np.empty((k, m))
for i in range(k):
    init_centroids[i] = np.mean(data[np.where(init_labels == i)[0]], axis=0)

In [143]:
def new_centers(data, labels, k):
    """Return the per-label column means of ``data`` as a 2-D array.

    ``data`` is grouped by ``labels`` and each group is averaged, giving one
    row per distinct label value. ``k`` is accepted but not used.
    """
    return data.groupby(labels).mean().values

def get_labels_with_hybrid_distance(dataset, cluster_centers, wc, we):
    """Assign every point of ``dataset`` to its closest cluster centre.

    Builds a ``len(dataset) x len(cluster_centers)`` matrix of
    ``hybrid_distance(point, center, wc, we)`` values and returns, for each
    point, the column index of the smallest entry in its row.
    """
    n_points = len(dataset)
    n_centers = len(cluster_centers)
    hybrid_distances = np.zeros((n_points, n_centers))

    for row, data_point in enumerate(dataset):
        for col, center in enumerate(cluster_centers):
            hybrid_distances[row, col] = hybrid_distance(data_point, center, wc, we)

    return hybrid_distances.argmin(axis=1)

In [145]:

# Give each sample in `removed` the label of its nearest centre (hybrid
# distance) and insert that label at the sample's original index.
# presumably `labels` holds only the retained samples, in order — verify
# against the cell that produces it.
# np.insert shifts every later element, so indices must be processed in
# ascending order for each label to land on its original position.
for i in sorted(removed):
    cen_dists = np.array([hybrid_distance(data[i], j, w1, 1-w1) for j in centers])
    labels = np.insert(labels, i, np.argmin(cen_dists))
# Taken by index rather than unpacked into `_`, which IPython uses for the
# last cell output.
counts = np.unique(labels, return_counts=True)[1]

print("No.of clusters formed : ",len(counts))

No.of clusters formed :  2
