In [104]:
# %load Main_script2.py
"""
Created on Thu May  5 16:17:53 2022

@author: salih
"""

from cmath import sqrt

from matplotlib.pyplot import axis
from sklearn.cluster import KMeans
from elbow import calculateKWithElbow

import numpy as np
import pandas as pd
import scipy.spatial.distance as metric
import math
import sklearn.datasets as datasets
import time
import warnings
warnings.filterwarnings("ignore")

iris = pd.read_csv("ruspini.csv")


def calcSSE(data, centroids):
    sum = 0
    for i in data:
        distance = math.inf
        for k in centroids:
            if euc(i, k) < distance:
                distance = euc(i, k)
        sum += distance ** 2

    return sum / len(data)


def euc(A, B):
    # Call to scipy with vector parameters

    return metric.euclidean(A, B)


def rand_cent(ds, k):
    # Number of columns in dataset
    n = np.shape(ds)[1]

    # The centroids
    centroids = np.mat(np.zeros((k, n)))

    # Create random centroids
    for j in range(n):

        min_j = min(ds[:, j])
        range_j = float(max(ds[:, j]) - min_j)
        centroids[:, j] = min_j + range_j * np.random.rand(k, 1)

    # Return centroids as numpy array
    return centroids


def kmeans(ds, k, cent_method):
    global timer_start
    global timer_end
    global total_timer_end
    return_object = {}
    global cents
    global sse
    global iters
    cents = []

    if cent_method == "random":
        timer_start = time.perf_counter()
        cents = rand_cent(ds, k)
        timer_end = time.perf_counter()
    elif cent_method == "naive":
        timer_start = time.perf_counter()
        cents = naive_sharding(ds, k)
        timer_end = time.perf_counter()
    elif cent_method == "mean":
        timer_start = time.perf_counter()
        cents = mean_sharding(ds, k)
        timer_end = time.perf_counter()
    elif cent_method == "median":
        timer_start = time.perf_counter()
        cents = median_sharding(ds, k)
        timer_end = time.perf_counter()
    elif cent_method == "minmax":
        timer_start = time.perf_counter()
        cents = minmaxsharding(ds, k)
        timer_end = time.perf_counter()
    km = KMeans(n_clusters=k, init=cents).fit(ds)
    total_timer_end = time.perf_counter()
    iters = km.n_iter_
    cents = km.cluster_centers_
    sse = km.inertia_
    return_object['cents'] = cents
    return_object['time'] = timer_end - timer_start
    return_object['total-time'] = total_timer_end - timer_start
    return_object['sse'] = sse / len(ds)
    return_object['type'] = cent_method
    return_object['iter'] = iters
    return return_object


def _get_mean(sums, step):
    return sums/step


def naive_sharding(ds, k):
    n = np.shape(ds)[1]

    m = np.shape(ds)[0]

    centroids = np.mat(np.zeros((k, n)))

    composite = np.sum(ds, axis=1)
    composite = np.reshape(composite, (len(ds), 1))

    ds = np.append(composite, ds, axis=1)

    ds.sort(axis=0)
    step = math.floor(m/k)

    vfunc = np.vectorize(_get_mean)

    for j in range(k):
        if j == k-1:
            centroids[j:] = vfunc(np.sum(ds[j*step:, 1:], axis=0), step)
        else:
            centroids[j:] = vfunc(
                np.sum(ds[j*step:(j+1)*step, 1:], axis=0), step)

    return centroids


def mean_sharding(ds, k):
    n = np.shape(ds)[1]

    m = np.shape(ds)[0]

    centroids = np.mat(np.zeros((k, n)))

    composite = np.mean(ds, axis=1)
    composite = np.reshape(composite, (len(ds), 1))

    ds = np.append(composite, ds, axis=1)

    # ds = ds[ds[:, 0].argsort(kind="mergesort")]
    ds.sort(axis=0)

    step = math.floor(m/k)

    vfunc = np.vectorize(_get_mean)

    for j in range(k):
        if j == k-1:
            centroids[j:] = vfunc(np.sum(ds[j*step:, 1:], axis=0), step)
        else:
            centroids[j:] = vfunc(
                np.sum(ds[j*step:(j+1)*step, 1:], axis=0), step)

    return centroids


def median_sharding(ds, k):
    n = np.shape(ds)[1]

    m = np.shape(ds)[0]

    centroids = np.mat(np.zeros((k, n)))

    composite = np.median(ds, axis=1)
    composite = np.reshape(composite, (len(ds), 1))

    ds = np.append(composite, ds, axis=1)

    # ds = ds[ds[:, 0].argsort()]
    ds.sort(axis=0)

    step = math.floor(m/k)

    vfunc = np.vectorize(_get_mean)

    for j in range(k):
        if j == k-1:
            centroids[j:] = vfunc(np.sum(ds[j*step:, 1:], axis=0), step)
        else:
            centroids[j:] = vfunc(
                np.sum(ds[j*step:(j+1)*step, 1:], axis=0), step)

    return centroids


def minmaxsharding(ds, k):

    n = np.shape(ds)[1]

    centroids = np.mat(np.zeros((k, n)))

    composite = np.sum(ds, axis=1)

    composite = np.reshape(composite, (len(ds), 1))

    ds = np.append(composite, ds, axis=1)
    # print(ds)

    # ds = ds[ds[:, 0].argsort()]
    ds.sort(axis=0)

    # print(ds)
    ds_range = np.max(ds[:, 0])-np.min(ds[:, 0])

    #threshold = math.ceil(ds_range/k)
    threshold=ds_range/k
    prev_arr = split_arr(ds, threshold, 0)

    for j in range(k):
        # print(prev_arr[1])
        centroids[j, :] = np.sum(
            prev_arr[1][:, 1:], axis=0)/np.shape(prev_arr[1])[0]
        # print(centroids)

        prev_arr = split_arr(ds[prev_arr[0]:, :], threshold, prev_arr[0])
        # print("done")

    return centroids


def split_arr(ds, threshold, j):
    if np.size(ds) == 0:
        return None
    min_val = ds[0, 0]

    k = 0
    for i in range(len(ds)):
        if ds[k, 0]-min_val <= threshold:
            # print(k)
            k += 1
        else:
            break

    return [j+k, ds[0:k, :]]


def printResult(datas):
    print("{:<20} {:<20} {:<20} {:<20} {:<20}".format(
        'Type', 'Time', "SSE", "Total Time", "Iter"))
    print('-----------------------------------------------------------------------------------')
    for d in datas:
        print("{:<20} {:<20} {:<20} {:<20} {:<20}".format(
            d['type'], d['time'], d['sse'], d['total-time'], d['iter']))


#df = pd.DataFrame(iris.data) yapmamıza gerek yok çünkü iris = pd.read_csv("ruspini.csv") deki iris zaten df
df = iris.iloc[:, [1,2]]
#df=iris
df = df.to_numpy()
printResult([kmeans(df, 4, 'random'), kmeans(
    df, 4, 'minmax'), kmeans(df, 4, 'median'), kmeans(df, 4, 'mean'), kmeans(df, 4, 'naive')])


Type                 Time                 SSE                  Total Time           Iter                
-----------------------------------------------------------------------------------
random               0.0001140000531449914 700.346521885522     0.002476300112903118 6                   
minmax               0.000305499997921288 661.2112190963343    0.0031609999714419246 4                   
median               0.00043810007628053427 225.4453264234756    0.00295290001668036  4                   
mean                 0.00033460010308772326 225.4453264234756    0.002657800097949803 4                   
naive                0.0003085000207647681 225.4453264234756    0.002179499948397279 4                   


In [105]:
names=['random','minmax','median','mean','naive']
topl=[0]*5
topl2=[0]*5
j=0
for i in range (100):
    a=kmeans(df, 4, 'random')
    topl[j]=topl[j]+a['sse']
    topl2[j]=topl2[j]+a['iter']
    a=kmeans(df, 4, 'minmax')
    topl[j+1]=topl[j+1]+a['sse']
    topl2[j+1]=topl2[j+1]+a['iter']
    a=kmeans(df, 4, 'median')
    topl[j+2]=topl[j+2]+a['sse']
    topl2[j+2]=topl2[j+2]+a['iter']
    a=kmeans(df, 4, 'mean')
    topl[j+3]=topl[j+3]+a['sse']
    topl2[j+3]=topl2[j+3]+a['iter']
    a=kmeans(df, 4, 'naive')
    topl[j+4]=topl[j+4]+a['sse']
    topl2[j+4]=topl2[j+4]+a['iter']    
avrg=[0]*5
avrg2=[0]*5
print("Type \t \t SSE \t \t\t\t iter")
for k in range (5):
    avrg[k]=topl[k]/100
    avrg2[k]=topl2[k]/100
    print(names[k],"\t\t",avrg[k],"\t \t",avrg2[k])

Type 	 	 SSE 	 			 iter
random 		 350.3150344106723 	 	 4.29
minmax 		 661.2112190963325 	 	 4.0
median 		 225.44532642347593 	 	 4.0
mean 		 225.44532642347593 	 	 4.0
naive 		 225.44532642347593 	 	 4.0


In [106]:
df = iris.iloc[:, [1,2]]
#df=iris
df = df.to_numpy()
df=df.astype(float)
num=len(df[0])
for i in range(num):
    m=max(df[:,i])
    print(m)
    df[:,i]=df[:,i]/m

117.0
156.0


In [107]:
printResult([kmeans(df, 4, 'random'), kmeans(
    df, 4, 'minmax'), kmeans(df, 4, 'median'), kmeans(df, 4, 'mean'), kmeans(df, 4, 'naive')])

Type                 Time                 SSE                  Total Time           Iter                
-----------------------------------------------------------------------------------
random               0.000206699944101274 0.012118143593700278 0.0034691999899223447 4                   
minmax               0.000355200027115643 0.03829836015277137  0.0029597999528050423 3                   
median               0.0005777999758720398 0.012118143593700278 0.0032310999231413007 5                   
mean                 0.0003450000658631325 0.012118143593700278 0.0024060000432655215 5                   
naive                0.00030610000248998404 0.012118143593700278 0.002498700050637126 5                   


In [108]:
 a=math.ceil(0.017)
 print(a)

1
