# In [None]:
import random
import math
import json
import sys
import time
import numpy as np
import pandas as pd
import os
import csv


def load(file_name):
    """Read a comma-separated data file into a NumPy array.

    Every non-blank line is expected to contain an integer point index
    followed by the point's feature values, all separated by commas.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A NumPy array with one row per data line, laid out as
        ``[index, feature_1, feature_2, ...]``.

    Raises:
        ValueError: If a field cannot be parsed as a number.
    """
    # data (list of list): [[index, dimensions], [.., ..], ...]
    rows = []
    # The context manager closes the handle even when parsing fails midway.
    with open(file_name) as fh:
        for line in fh:
            fields = line.strip().split(',')
            if fields == ['']:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            rows.append(
                [int(fields[0])] + [float(value) for value in fields[1:]]
            )
    return np.array(rows)

def get_sample(data):
    """Pick a random 1% of the rows of ``data`` without repetition.

    The sample size is ``int(len(data) * 0.01)``, so inputs with fewer than
    100 rows yield an empty array.

    Args:
        data: Indexable collection of rows (``len`` and integer indexing
            are the only operations used).

    Returns:
        A NumPy array built from the selected rows, in the order drawn.
    """
    total = len(data)
    wanted = int(total * 0.01)
    seen = set()
    chosen_rows = []

    # Rejection sampling: keep drawing indices and discard any that were
    # already taken, until enough distinct rows have been collected.
    while len(chosen_rows) < wanted:
        candidate = random.randint(0, total - 1)
        if candidate in seen:
            continue
        seen.add(candidate)
        chosen_rows.append(data[candidate])

    return np.array(chosen_rows)

def initialize_centroids(data, dimension, k):
    """Place ``k`` centroids at random positions inside the data's bounding box.

    For every feature the minimum and maximum over all points are computed,
    and each centroid coordinate is drawn as
    ``min + (max - min) * random.uniform(1e-5, 1)``.

    Args:
        data: 2-D array-like whose column 0 is the point index and whose
            remaining ``dimension`` columns are the features.
        dimension: Number of feature columns.
        k: Number of centroids to create.

    Returns:
        A float NumPy array of shape ``(k, dimension)``.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D array with exactly
            ``dimension`` feature columns.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array")
    # Drop the leading index column; only features define the bounding box.
    features = data[:, 1:]
    if features.shape[1] != dimension:
        raise ValueError(
            "expected %d feature columns, got %d"
            % (dimension, features.shape[1])
        )

    # Per-dimension extremes are taken column-wise (axis=0), so negative
    # data is handled without relying on a 0 / inf starting value.
    min_feature_vals = features.min(axis=0)
    max_feature_vals = features.max(axis=0)
    # diff: max - min for each dimension
    diff = max_feature_vals - min_feature_vals

    centroids = np.empty((k, dimension), dtype=float)
    for j in range(k):
        for i in range(dimension):
            centroids[j][i] = (
                min_feature_vals[i] + diff[i] * random.uniform(1e-5, 1)
            )
    return centroids

def initialize_centroids_simple(data, dimension, k):
    """Initialise centroids by copying ``k`` distinct, randomly chosen points.

    Args:
        data: 2-D array-like whose column 0 is the point index and whose
            remaining ``dimension`` columns are the features.
        dimension: Number of feature columns.
        k: Number of centroids to pick.

    Returns:
        A float NumPy array of shape ``(k, dimension)``:
        ``[(centroid0 features); (centroid1 features); ...]``.

    Raises:
        ValueError: If ``data`` does not have exactly ``dimension`` feature
            columns, or holds fewer than ``k`` points.
    """
    data = np.asarray(data)
    row, column = data.shape
    if column - 1 != dimension:
        raise ValueError(
            "expected %d feature columns, got %d" % (dimension, column - 1)
        )
    if k > row:
        raise ValueError(
            "cannot pick %d centroids from only %d points" % (k, row)
        )

    # A random permutation guarantees the k selected points are distinct.
    randidx = np.random.permutation(row)
    # Copy as float: an integer-typed centroid array would silently
    # truncate fractional feature values.
    return data[randidx[:k], 1:].astype(float)

def get_euclidean_distance(p1, p2):
    """Return the Euclidean distance between ``p1`` and the tail of ``p2``.

    ``p1`` is used as given, while the first element of ``p2`` is dropped
    before subtracting (in this file data points carry their index in
    position 0, centroids do not).

    Args:
        p1: Feature vector supporting element-wise subtraction.
        p2: Sequence whose elements from position 1 onward are features.

    Returns:
        The 2-norm of ``p1 - p2[1:]`` as computed by ``np.linalg.norm``.
    """
    offset = p1 - p2[1:]
    return np.linalg.norm(offset)

def kmeans(data, dimension, k, max_iter=300, tol=1e-5):
    """Cluster ``data`` into ``k`` groups with the k-means algorithm.

    Each iteration assigns every point to its nearest centroid and then
    moves each centroid to the mean of its assigned points. Iteration stops
    when the assignments no longer change, when the sum of squared distances
    ``J`` changes by at most ``tol * J`` between two iterations, or after
    ``max_iter`` iterations.

    Args:
        data: 2-D array-like whose column 0 is the point index and whose
            remaining ``dimension`` columns are the features.
        dimension: Number of feature columns.
        k: Number of clusters.
        max_iter: Upper bound on the number of iterations.
        tol: Relative tolerance on the change of the clustering cost.

    Returns:
        A tuple ``(centroids, cluster_affiliation)`` where ``centroids`` is
        a float NumPy array of shape ``(k, dimension)`` and
        ``cluster_affiliation`` is a list with one ``[tuple(row), cluster]``
        entry per input row, ``cluster`` being the centroid's row number.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D array with
            ``dimension`` feature columns, or ``k`` / ``max_iter`` is
            smaller than 1.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array")
    if k < 1 or max_iter < 1:
        raise ValueError("k and max_iter must both be at least 1")
    # Column 0 is the point index and must not take part in distances.
    features = data[:, 1:]
    if features.shape[1] != dimension:
        raise ValueError(
            "expected %d feature columns, got %d"
            % (dimension, features.shape[1])
        )
    point_count = features.shape[0]

    # centroids: [(centroid0 features); (centroid1 features); ...]
    # initialize_centroids(data, dimension, k) can be swapped in here.
    # Forcing float keeps the mean updates below from being truncated.
    centroids = np.asarray(
        initialize_centroids_simple(data, dimension, k), dtype=float
    )

    labels = None
    prev_cost = None
    for _ in range(max_iter):
        # Assignment step: distance of every point to every centroid.
        distances = np.empty((point_count, k))
        for cluster_index in range(k):
            distances[:, cluster_index] = np.linalg.norm(
                features - centroids[cluster_index], axis=1
            )
        # argmin keeps the first minimum, i.e. ties go to the lower index.
        new_labels = distances.argmin(axis=1)
        nearest = distances[np.arange(point_count), new_labels]
        cost = float(np.sum(nearest ** 2))

        assignments_stable = (
            labels is not None and np.array_equal(new_labels, labels)
        )
        cost_stable = (
            prev_cost is not None and abs(cost - prev_cost) <= tol * cost
        )
        labels = new_labels
        if assignments_stable or cost_stable:
            break
        prev_cost = cost

        # Update step: move each centroid to the mean of its members.
        for cluster_index in range(k):
            members = features[labels == cluster_index]
            # An empty cluster keeps its previous centroid instead of
            # dividing by zero.
            if len(members):
                centroids[cluster_index] = members.mean(axis=0)

    # cluster_affiliation: [[(point index, features...), cluster index], ...]
    cluster_affiliation = [
        [tuple(point), int(label)] for point, label in zip(data, labels)
    ]
    return (centroids, cluster_affiliation)


def main():
    """Cluster a sample of data file 0 and write the results as CSV.

    Loads ``data0.txt`` from the input directory, draws a sample with
    ``get_sample``, runs ``kmeans`` with K = 4, prints the shape of the
    centroid array, and writes the centroids to ``out1.csv`` and the
    cluster affiliations to ``out2.csv`` in the same directory.
    """
    # input path of the real data
    # data file contains point index and 50 features in each line separated by comma
    inputpath = 'F://data'
    K = 4  # K clusters
    output1 = 'out1.csv'
    output2 = 'out2.csv'

    data_num = 0
    data = load(inputpath + '/data' + str(data_num) + '.txt')

    dimension = len(data[0]) - 1

    # sampling data from the data file
    sample_data = get_sample(data)

    centroids, cluster_affiliation = kmeans(sample_data, dimension, K)
    print(centroids.shape)

    # newline='' stops the csv module from emitting blank rows on Windows;
    # the with-blocks close the files even if writing fails.
    with open(inputpath + '/' + output1, 'w', newline='') as f:
        csv.writer(f).writerows(centroids)

    # NOTE(review): each row is [tuple_of_point_values, cluster_index], so
    # the tuple is written as its string form in a single CSV cell --
    # confirm that this is the layout downstream readers expect.
    with open(inputpath + '/' + output2, 'w', newline='') as f:
        csv.writer(f).writerows(cluster_affiliation)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
