In [6]:
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
# Metric name that kmeans forwards to scipy's cdist when measuring
# point-to-centroid distances.
DISTANCE_TYPE = 'euclidean'

In [7]:
def kmeans(dataset, k, no_of_iterations, distance_type=None, random_state=None):
    """Cluster the rows of ``dataset`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    dataset : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Numeric observations, one per row.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    no_of_iterations : int
        Maximum number of centroid-update passes. The loop stops earlier once
        an update leaves every label unchanged, because further passes could
        not change the result.
    distance_type : str, optional
        Metric name forwarded to ``scipy.spatial.distance.cdist``. When
        omitted, the module-level ``DISTANCE_TYPE`` is used.
    random_state : int, optional
        Seed for drawing the initial centroids. When omitted, NumPy's global
        random state is used, so ``np.random.seed`` still controls the draw.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        For every row, the index (``0 .. k-1``) of its nearest centroid.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    if distance_type is None:
        distance_type = DISTANCE_TYPE

    # Work on a plain float matrix so DataFrames and arrays behave the same.
    data = np.asarray(dataset, dtype=float)
    n_samples = len(data)
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of rows (%d), got %r" % (n_samples, k)
        )

    # Pick k distinct rows as the starting centroids.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    seed_rows = rng.choice(n_samples, k, replace=False)
    centroids = data[seed_rows]

    # distances[i, j] is the distance from row i to centroid j; each row is
    # labelled with the column holding its smallest distance.
    labels = np.argmin(cdist(data, centroids, distance_type), axis=1)

    for _ in range(no_of_iterations):
        updated = centroids.copy()
        for cluster in range(k):
            members = data[labels == cluster]
            # An empty cluster has no mean: averaging it would yield NaN, and
            # argmin would then send every point to that NaN centroid. Keep
            # the previous centroid instead.
            if len(members):
                updated[cluster] = members.mean(axis=0)
        centroids = updated

        new_labels = np.argmin(cdist(data, centroids, distance_type), axis=1)
        if np.array_equal(new_labels, labels):
            break  # converged: centroids and labels are at a fixed point
        labels = new_labels

    return labels





In [8]:
# Load the data set to be clustered; the relative path is resolved against
# the notebook's current working directory.
dataset5 = pd.read_csv(r'processed_covid_data.csv')

In [9]:
# Seed NumPy's global RNG first: kmeans draws its starting centroids from it,
# so without a fixed seed the cluster labels (and every result derived from
# them below) change on each run.
np.random.seed(42)
# Partition the rows of dataset5 into 3 clusters, allowing up to 10
# centroid-update passes.
indexes = kmeans(dataset5, 3, 10)

In [3]:
# !pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1


In [21]:
# NOTE: comment out the DecisionTree() call in TREE.ipynb before running this import.
from ipynb.fs.full.TREE import DecisionTree, testing_data, training_data

In [12]:
# Display the cluster label that kmeans assigned to each row of dataset5.
indexes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
def get_indices(indexes, x):
    """Build a boolean mask marking where ``indexes`` equals ``x``.

    Returns a list with one plain ``bool`` per position of ``indexes``:
    ``True`` where the entry at that position compares equal to ``x``,
    ``False`` everywhere else.
    """
    return [bool(indexes[position] == x) for position in range(len(indexes))]

In [18]:
# Split dataset5 into one sub-frame per cluster label (0, 1 and 2), selecting
# rows with the boolean masks produced by get_indices.
dataset1, dataset2, dataset3 = (
    dataset5[get_indices(indexes, cluster_label)] for cluster_label in range(3)
)

In [19]:
# Construct one DecisionTree per cluster subset, in cluster order.
DT1, DT2, DT3 = (
    DecisionTree(dataset=cluster_frame)
    for cluster_frame in (dataset1, dataset2, dataset3)
)

In [22]:
# Score the tree built from dataset1 (cluster label 0) on the imported
# testing_data; the displayed value looks like a percentage — TODO confirm in TREE.ipynb.
DT1.calculate_accuracy(testing_data=testing_data)

48.226950354609926


In [23]:
# Score the tree built from dataset2 (cluster label 1) on the imported
# testing_data; the displayed value looks like a percentage — TODO confirm in TREE.ipynb.
DT2.calculate_accuracy(testing_data=testing_data)

2.8368794326241136


In [24]:
# Score the tree built from dataset3 (cluster label 2) on the imported
# testing_data; the displayed value looks like a percentage — TODO confirm in TREE.ipynb.
DT3.calculate_accuracy(testing_data=testing_data)

43.262411347517734
