In [639]:
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
DISTANCE_TYPE = 'euclidean'

In [640]:
def kmeans(dataset, k, no_of_iterations):
    indices = np.random.choice(len(dataset), k, replace = False)
    # print(indices)
    # choose the rows corresponding to indices which is randomly selected
    centroids = dataset.iloc[indices, :]
    # the below line finds the distance between centroids and all the datapoints
    distances = cdist(dataset, centroids, DISTANCE_TYPE)
    # print(centroids)
    # print(distances)
    # structure of distance => [[for one point distace from each centroid], ...]
    # the below line assigns each point with the nearest centroid
    points = np.array([np.argmin(dist_from_each_centroid) for dist_from_each_centroid in distances])
    
    # the main algo 
    for iteration in range(no_of_iterations):
        # below array will store the centroids
        centroids = []
        # finding the new centroid for each of the k clusters
        for cluster in range(k):
            temp_centroid = dataset[points == cluster].mean(axis = 0)
            centroids.append(temp_centroid)
        
        # new centroids
        centroids = np.vstack(centroids)

        distances = cdist(dataset, centroids, DISTANCE_TYPE)
        points = np.array([np.argmin(dist_from_each_centroid) for dist_from_each_centroid in distances])
    
    return points, centroids





In [641]:
dataset5 = pd.read_csv(r'processed_covid_data.csv')

In [642]:
def find_closest_centroids(dataset, centroids):
    distances = cdist(dataset, centroids, DISTANCE_TYPE)
    points = np.array([np.argmin(dist_from_each_centroid) for dist_from_each_centroid in distances])
    return points

In [643]:
# !pip install ipynb

In [644]:
from ipynb.fs.full.TREE import DecisionTree # before importing comment DecisionTree() in TREE.ipynb

In [645]:
def get_indices(indexes, x):
    indices = []
    for i in indexes:
        if i == x:
            indices.append(True)
        else:
            indices.append(False)
    return indices

In [646]:
class Forest:
    def __init__(self, dataset, k, num_of_iterations, TARGET_COLUMN = 'new_cases_classes'):
        self.k = k
        self.num_of_iterations = num_of_iterations
        self.indexes, self.centroids = kmeans(dataset.drop([TARGET_COLUMN], axis = 1), k, num_of_iterations)
        self.dataset = dataset
        self.DTs = self.create_trees()
        self.TARGET_COLUMN = TARGET_COLUMN

    def prepare_data_util(self, x):
        dataset1 = self.dataset[get_indices(self.indexes, x)]
        dataset1.reset_index(drop=True, inplace=True)
        return dataset1

    def prepare_data(self):
        datasets = []
        for i in range(self.k):
            datasets.append(self.prepare_data_util(i))
        return datasets

    def create_trees(self):
        DTs = []
        datasets = self.prepare_data()
        # print(len(datasets))
        for dataset in datasets:
            DTs.append(DecisionTree(dataset))
        return DTs

    def calculate_average(self,  all_predictions, centroids):
        average_predicted = []
        # print(all_predictions)
        row, col = len(all_predictions), len(all_predictions[0])
        # print(row,col)
        for i in range(col):
            sum = 0
            average_predicted.append(all_predictions[centroids[i]][i])

        return(average_predicted)

    def predict(self, testing_data):
        all_predictions = []
        # print(self.DTs)
        for DT in self.DTs:
            individual_prediction = DT.predict(testing_data)
            all_predictions.append(individual_prediction)
        # print(all_predictions)
        closest_centroids = find_closest_centroids(testing_data.drop([self.TARGET_COLUMN], axis = 1), self.centroids)
        final_predictions = self.calculate_average(all_predictions, closest_centroids)
        return final_predictions
    
    def calculate_accuracy(self, testing_data, THRESHOLD_FOR_ACCURACY = 1):
        predictions = self.predict(testing_data)
        score = 0
        # THRESHOLD_FOR_ACCURACY = 1
        for i in range(len(testing_data)):
            actual = testing_data.at[i, self.TARGET_COLUMN]
            predicted = predictions[i]
            # print(actual, predicted[0])
            score += (abs(actual - predicted[0]) <= THRESHOLD_FOR_ACCURACY)

        print(score / len(testing_data) * 100)


In [647]:
training_data = dataset5.sample(frac = 0.7)
testing_data = dataset5.drop(training_data.index)
training_data.reset_index(drop=True, inplace=True)
testing_data.reset_index(drop=True, inplace=True)

In [648]:
# testing_data.head()

In [649]:
forest = Forest(training_data, k = 5, num_of_iterations = 5000)

In [650]:
predictions = forest.calculate_accuracy(testing_data)

74.46808510638297
