In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import math
import random

In [2]:
# Load the raw CSV, discard its first row (index label 0) and the "Date"
# column, then renumber the remaining rows from 0.
# `index=` / `columns=` keywords are used because pandas >= 2.0 no longer
# accepts `axis` as a positional argument to DataFrame.drop.
data = (
    pd.read_csv("PercentageIncreaseCOVIDWorldwide.csv")
    .drop(index=0)
    .drop(columns="Date")
    .reset_index(drop=True)
)

In [3]:
# Feature matrix: every column except the regression target.
# `columns=` is used because pandas >= 2.0 rejects a positional `axis`.
X = data.drop(columns="Increase rate")
# Target vector: the "Increase rate" column.
Y = data["Increase rate"]

In [4]:
def traintestsplit(x, y, test_size = 0.2, random_state = None):
    """Partition the data into train and test sets.

    A fraction ``test_size`` of the rows of ``x`` is sampled (reproducibly when
    ``random_state`` is given) to form the test set; the rows of ``y`` carrying
    the same index labels go with them. Everything else is the training set.

    Returns:
        (x_train, x_test, y_train, y_test), all keeping their original index
        labels.
    """

    held_out_x = x.sample(frac = test_size, random_state = random_state)
    held_out_y = y[held_out_x.index]

    remaining_x = x.drop(held_out_x.index)
    remaining_y = y.drop(held_out_y.index)

    return remaining_x, held_out_x, remaining_y, held_out_y

In [5]:
def find_rmse(y_pred,y_test):
    """Return the root-mean-squared error between predictions and targets.

    The squared element-wise differences are summed and divided by
    ``len(y_test)`` before taking the square root.
    """
    residuals = np.subtract(y_pred, y_test)
    squared_error_total = (residuals ** 2).sum()
    return np.sqrt(squared_error_total / len(y_test))

def find_r2(y_pred,y_test):
    """Return the coefficient of determination (R^2) of the predictions.

    R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals
    and SS_tot is the sum of squared deviations of ``y_test`` from its mean.

    Both inputs are converted to flat float arrays and compared position by
    position, so a pandas Series is handled correctly whatever its index
    labels are, and plain lists are accepted too.

    Args:
        y_pred: predicted values (array-like).
        y_test: true values (array-like), same length as ``y_pred``.

    Returns:
        The R^2 score. When ``y_test`` is constant SS_tot is zero and the
        result is the non-finite value numpy produces for that division.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    predicted = np.asarray(y_pred, dtype=float).ravel()
    actual = np.asarray(y_test, dtype=float).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "y_pred and y_test must have the same length, got {} and {}".format(
                predicted.shape[0], actual.shape[0]
            )
        )

    # The mean is computed once; the original recomputed it for every element.
    residual_ss = np.sum((predicted - actual) ** 2)
    total_ss = np.sum((actual - actual.mean()) ** 2)

    return 1 - (residual_ss / total_ss)

In [6]:
class Decision_Tree_Regeressor:
    """Regression tree grown greedily by variance reduction.

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{feature: {'<=c': child, '>c': child}}``, where ``c`` is the string form
    of the split threshold and each ``child`` is either another such dict or a
    leaf value (the mean target of the training rows that reached it). When
    the training data cannot be split at all, ``self.tree`` is a single leaf
    value.
    """

    def __init__(self, max_depth = None, min_sample_leaf = 2):
        """
        Args:
            max_depth: recursion level at which children stop being split;
                ``None`` means no limit.
            min_sample_leaf: a child holding this many rows or fewer becomes
                a leaf.
        """

        self.depth = 0  # Deepest recursion level recorded while building
        self.max_depth = max_depth  # Maximum depth of the tree
        self.min_sample_leaf = min_sample_leaf  # Child size at/below which a leaf is made
        # Populated by fit(); None until then.
        self.features = None
        self.X_train = None
        self.y_train = None
        self.num_feats = None
        self.train_size = None
        self.tree = None

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and target ``y``.

        ``y`` is attached to a copy of ``X`` as a ``'target'`` column (a
        Series is therefore aligned on index labels, exactly as pandas column
        assignment does), after which the index is renumbered 0..n-1 so the
        caller's index labels never influence the splitting code.
        """

        self.X_train = X
        self.y_train = y
        self.features = list(X.columns)
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]
        self.depth = 0  # forget the depth of any previous fit

        df = X.copy()
        df['target'] = y.copy()
        df = df.reset_index(drop = True)

        #Builds Decision Tree
        self.tree = self._build_tree(df)

    def _build_tree(self, df, dep=0,tree = None):

        """
        Recursively grow the subtree for the rows in ``df``.

        Args:
        df: rows available for splitting; the last column must be 'target'
        dep: recursion level of this node (0 for the root)
        tree: optional dict to add this node's split to

        Returns:
        the subtree dict, or the mean target of ``df`` when no feature offers
        a valid split
        """

        #Get the feature / threshold with the best score
        feature, cutoff = self._find_best_split(df)

        if cutoff is None:
            # Nothing to split on: make this node a leaf instead of storing
            # None, which would later surface as a None prediction.
            return df['target'].mean() if tree is None else tree

        #Initialization of tree
        if tree is None:
            tree = {}
        branches = tree.setdefault(feature, {})

        left_rows = self._split_rows(df, feature, cutoff, operator.le)
        right_rows = self._split_rows(df, feature, cutoff, operator.gt)

        # The '<=' key must be inserted first: _predict_target recovers the
        # threshold from the first key of the branch dict.
        branches['<=' + str(cutoff)] = self._grow_child(left_rows, dep)
        branches['>' + str(cutoff)] = self._grow_child(right_rows, dep)
        return tree

    def _grow_child(self, child_df, dep):

        """ Return a leaf value or a deeper subtree for one side of a split """

        if len(child_df) <= self.min_sample_leaf:  # too few rows to split again
            return child_df['target'].mean()

        self.depth = max(self.depth, dep)
        if self.max_depth is not None and dep >= self.max_depth:
            return child_df['target'].mean()
        return self._build_tree(child_df, dep + 1)

    def _split_rows(self, df, feature, feat_val, operation ):

        """ Keep the rows where operation(df[feature], feat_val) holds, renumbered from 0 """

        return df[operation(df[feature], feat_val)].reset_index(drop = True)

    def _find_best_split(self, df):

        """
            Find the (feature, threshold) pair with the highest score.

            Every column except the last ('target') is tried. Returns
            (None, None) when no column has a valid split.
        """

        best_feature = None
        cutoff = None
        best_score = float('-inf')

        for feature in list(df.columns[:-1]):

            score, threshold = self._find_feature_split(feature, df)

            if score > best_score:
                best_feature = feature
                best_score = score
                cutoff = threshold

        return best_feature, cutoff

    def _find_feature_split(self, feature, df):

        """
            Find the best threshold for a single feature.

            Returns (score, threshold); (-inf, None) when every candidate
            leaves one side empty.
        """

        best_score = float('-inf')
        cutoff = None
        column = df[feature]

        # Repeated values give identical partitions, so each distinct value is
        # scored once, in order of first appearance (ties resolve as before).
        for val in column.drop_duplicates():
            left_child = column[column <= val]
            right_child = column[column > val]

            if(len(left_child) > 0 and len(right_child) > 0):
                score = self._find_score(df, left_child, right_child)

                if score > best_score:
                    best_score = score
                    cutoff = val

        return best_score, cutoff


    def _find_score(self, df, lhs, rhs):

        """
            Variance reduction of a candidate split.

            lhs / rhs are the two slices of the feature column; only their
            index labels are used, to select the matching target values.
            The score is the sample variance of the combined rows minus the
            size-weighted mean of the two child variances. An undefined
            variance (fewer than two rows) counts as 0.
        """

        y = df['target']
        parent_labels = lhs.index.union(rhs.index)
        n_rows = len(parent_labels)
        if n_rows == 0:
            return 0

        def variance_of(labels):
            value = y.loc[labels].var()
            return 0 if np.isnan(value) else value

        term = variance_of(lhs.index)*len(lhs) + variance_of(rhs.index)*len(rhs)
        term = term/n_rows
        return variance_of(parent_labels)-term

    def _predict_target(self, feature_lookup, x, tree):

        """
            Walk the tree for one row ``x`` and return the leaf value reached.

            feature_lookup is accepted for compatibility and is not used.
            A ``tree`` that is already a leaf value is returned unchanged.
        """

        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            branches = node[feature]
            val = x[feature]

            if isinstance(val, str):
                node = branches[val]
            else:
                # Branch keys are '<=c' and '>c'; recover c from the first one.
                cutoff = str(next(iter(branches))).split('<=')[1]

                if(val <= float(cutoff)):  #Left Child
                    node = branches['<='+cutoff]
                else:                      #Right Child
                    node = branches['>'+cutoff]

        return node


    def predict(self, X):

        """
            Predict the target for every row of ``X``.

            Returns a numpy array with one value per row.
            Raises AttributeError when called before fit().
        """

        if self.tree is None:
            raise AttributeError("Decision_Tree_Regeressor has no tree yet; call fit() first")

        feature_lookup = {key: i for i, key in enumerate(list(X.columns))}

        results = [
            self._predict_target(feature_lookup, X.iloc[index], self.tree)
            for index in range(len(X))
        ]

        return np.array(results)

    def display(self):
        """ Print the recorded depth and the nested-dict tree """
        print("\nDecision Tree (depth={}) : \n {}".format(self.depth,self.tree))

In [7]:
def input_depth():
    """Prompt for a tree depth and return it as an int.

    Entering -1 means "no limit" and is returned as None.
    """
    requested = int(input("Enter depth "))
    return None if requested == -1 else requested

def single_split(depth):
    """Train and score one tree on a fresh random 80/20 split of X and Y.

    The split seed is drawn uniformly from 20..50. All four partitions are
    re-indexed from 0 before training.

    Returns:
        (r2_on_test, x_train, x_test, y_train, y_test)
    """
    seed = random.randint(20,50)
    partitions = traintestsplit(X,Y,test_size=0.2,random_state=seed)
    x_train, x_test, y_train, y_test = (
        part.reset_index(drop=True) for part in partitions
    )

    model = Decision_Tree_Regeressor(max_depth = depth)
    model.fit(x_train,y_train)
    test_score = find_r2(model.predict(x_test),y_test)

    return test_score,x_train,x_test,y_train,y_test

In [8]:
# Repeat the random split N_RUNS times, report the mean and best test R^2,
# and keep the partitions of the best-scoring run for the later cells.
N_RUNS = 10

inp = input_depth()
avg = 0
# R^2 can be negative, so the running maximum starts at -inf; starting at 0
# would leave best_* undefined whenever no run scores above zero.
mx = float('-inf')
for i in range (N_RUNS):
    score,xtrain,xtest,ytrain,ytest = single_split(inp)
    avg = avg + score
    if score>mx:
        mx = score
        best_xtrain = xtrain
        best_ytrain = ytrain
        best_xtest = xtest
        best_ytest = ytest

print(avg/N_RUNS)
print(mx)

Enter depth -1
0.5020194743402869
0.8816416851665729


In [None]:
# Refit on the best split for max_depth = 0, 2, ..., 18 and record the R^2
# obtained on the training and on the test partition at each depth.
x_axis = list(range(0, 20, 2))
train_predictions = []
test_predictions = []
for depth_limit in x_axis:
    best_regressor = Decision_Tree_Regeressor(max_depth = depth_limit)
    best_regressor.fit(best_xtrain,best_ytrain)
    best_ypred_train = best_regressor.predict(best_xtrain)
    best_ypred_test = best_regressor.predict(best_xtest)
    train_predictions.append(find_r2(best_ypred_train,best_ytrain))
    test_predictions.append(find_r2(best_ypred_test,best_ytest))
    

In [None]:
# R^2 against max_depth: train in red, test in blue. Labels and a legend are
# added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(x_axis,train_predictions,'r',label="train")
ax.plot(x_axis,test_predictions,'b',label="test")
ax.set_xlabel("max_depth")
ax.set_ylabel("R^2")
ax.set_title("Decision tree R^2 vs. max_depth")
ax.legend()
plt.show()