<a href="https://colab.research.google.com/github/AmaanAmythAlegend/summer-of-code-2023/blob/main/machine-learning/week1/DecisionTrees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the table from a CSV given by a relative path, so the file has to sit
# in the kernel's working directory.
data = pd.read_csv("ndtv_data_final (1).csv")

In [None]:
# Preview the first rows to check the column layout; the last column (Price)
# is the one used as the regression target further down.
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Brand,Model,Battery capacity (mAh),Screen size (inches),Touchscreen,Resolution x,Resolution y,Processor,...,Rear camera,Front camera,Operating system,Wi-Fi,Bluetooth,GPS,Number of SIMs,3G,4G/ LTE,Price
0,0,OnePlus 7T Pro McLaren Edition,OnePlus,7T Pro McLaren Edition,4085,6.67,Yes,1440,3120,8,...,48.0,16.0,Android,Yes,Yes,Yes,2,Yes,Yes,58998
1,1,Realme X2 Pro,Realme,X2 Pro,4000,6.5,Yes,1080,2400,8,...,64.0,16.0,Android,Yes,Yes,Yes,2,Yes,Yes,27999
2,2,iPhone 11 Pro Max,Apple,iPhone 11 Pro Max,3969,6.5,Yes,1242,2688,6,...,12.0,12.0,iOS,Yes,Yes,Yes,2,Yes,Yes,106900
3,3,iPhone 11,Apple,iPhone 11,3110,6.1,Yes,828,1792,6,...,12.0,12.0,iOS,Yes,Yes,Yes,2,Yes,Yes,62900
4,4,LG G8X ThinQ,LG,G8X ThinQ,4000,6.4,Yes,1080,2340,8,...,12.0,32.0,Android,Yes,Yes,Yes,1,No,No,49990


In [None]:
class Node:
  """One node of the regression tree.

  A decision node carries ``f_index``, ``threshold``, ``left``, ``right`` and
  ``var_red``; a leaf carries only ``value``.  Every field defaults to None,
  so callers fill in just the group that applies.
  """

  def __init__(self, f_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
    # Decision-node fields: which column is tested and against what.
    self.f_index, self.threshold = f_index, threshold
    # Child nodes reached when the test passes (left) or fails (right).
    self.left, self.right = left, right
    # Variance reduction achieved by the split stored in this node.
    self.var_red = var_red

    # Leaf payload: the prediction returned when a sample ends up here.
    self.value = value

In [None]:
class DecisionTree():
  """Regression tree grown greedily by variance reduction.

  Each decision node tests one feature column against a threshold and each
  leaf stores the mean target of the training rows that reached it.

  Columns listed in ``categorical_indices`` are split by equality
  (``value == threshold`` goes left); every other candidate column is split
  by order (``value <= threshold`` goes left).  The same rule is used while
  growing the tree and while predicting, so a sample always follows the
  branch that its matching training rows were sorted into.
  """

  # Default positional column indices into the feature matrix.
  # NOTE(review): these look hand-picked for the CSV loaded in this notebook;
  # confirm them against the actual column order.  Index 6 is flagged as
  # categorical but is not among the default split candidates.
  DEFAULT_FEATURE_INDICES = (4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
  DEFAULT_CATEGORICAL_INDICES = (6, 14, 15, 16, 17, 19, 20)

  def __init__(self, min_split, max_depth, feature_indices=None, categorical_indices=None):
    """Configure the stopping rules and the columns the tree may use.

    Args:
      min_split: minimum number of rows a node needs before a split is tried.
      max_depth: deepest level (root is 0) at which a node may still split,
        so leaves can sit one level below it.
      feature_indices: column indices considered as split candidates;
        defaults to ``DEFAULT_FEATURE_INDICES``.
      categorical_indices: column indices split by equality instead of
        order; defaults to ``DEFAULT_CATEGORICAL_INDICES``.
    """
    self.root = None
    self.min_split = min_split
    self.max_depth = max_depth
    if feature_indices is None:
      feature_indices = self.DEFAULT_FEATURE_INDICES
    if categorical_indices is None:
      categorical_indices = self.DEFAULT_CATEGORICAL_INDICES
    self.feature_indices = tuple(feature_indices)
    self.categorical_indices = frozenset(categorical_indices)

  def _is_categorical(self, feature_index):
    """Return True when the column is split by equality rather than order."""
    return feature_index in self.categorical_indices

  def _split_masks(self, column, threshold, categorical):
    """Return boolean (left, right) row masks for one candidate split.

    Both sides are computed with their own comparison, so a value that
    satisfies neither (for example NaN in an ordered split) lands in neither
    partition.
    """
    if categorical:
      left, right = column == threshold, column != threshold
    else:
      left, right = column <= threshold, column > threshold
    return np.asarray(left, dtype=bool), np.asarray(right, dtype=bool)

  def build_tree(self,dataset,depth = 0):
    """Recursively grow the subtree for ``dataset`` and return its root node.

    ``dataset`` is a 2-D array whose last column is the target.  A leaf is
    produced when the node is too small, too deep, or when no split lowers
    the variance.
    """
    Y = dataset[:,-1]
    X = dataset[:,:-1]
    num_samples, num_features = np.shape(X)
    if num_samples>=self.min_split and depth<=self.max_depth:
      best_split = self.get_best_split(dataset, num_samples, num_features)
      # get_best_split returns {} when no threshold separates the rows;
      # treat that as "no useful split" instead of failing on a missing key.
      if best_split.get("var_red", 0)>0:
        left_subtree = self.build_tree(best_split["dataset_left"],depth+1)
        right_subtree = self.build_tree(best_split["dataset_right"],depth+1)
        return Node(best_split["feature_index"], best_split["threshold"],left_subtree, right_subtree, best_split["var_red"])

    leaf_value = self.calculate_leaf_value(Y)
    return Node(value = leaf_value)

  def get_best_split(self,dataset,num_samples,num_features):
    """Search every candidate column and threshold for the best split.

    Returns a dict with keys ``feature_index``, ``threshold``,
    ``dataset_left``, ``dataset_right`` and ``var_red``, or an empty dict
    when no threshold yields two non-empty partitions.
    """
    best_split = {}
    max_var_red = -float("inf")
    best_masks = None
    # Cast the target once so every variance below runs on a float array.
    y = np.asarray(dataset[:,-1], dtype=float)
    for feature_index in self.feature_indices:
      # Skip indices outside the feature block: they would either raise or
      # point at the target column itself.
      if not 0 <= feature_index < num_features:
        continue
      column = dataset[:,feature_index]
      categorical = self._is_categorical(feature_index)
      for threshold in np.unique(column):
        left_mask, right_mask = self._split_masks(column, threshold, categorical)
        if not (left_mask.any() and right_mask.any()):
          continue
        curr_var_red = self.variance_reduction(y, y[left_mask], y[right_mask])
        if curr_var_red>max_var_red:
          best_split["feature_index"] = feature_index
          best_split["threshold"] = threshold
          best_split["var_red"] = curr_var_red
          best_masks = (left_mask, right_mask)
          max_var_red = curr_var_red
    # Materialise the two partitions only for the winning split.
    if best_masks is not None:
      best_split["dataset_left"] = dataset[best_masks[0]]
      best_split["dataset_right"] = dataset[best_masks[1]]
    return best_split

  def split(self, dataset, feature_index, threshold):
    """Ordered split: rows with value <= threshold go left, > threshold right."""
    left_mask, right_mask = self._split_masks(dataset[:,feature_index], threshold, False)
    return dataset[left_mask], dataset[right_mask]

  def split1(self, dataset, feature_index, threshold):
    """Equality split: rows with value == threshold go left, the rest right."""
    left_mask, right_mask = self._split_masks(dataset[:,feature_index], threshold, True)
    return dataset[left_mask], dataset[right_mask]

  def variance_reduction(self, parent, l_child, r_child):
    """Return var(parent) minus the size-weighted variance of the children."""
    parent = np.asarray(parent, dtype=float)
    l_child = np.asarray(l_child, dtype=float)
    r_child = np.asarray(r_child, dtype=float)
    weight_l = len(l_child)/len(parent)
    weight_r = len(r_child)/len(parent)
    reduction = np.var(parent) - (weight_l*np.var(l_child) + weight_r*np.var(r_child))
    return reduction

  def calculate_leaf_value(self,Y):
    """Return the mean target of the rows in a leaf."""
    val = np.mean(np.asarray(Y, dtype=float))
    return val

  def print_tree(self, tree=None, indent=" "):
    """Print the subtree under ``tree`` (the fitted root by default)."""
    if tree is None:
      tree = self.root

    if tree.value is not None:
      print(tree.value)
    else:
      print("X_"+str(tree.f_index), "threshold:", tree.threshold, "variance reduction:", tree.var_red)
      print("%sleft:" % (indent), end="")
      self.print_tree(tree.left, indent + "    ")
      print("%sright:" % (indent), end="")
      self.print_tree(tree.right, indent + "    ")

  def fit(self, X, Y):
    """Grow the tree from feature matrix ``X`` and target ``Y``.

    ``Y`` may be a column of shape (n, 1) or a flat sequence of length n.
    """
    targets = np.reshape(Y, (-1, 1))
    dataset = np.concatenate((X,targets), axis = 1)
    self.root = self.build_tree(dataset)

  def make_prediction(self, x, tree):
    """Walk one sample ``x`` down from ``tree`` and return the leaf value."""
    node = tree
    while node.value is None:
      feature_val = x[node.f_index]
      # Use the same comparison that created this node during training.
      if self._is_categorical(node.f_index):
        goes_left = feature_val==node.threshold
      else:
        goes_left = feature_val<=node.threshold
      node = node.left if goes_left else node.right
    return node.value

  def predict(self, X):
    """Return a list with one prediction per row of ``X``."""
    predictions = [self.make_prediction(x, self.root) for x in X]
    return predictions

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the last column,
# reshaped into a single-column 2-D array.
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    random_state=41,
)

In [None]:
# Grow the tree on the training rows: a node needs at least 2 rows to be
# split, and nodes at depth 0 through 5 are allowed to split.
regressor = DecisionTree(min_split=2, max_depth=5)
regressor.fit(X_train,Y_train)
# Dump the learned splits and leaf values as indented text.
regressor.print_tree()

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

# Predict the held-out rows and report the root-mean-squared error.
Y_pred = regressor.predict(X_test)
print(np.sqrt(mean_squared_error(Y_test, Y_pred)))

# NOTE(review): `sum` and `min` shadow the Python built-ins for the rest of
# the session; the names are kept because later cells may read them.
sum = 0  # running total of the per-row percentage errors
min = 0  # number of rows whose percentage error is below 10
for i in range(len(Y_test)):
  actual, predicted = Y_test[i], Y_pred[i]
  # Absolute error as a percentage of the actual value.
  # NOTE(review): divides by the actual value, so a zero target would break this.
  pct_error = abs(actual - predicted)*100/actual

  # NOTE(review): column 0 of the feature row is printed under the
  # "Model Name:" label; confirm that this is the intended column.
  print("Model Name:", X_test[i][0], X_test[i])
  print("Predicted:", predicted)
  print("Actual:", actual)
  if pct_error<10:
    min+=1
  sum+=pct_error
  print(pct_error)
  print()