In [16]:
import numpy as np
import pandas as pd

In [17]:
# Read the dataset from "College.csv" (path is relative to the working directory).
data = pd.read_csv("College.csv")
# Show the first five rows as the cell output.
data.head(5)

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [18]:
def build_tree(dataset, min_samples_split=2, max_depth=2, curr_depth=0):
    ''' recursive function to build the tree

    dataset           : 2-D array; every column but the last is a feature,
                        the last column is the target.
    min_samples_split : a node with fewer rows than this becomes a leaf.
    max_depth         : deepest level (root = 0) at which a node may still be
                        split. Because the comparison is inclusive, the tree
                        can hold up to max_depth + 1 levels of decision nodes.
    curr_depth        : depth of the node being built; callers normally leave
                        this at its default.

    Returns a nested dict. Decision nodes carry 'feature_index', 'threshold',
    'left', 'right' and 'var_red'; leaves carry only 'value'.
    '''
    X, Y = dataset[:,:-1], dataset[:,-1]
    num_samples, num_features = np.shape(X)

    # split until stopping conditions are met
    if num_samples >= min_samples_split and curr_depth <= max_depth:
        # find the best split
        best_split = get_best_split(dataset, num_samples, num_features)
        # get_best_split returns an empty dict when no threshold yields two
        # non-empty children (e.g. every row has identical feature values).
        # Treat that as "no reduction" instead of failing with a KeyError.
        if best_split.get("var_red", 0) > 0:
            # recur left
            left_subtree = build_tree(best_split["dataset_left"], min_samples_split, max_depth, curr_depth + 1)
            # recur right
            right_subtree = build_tree(best_split["dataset_right"], min_samples_split, max_depth, curr_depth + 1)
            # return decision node
            return {'feature_index': best_split["feature_index"],
                    'threshold': best_split["threshold"],
                    'left': left_subtree,
                    'right': right_subtree,
                    'var_red': best_split["var_red"]}

    # stopping condition reached or no useful split: return leaf node
    return {'value': calculate_leaf_value(Y)}



In [19]:
def get_best_split(dataset, num_samples, num_features):
    ''' scan every (feature, threshold) pair and keep the one with the
    largest variance reduction

    Candidate thresholds are the distinct values of each feature column.
    Returns a dict with 'feature_index', 'threshold', 'dataset_left',
    'dataset_right' and 'var_red', or an empty dict when no candidate
    produces two non-empty children.
    '''
    winner = {}
    best_gain = -float("inf")

    for col in range(num_features):
        for candidate in np.unique(dataset[:, col]):
            part_left, part_right = split(dataset, col, candidate)

            # a split that leaves one side empty is not a split at all
            if len(part_left) == 0 or len(part_right) == 0:
                continue

            gain = variance_reduction(dataset[:, -1], part_left[:, -1], part_right[:, -1])

            # strictly better only, so the first of several ties is kept
            if gain > best_gain:
                best_gain = gain
                winner = {
                    "feature_index": col,
                    "threshold": candidate,
                    "dataset_left": part_left,
                    "dataset_right": part_right,
                    "var_red": gain,
                }

    return winner



In [20]:
def split(dataset, feature_index, threshold):
    ''' function to split the data

    Rows whose value in column `feature_index` is <= threshold go to the left
    partition, rows whose value is > threshold go to the right one.

    dataset       : 2-D array (or nested sequence) of rows.
    feature_index : column used for the comparison.
    threshold     : value compared against that column.

    Returns (dataset_left, dataset_right). Both are always 2-D with the same
    number of columns as `dataset`, even when a partition is empty.
    '''
    dataset = np.asarray(dataset)
    column = dataset[:, feature_index]
    # Two explicit masks rather than one mask and its negation: a value that is
    # neither <= nor > the threshold (e.g. NaN) ends up in neither partition,
    # which is what the row-by-row comparison did as well.
    dataset_left = dataset[column <= threshold]
    dataset_right = dataset[column > threshold]
    return dataset_left, dataset_right



In [21]:
def variance_reduction(parent, l_child, r_child):
    ''' variance of the parent targets minus the size-weighted variance of
    the two child target sets '''

    total = len(parent)
    # each child contributes its variance in proportion to its share of rows
    weighted_child_var = (len(l_child) / total) * np.var(l_child) \
        + (len(r_child) / total) * np.var(r_child)
    return np.var(parent) - weighted_child_var



In [22]:
def calculate_leaf_value(Y):
    ''' value stored in a leaf: the mean of the targets that reached it '''
    return np.mean(Y)



In [23]:
def print_tree(tree=None, indent=" "):
    ''' print a tree built by build_tree, one node per line

    Leaves print their value. Decision nodes print
    "X_<feature_index> <= <threshold> ? <var_red>" followed by their left and
    right branches; the indent string is doubled at every level.
    '''
    # nothing to show for a missing or empty node
    if not tree:
        return

    # leaf node
    if 'value' in tree:
        print(tree['value'])
        return

    # decision node: the test first, then both branches
    header = ("X_" + str(tree['feature_index']), "<=", tree['threshold'], "?", tree['var_red'])
    print(*header)

    deeper = indent + indent
    print("%sleft:" % (indent), end="")
    print_tree(tree['left'], deeper)
    print("%sright:" % (indent), end="")
    print_tree(tree['right'], deeper)



In [24]:
def fit(X, Y, min_samples_split=2, max_depth=2):
    ''' train a tree on features X and targets Y

    X and Y are joined column-wise (so both must be 2-D with the same number
    of rows) and handed to build_tree, whose result is returned.
    '''
    # build_tree expects the target as the last column of a single array
    training_data = np.concatenate((X, Y), axis=1)
    tree = build_tree(training_data,
                      min_samples_split=min_samples_split,
                      max_depth=max_depth)
    return tree



In [34]:
def make_prediction(x, tree):
    ''' predict the target for one sample x by walking the tree

    At each decision node the sample goes left when
    x[feature_index] <= threshold and right otherwise; the value stored in
    the leaf that is reached is returned.
    '''
    node = tree
    # descend until a leaf (a node holding 'value') is reached
    while 'value' not in node:
        goes_left = x[node['feature_index']] <= node['threshold']
        node = node['left'] if goes_left else node['right']
    return node['value']



In [49]:
def predict(X, tree):
    ''' predict the target for every sample in X

    Returns a list with one make_prediction result per element of X,
    in the same order.
    '''
    predictions = []
    for x in X:
        predictions.append(make_prediction(x, tree))
    return predictions
# Spot-check one held-out sample (row 8 of the test split).
# NOTE: X_test / Y_test and tree are assigned in the train-test split and fit
# cells, which appear further down in this notebook; those cells must have been
# run before this one.
x_single = X_test[8]
# predict takes a collection of samples, so wrap the row in a list and unwrap the result
predicted_value = predict([x_single], tree)[0]
# Y was reshaped to a single column, so this is a length-1 row and prints with brackets
actual_value = Y_test[8]

print("Predicted Value:", predicted_value)
print("Actual Value:", actual_value)


Predicted Value: 71.2258064516129
Actual Value: [65]


In [46]:
# Display the held-out targets (one value per row, as a single column).
Y_test

array([[21],
       [92],
       [76],
       [65],
       [48],
       [58],
       [87],
       [68],
       [65],
       [95],
       [90],
       [93],
       [56],
       [66],
       [90],
       [64],
       [75],
       [54],
       [52],
       [86],
       [72],
       [54],
       [60],
       [60],
       [36],
       [85],
       [72],
       [78],
       [78],
       [80],
       [59],
       [98],
       [81],
       [61],
       [81],
       [54],
       [52],
       [64],
       [57],
       [67],
       [66],
       [47],
       [76],
       [73],
       [37],
       [34],
       [15],
       [46],
       [75],
       [82],
       [70],
       [63],
       [63],
       [78],
       [60],
       [63],
       [97],
       [72],
       [98],
       [51],
       [45],
       [67],
       [51],
       [67],
       [88],
       [72],
       [83],
       [75],
       [81],
       [70],
       [85],
       [46],
       [44],
       [47],
       [84],
       [45],
       [49],

In [31]:
## Train-test split

# Re-read the dataset so this cell does not depend on an earlier cell's `data`.
data = pd.read_csv("College.csv")

# Features: every column except the last, as a plain array.
X = data.iloc[:, :-1].values
# Target: the last column, reshaped to a single column so it can be joined to X in fit().
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)
# Show the loaded frame as the cell output.
data


Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,No,2197,1515,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40
773,Yes,1959,1805,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83
774,Yes,2097,1915,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49
775,Yes,10705,2453,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99


In [28]:
## Fit the model
# Train on the training split. With max_depth=3, build_tree still splits nodes
# at depth 3, so the printed tree can show up to four levels of decision nodes.
tree = fit(X_train, Y_train, min_samples_split=3, max_depth=3)
# Print the learned tree: each decision line shows the test and its variance reduction.
print_tree(tree)

X_8 <= 10602 ? 78.66079935576414
 left:X_15 <= 16 ? 22.023573414704344
  left:X_16 <= 4322 ? 19.370741093241946
    left:X_1 <= 285 ? 217.79999999999998
        left:43.0
        right:82.6
    right:X_1 <= 5530 ? 14.161720226061078
        left:49.93846153846154
        right:59.0
  right:X_8 <= 4973 ? 17.76927021000347
    left:X_0 <= No ? 89.38020833333334
        left:48.833333333333336
        right:27.0
    right:X_6 <= 738 ? 13.251784230958435
        left:56.0
        right:64.97619047619048
 right:X_5 <= 52 ? 32.108380390455125
  left:X_15 <= 23 ? 49.70582151398395
    left:X_1 <= 443 ? 42.229430034416254
        left:42.2
        right:61.42424242424242
    right:X_11 <= 1120 ? 26.09483930211202
        left:75.92
        right:64.0
  right:X_1 <= 1283 ? 24.046139871393393
    left:X_6 <= 484 ? 13.301288283908562
        left:57.25
        right:71.29411764705883
    right:X_15 <= 19 ? 26.00999999999995
        left:71.2258064516129
        right:83.9758064516129


In [50]:
def calculate_rmse(y_true, y_pred):
    """Calculate the root mean squared error.

    Parameters
    ----------
    y_true : array-like of actual target values. May be flat or a single
        column of shape (n, 1).
    y_pred : array-like of predicted values, with the same number of elements
        as y_true.

    Returns
    -------
    float
        sqrt(mean((y_true - y_pred) ** 2)).

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of values.
    """
    # Flatten both inputs so an (n, 1) column of targets and a flat list of n
    # predictions are compared element by element instead of being broadcast
    # against each other, and so the result is a scalar rather than an array.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("y_true must contain at least one value")
    if actual.size != predicted.size:
        raise ValueError(
            "y_true and y_pred must have the same number of values "
            "(got %d and %d)" % (actual.size, predicted.size)
        )

    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

# Predict every held-out sample first: Y_pred is not assigned anywhere else in
# the notebook, so without this line a fresh kernel run fails with a NameError.
Y_pred = predict(X_test, tree)

# Calculate RMSE on the test split
rmse = calculate_rmse(Y_test, Y_pred)
print("Root Mean Squared Error (RMSE):", rmse)


Root Mean Squared Error (RMSE): [15.9139772]
