<a href="https://colab.research.google.com/github/CitricSpice98/Random-Forest-Classifier/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Online Assessment


Problem 1

Imports


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Data
Preprocessing


In [2]:
features = pd.read_csv('/content/student_dataset.csv')
labels = pd.read_csv('/content/generated_label_set.csv')

In [3]:
print("Features data:")
print(features.head())
print("\nLabels data:")
print(labels.head())

Features data:
              Name  SAT Score  High School GPA  Courses in Arts  \
0     Sarah Nelson        594             2.10                0   
1      James Marsh       1100             3.81                1   
2     James Galvan        494             2.06                1   
3  Russell Johnson       1246             3.69                2   
4       Maria Clay        726             1.22                1   

   Courses in STEM  Courses in Finance  Courses in Other  Age  Family Income  \
0                2                   2                 2   23         234000   
1                1                   3                 1   22          41000   
2                4                   1                 0   23          66000   
3                1                   1                 2   25          83000   
4                0                   3                 2   19         226000   

  Part Time Employment  Hours of Study  Hours of Extracurriculars  
0                   No           

Linear Regression


Random Forest

In [4]:
# Preprocessing
processed_features = features.drop(columns=['Name'])
processed_labels = labels.drop(columns=['Name'])

# Convert categorical variables to numerical
processed_features['Part Time Employment'] = processed_features['Part Time Employment'].map({'Yes': 1, 'No': 0})
processed_labels['Major']= processed_labels['Major'].map({'STEM': 0, 'Finance': 1,'Arts' : 2, 'Other' : 3})
processed_labels['Post Graduate Career']= processed_labels['Post Graduate Career'].map({'Out of industry employment': 0, 'In Industry employment': 1, 'Post graduate studies' : 2})
# Define X and y
y = processed_labels['GPA'].values
X = processed_features.values

# Add intercept term (bias)
X = np.hstack([np.ones((X.shape[0], 1)), X])  # Add column of 1s to X

In [5]:
def split_dataset(X, y, feature, threshold):
    left_idx = X[:, feature] <= threshold
    right_idx = X[:, feature] > threshold
    return X[left_idx], y[left_idx], X[right_idx], y[right_idx]

In [6]:
def mse(y):
    if len(y) == 0:
        return 0
    return np.mean((y - np.mean(y)) ** 2)

In [7]:
def best_split(X, y):
    best_feature, best_threshold, best_score = None, None, float('inf')
    n_features = X.shape[1]

    for feature in range(n_features):
        thresholds = np.unique(X[:, feature])
        for threshold in thresholds:
            X_left, y_left, X_right, y_right = split_dataset(X, y, feature, threshold)
            if len(y_left) == 0 or len(y_right) == 0:
                continue
            score = (len(y_left) * mse(y_left) + len(y_right) * mse(y_right)) / len(y)
            if score < best_score:
                best_score = score
                best_feature = feature
                best_threshold = threshold
    return best_feature, best_threshold

In [8]:
class TreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

def build_tree(X, y, max_depth, min_samples_split, depth=0):
    if depth >= max_depth or len(y) < min_samples_split or len(set(y)) == 1:
        return TreeNode(value=np.mean(y))

    feature, threshold = best_split(X, y)
    if feature is None:
        return TreeNode(value=np.mean(y))

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature, threshold)
    left_node = build_tree(X_left, y_left, max_depth, min_samples_split, depth + 1)
    right_node = build_tree(X_right, y_right, max_depth, min_samples_split, depth + 1)

    return TreeNode(feature, threshold, left_node, right_node)

def predict_tree(tree, x):
    while tree.value is None:
        if x[tree.feature] <= tree.threshold:
            tree = tree.left
        else:
            tree = tree.right
    return tree.value

In [9]:
class RandomForestRegressorScratch:
    def __init__(self, n_trees=10, max_depth=5, min_samples_split=2, sample_ratio=0.8):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_ratio = sample_ratio
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples = X.shape[0]
        for _ in range(self.n_trees):
            indices = np.random.choice(n_samples, int(self.sample_ratio * n_samples), replace=True)
            X_sample = X[indices]
            y_sample = y[indices]
            tree = build_tree(X_sample, y_sample, self.max_depth, self.min_samples_split)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([[predict_tree(tree, x) for tree in self.trees] for x in X])
        return np.mean(tree_preds, axis=1)

In [10]:
# Initialize model
model = RandomForestRegressorScratch(n_trees=5, max_depth=6)

# Train the model
model.fit(X, y)

# Predict on training data (or split it to evaluate properly)
predictions = model.predict(X)

In [11]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

rmse = root_mean_squared_error(y, predictions)
print(f"Training RMSE: {rmse:.4f}")

Training RMSE: 0.1739


In [12]:
print("predictions :",predictions)


predictions : [1.97921429 3.76055238 2.16365714 3.68509333 1.19061667 3.65271238
 3.27       2.20446154 1.362      1.74620741 2.36625714 2.68885714
 3.60202667 1.78484752 1.42510794 2.48186825 2.72769048 1.93266154
 1.79595    3.61408727 3.442      1.68430265 1.74915466 1.73042857
 3.47433333 3.61288727 1.85372963 3.054      1.64338598 1.56035043
 1.70334725 1.70458333 1.32675    2.14386667 3.37476    1.337
 3.56942061 3.17516    2.28685714 2.19152381 1.25213333 1.81119312
 1.211      3.68388571 1.16475    1.32306667 3.41776    3.73354632
 1.32675    1.20636667 2.64793557 1.64719312 1.71183614 3.35916
 1.35706667 3.095      1.1198     3.35516    3.24716    1.75058201
 1.42402222 1.5712328  3.60716    3.66337299 2.35333333 2.49075714
 2.62285714 3.62950632 1.24721667 1.50035    1.67980423 2.19602381
 3.53555394 2.3753     2.311      2.73176891 3.03541176 2.54613492
 2.974      1.76771154 3.40649333 2.16926154 1.58322222 3.53002667
 2.04302381 1.96166895 1.582      2.66026891 3.86151299 

In [13]:
y_class = processed_labels['Major'].values.flatten()  # this should be categorical (majors)
X_class = processed_features.values.astype(float)
print(set(type(label) for label in y_class))


{<class 'numpy.int64'>}


Calculate Gini Impurity ( How mixed up the classes are in the node )

In [14]:
def gini(y):
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return 1 - np.sum(probs ** 2)

In [15]:
def best_split_classification(X, y):
    best_feature, best_threshold, best_score = None, None, float('inf')
    n_features = X.shape[1]

    for feature in range(n_features):
        thresholds = np.unique(X[:, feature])
        for threshold in thresholds:
            left_idx = X[:, feature] <= threshold
            right_idx = ~left_idx
            y_left, y_right = y[left_idx], y[right_idx]
            if len(y_left) == 0 or len(y_right) == 0:
                continue
            score = (len(y_left) * gini(y_left) + len(y_right) * gini(y_right)) / len(y)
            if score < best_score:
                best_score = score
                best_feature = feature
                best_threshold = threshold
    return best_feature, best_threshold

In [16]:
class TreeNodeClassifier:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

In [17]:
from collections import Counter

def build_tree_classifier(X, y, max_depth, min_samples_split, depth=0):
    if depth >= max_depth or len(set(y)) == 1 or len(y) < min_samples_split:
        most_common = Counter(y).most_common(1)[0][0]
        return TreeNodeClassifier(value=most_common)

    feature, threshold = best_split_classification(X, y)
    if feature is None:
        most_common = Counter(y).most_common(1)[0][0]
        return TreeNodeClassifier(value=most_common)

    left_idx = X[:, feature] <= threshold
    right_idx = ~left_idx

    left = build_tree_classifier(X[left_idx], y[left_idx], max_depth, min_samples_split, depth+1)
    right = build_tree_classifier(X[right_idx], y[right_idx], max_depth, min_samples_split, depth+1)

    return TreeNodeClassifier(feature, threshold, left, right)

In [18]:
def predict_tree_classifier(tree, x):
    while tree.value is None:
        if x[tree.feature] <= tree.threshold:
            tree = tree.left
        else:
            tree = tree.right
    return tree.value

In [19]:
class RandomForestClassifierScratch:
    def __init__(self, n_trees=50, max_depth=6, min_samples_split=5, sample_ratio=0.75):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_ratio = sample_ratio
        self.trees = []

    def fit(self, X, y):
        n_samples = X.shape[0]
        for _ in range(self.n_trees):
            idxs = np.random.choice(n_samples, int(n_samples * self.sample_ratio), replace=True)
            X_sample = X[idxs]
            y_sample = y[idxs]
            tree = build_tree_classifier(X_sample, y_sample, self.max_depth, self.min_samples_split)
            self.trees.append(tree)

    def predict(self, X):
        predictions = []
        for x in X:
            votes = [predict_tree_classifier(tree, x) for tree in self.trees]
            final_vote = Counter(votes).most_common(1)[0][0]
            predictions.append(final_vote)
        return np.array(predictions)

In [20]:
from sklearn.metrics import accuracy_score  # just for evaluation

# Assume y is categorical (major), maybe string
# If it's not encoded, you can use LabelEncoder to convert it

rf_classifier = RandomForestClassifierScratch(n_trees=50,max_depth=6,min_samples_split=5,sample_ratio=0.9)
rf_classifier.fit(X_class, y_class)

predictions_class = rf_classifier.predict(X_class)
predictions_class = np.rint(predictions_class).astype(float)
# Evaluate accuracy on training set (ideally split into test set too)

accuracy = np.mean(predictions_class == y_class)
print(f"Training Accuracy: {accuracy:.2%}")

Training Accuracy: 87.00%


In [21]:
print("predictions:",predictions_class)
predictions_class = np.rint(predictions_class).astype(int)
print(" yclass : ", y_class)

predictions: [0. 1. 0. 2. 1. 2. 2. 0. 0. 2. 0. 0. 0. 0. 0. 2. 3. 2. 2. 0. 1. 2. 0. 1.
 1. 1. 0. 0. 3. 0. 1. 2. 2. 2. 2. 2. 0. 2. 3. 2. 2. 1. 0. 2. 0. 0. 0. 1.
 2. 1. 0. 3. 0. 0. 2. 2. 2. 2. 2. 2. 3. 3. 0. 0. 2. 1. 1. 0. 0. 1. 2. 0.
 2. 2. 0. 0. 1. 2. 1. 0. 3. 3. 3. 3. 2. 2. 2. 3. 3. 0. 0. 1. 0. 1. 3. 1.
 0. 2. 0. 2. 0. 2. 2. 2. 1. 1. 2. 2. 1. 2. 0. 2. 0. 2. 0. 2. 2. 0. 2. 0.
 2. 2. 0. 2. 2. 1. 3. 2. 2. 0. 2. 1. 3. 1. 2. 2. 3. 2. 2. 1. 0. 0. 2. 0.
 3. 1. 0. 1. 2. 2. 3. 2. 2. 0. 1. 2. 2. 2. 2. 0. 2. 2. 0. 0. 0. 0. 2. 3.
 1. 0. 0. 2. 0. 0. 3. 2. 0. 3. 1. 1. 0. 2. 0. 3. 0. 3. 3. 3. 2. 0. 1. 1.
 2. 2. 3. 0. 0. 1. 2. 3.]
 yclass :  [0 1 0 1 1 2 2 0 0 2 0 0 0 0 0 2 3 2 2 0 1 2 0 1 1 1 3 0 3 0 1 2 2 2 2 2 1
 0 2 0 2 1 3 2 0 0 0 1 2 1 0 3 0 1 2 2 2 0 2 2 3 1 0 0 2 1 1 0 0 1 2 3 2 2
 0 0 1 2 1 0 3 3 0 3 2 0 2 0 3 0 0 1 0 1 3 1 0 2 3 3 0 2 3 2 1 1 2 2 3 2 0
 2 0 2 0 2 2 0 2 0 0 2 0 2 2 1 3 2 2 0 2 1 3 1 2 2 3 1 2 1 0 0 2 3 3 0 0 1
 2 2 3 2 2 0 1 2 2 2 2 0 2 2 2 0 0 0 2 3 1 0 0 0 0 0 3 2 0 3 1 1 0

Predicting Post Graduate Employment

In [22]:
y_class_combined = processed_labels['Post Graduate Career'].values.flatten()  # this should be categorical (majors)
print(processed_features.info)
df_to_append = pd.DataFrame ()
X_class_combined = processed_features.values.astype(float)


<bound method DataFrame.info of      SAT Score  High School GPA  Courses in Arts  Courses in STEM  \
0          594             2.10                0                2   
1         1100             3.81                1                1   
2          494             2.06                1                4   
3         1246             3.69                2                1   
4          726             1.22                1                0   
..         ...              ...              ...              ...   
195       1055             3.49                1                3   
196        476             1.67                1                3   
197        946             3.50                1                1   
198       1001             2.44                2                2   
199        460             2.56                0                3   

     Courses in Finance  Courses in Other  Age  Family Income  \
0                     2                 2   23         234000   
1        

In [23]:
from sklearn.metrics import accuracy_score  # just for evaluation

# Assume y is categorical (major), maybe string
# If it's not encoded, you can use LabelEncoder to convert it

rf_classifier_combined = RandomForestClassifierScratch(n_trees=50,max_depth=6,min_samples_split=5,sample_ratio=0.9)
rf_classifier_combined.fit(X_class_combined, y_class_combined)

predictions_class_combined = rf_classifier_combined.predict(X_class_combined)
predictions_class_combined = np.rint(predictions_class_combined).astype(float)
# Evaluate accuracy on training set (ideally split into test set too)

accuracy = np.mean(predictions_class_combined == y_class)
print(f"Training Accuracy: {accuracy:.2%}")

Training Accuracy: 21.50%


In [24]:
print("predictions:",predictions_class_combined)
predictions_class_combined = np.rint(predictions_class_combined).astype(int)
print(" yclass : ", y_class_combined)

predictions: [0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 2. 0. 0. 0. 1. 0. 0. 2. 1. 1. 1. 0.
 2. 2. 1. 2. 1. 0. 0. 0. 0. 0. 1. 1. 2. 1. 0. 0. 1. 0. 0. 2. 1. 1. 0. 0.
 1. 0. 2. 1. 1. 2. 0. 1. 1. 1. 0. 0. 2. 1. 2. 2. 1. 1. 1. 2. 1. 1. 1. 1.
 2. 1. 1. 1. 2. 1. 2. 0. 1. 0. 0. 2. 0. 0. 1. 1. 1. 1. 0. 1. 2. 1. 1. 1.
 2. 2. 1. 2. 2. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 2. 1. 2. 2.
 1. 0. 1. 0. 0. 0. 1. 2. 1. 2. 1. 1. 0. 0. 1. 0. 0. 0. 0. 2. 1. 2. 1. 0.
 1. 0. 1. 1. 1. 0. 1. 2. 1. 0. 1. 2. 0. 2. 0. 1. 0. 2. 1. 2. 0. 1. 1. 0.
 0. 0. 1. 2. 1. 1. 2. 2. 1. 1. 0. 1. 1. 1. 2. 2. 1. 0. 1. 1. 2. 0. 2. 2.
 0. 0. 0. 2. 0. 2. 1. 2.]
 yclass :  [0 0 0 1 0 0 1 1 1 0 1 1 2 0 0 0 1 0 0 2 1 1 1 0 2 2 1 2 1 0 0 0 0 0 1 1 2
 1 2 0 1 0 0 2 1 2 0 0 1 0 2 1 1 2 0 1 1 1 0 0 2 1 2 2 1 1 1 2 1 1 1 1 2 1
 1 1 2 1 2 0 1 0 0 2 0 0 1 1 1 1 0 1 1 1 1 1 2 2 1 2 2 0 0 1 0 0 1 1 1 1 1
 1 1 2 0 0 2 1 2 2 1 0 1 0 0 2 1 2 1 2 1 1 0 0 1 0 0 0 0 2 1 2 1 0 1 0 1 1
 1 0 1 2 2 0 1 2 0 2 0 1 0 2 1 2 0 1 0 0 0 0 2 0 1 1 2 2 1 1 0 1 1