In [23]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import Pool
from random import sample
from sklearn.metrics import classification_report
import time
np.seterr(divide='ignore', invalid='ignore')  # ignore Runtime Warning about divide


class TreeNode:
    def __init__(self, n_features):
        self.n_features = n_features
        self.left_child = None
        self.right_child = None
        self.split_feature = None
        self.split_value = None
        self.split_gini = 1
        self.label = None

    def is_leaf(self):
        return self.label is not None

    def gini(self, f, y, target):
        trans = f.reshape(len(f), -1)  
        a = np.concatenate((trans, target), axis=1)  
        a = a[a[:, 0].argsort()]  
        sort = a[:, 0]
        split = (sort[0:-1] + sort[1:]) / 2  

        left, right = np.array([split]), np.array([split])
        classes, counts = np.unique(y, return_counts=True)
        n_classes = len(classes)
        
        for i in range(n_classes):
            temp = a[:, -n_classes + i].cumsum()[:-1]
            left = np.vstack((left, temp))  
            right = np.vstack((right, counts[i] - temp))

        sum_1 = left[1:, :].sum(axis=0)  
        sum_2 = right[1:, :].sum(axis=0)
        n = len(split)
        gini_t1, gini_t2 = [1] * n, [1] * n
        
        for i in range(n_classes):
            gini_t1 -= (left[i + 1, :] / sum_1) ** 2
            gini_t2 -= (right[i + 1, :] / sum_2) ** 2
        s = sum(counts)
        g = gini_t1 * sum_1 / s + gini_t2 * sum_2 / s
        g = list(g)
        min_g = min(g)
        split_value = split[g.index(min_g)]
        return split_value, min_g

    def split_feature_value(self, x, y, target):
        
        n = x.shape[1]  
        sub_features = sample(range(n), self.n_features)  
        
        value_g = [self.gini(x[:, i], y, target) for i in sub_features]
        result = min(value_g, key=lambda t: t[1])  
        feature = sub_features[value_g.index(result)]  
        return feature, result[0], result[1]  

    
    def attempt_split(self, x, y, target):
        c = Counter(y)
        majority = c.most_common()[0]  
        label, count = majority[0], majority[1]
        if len(y) < 2 or len(c) == 1 or count/len(y) > 0.9:  
            self.label = label  
            return
        
        feature, value, split_gini = self.split_feature_value(x, y, target)
        
        if self.split_gini - split_gini < 0.01:  
            self.label = label  
            return
        index1 = x[:, feature] <= value
        index2 = x[:, feature] > value
        x1, y1, x2, y2 = x[index1], y[index1], x[index2], y[index2]
        target1, target2 = target[index1], target[index2]
        if len(y2) == 0 or len(y1) == 0:  
            self.label = label  
            return
        
        self.split_feature = feature
        self.split_value = value
        self.split_gini = split_gini
        self.left_child, self.right_child = TreeNode(self.n_features), TreeNode(self.n_features)
        self.left_child.split_gini, self.right_child.split_gini = split_gini, split_gini
        self.left_child.attempt_split(x1, y1, target1)
        self.right_child.attempt_split(x2, y2, target2)

    
    def sort(self, x):  
        if self.label is not None:
            return self.label
        if x[self.split_feature] <= self.split_value:
            return self.left_child.sort(x)
        else:
            return self.right_child.sort(x)


class ClassifierTree:
    def __init__(self, n_features):
        self.root = TreeNode(n_features)

    def train(self, x, y):
        encoder = OneHotEncoder(categories='auto')
        labels = y.reshape(len(y), -1) 
        target = encoder.fit_transform(labels).toarray()
        self.root.attempt_split(x, y, target)

    def classify(self, x):  # x is 2d array
        return [self.root.sort(x[i]) for i in range(x.shape[0])]


class RandomForest:
    def __init__(self, n_classifiers=30):
        self.n_classifiers = n_classifiers
        self.classifiers = []
        self.x = None
        self.y = None

    def build_tree(self, tree):
        n = len(self.y)  
        x, y = resample(self.x, self.y, n_samples=n)  
        tree.train(x, y)
        return tree  

    def fit(self, x, y):
        self.x, self.y = x, y
        n_select_features = int(np.sqrt(x.shape[1]))  
        for i in range(self.n_classifiers):
            tree = ClassifierTree(n_select_features)
            self.classifiers.append(tree)
        
        pool = Pool()
        self.classifiers = pool.map(self.build_tree, self.classifiers)
        pool.close()
        pool.join()

    def predict(self, x_test):  
        pred = [tree.classify(x_test) for tree in self.classifiers]
        pred = np.array(pred)
        result = [Counter(pred[:, i]).most_common()[0][0] for i in range(pred.shape[1])]
        return result



In [3]:
def read_csv(path):
    marks_df = pd.read_csv(path)
    return marks_df

In [4]:
data = read_csv("datasets/clean_tmdb.csv")

X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X = X.to_numpy()
y = y.to_numpy()

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
model = RandomForest(n_classifiers=50)
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("My rf:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)


My rf:

Train:

accuracy: 0.6541330018645121
report:
              precision    recall  f1-score   support

           0       0.33      0.71      0.45       632
           1       0.90      0.64      0.75      2586

    accuracy                           0.65      3218
   macro avg       0.61      0.68      0.60      3218
weighted avg       0.79      0.65      0.69      3218

*******************************************************

Test:

accuracy: 0.661198738170347
report:
              precision    recall  f1-score   support

           0       0.32      0.72      0.44       299
           1       0.91      0.65      0.76      1286

    accuracy                           0.66      1585
   macro avg       0.61      0.68      0.60      1585
weighted avg       0.80      0.66      0.70      1585



In [24]:
# Using scikit-learn
model = RandomForestClassifier(n_estimators=50, max_depth=2)
model.fit(X_train, y_train)


predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("Scikit-learn:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

Scikit-learn:

Train:

accuracy: 0.6991920447482909
report:
              precision    recall  f1-score   support

           0       0.53      0.70      0.60      1047
           1       0.83      0.70      0.76      2171

    accuracy                           0.70      3218
   macro avg       0.68      0.70      0.68      3218
weighted avg       0.73      0.70      0.71      3218

*******************************************************

Test:

accuracy: 0.7041009463722397
report:
              precision    recall  f1-score   support

           0       0.51      0.71      0.59       479
           1       0.85      0.70      0.77      1106

    accuracy                           0.70      1585
   macro avg       0.68      0.70      0.68      1585
weighted avg       0.74      0.70      0.71      1585

