In [46]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report

class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [40]:
def read_csv(path):
    marks_df = pd.read_csv(path)
    return marks_df

In [41]:
data = read_csv("datasets/clean_tmdb.csv")

X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X = X.to_numpy()
y = y.to_numpy()

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [52]:
model = DecisionTreeClassifier(max_depth=5)
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("My ds:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

My ds:

Train:

accuracy: 0.7454940957116222
report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72      1556
           1       0.73      0.81      0.77      1662

    accuracy                           0.75      3218
   macro avg       0.75      0.74      0.74      3218
weighted avg       0.75      0.75      0.74      3218

*******************************************************

Test:

accuracy: 0.7299684542586751
report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       760
           1       0.72      0.80      0.75       825

    accuracy                           0.73      1585
   macro avg       0.73      0.73      0.73      1585
weighted avg       0.73      0.73      0.73      1585



In [53]:
# Using scikit-learn
model = tree.DecisionTreeClassifier(max_depth=5)
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("Scikit-learn:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

Scikit-learn:

Train:

accuracy: 0.7454940957116222
report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72      1556
           1       0.73      0.81      0.77      1662

    accuracy                           0.75      3218
   macro avg       0.75      0.74      0.74      3218
weighted avg       0.75      0.75      0.74      3218

*******************************************************

Test:

accuracy: 0.7305993690851735
report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       761
           1       0.72      0.80      0.76       824

    accuracy                           0.73      1585
   macro avg       0.73      0.73      0.73      1585
weighted avg       0.73      0.73      0.73      1585

