In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [45]:
df.dtypes

mpg             float64
 cylinders        int64
 cubicinches     object
 hp               int64
 weightlbs       object
 time-to-60       int64
 year             int64
 brand           object
dtype: object

In [46]:
df[' cubicinches'] = pd.to_numeric(df[' cubicinches'], errors='coerce')
df[' weightlbs'] = pd.to_numeric(df[' weightlbs'], errors='coerce')

In [47]:
df.isna().sum()

mpg             0
 cylinders      0
 cubicinches    2
 hp             0
 weightlbs      3
 time-to-60     0
 year           0
 brand          0
dtype: int64

In [48]:
df.dropna(inplace=True)

In [49]:
df.duplicated().sum()

np.int64(0)

In [50]:
mappings = {}
for col in df.select_dtypes(include='object'):
    unique_labels = df[col].unique()
    value_to_label = {key: value for value, key in enumerate(unique_labels)}
    mappings[col] = value_to_label
    df[col] = df[col].map(value_to_label)

In [51]:
# mappings

In [52]:
# df.head()

In [53]:
x = df.drop(columns = ' brand')
y = df[' brand']

In [54]:
# x

In [55]:
np.random.seed(0)
train_size = int(len(x) * 0.8)
idx = np.random.permutation(len(x))

x_train, x_test = x.iloc[idx[:train_size]], x.iloc[idx[train_size:]]
y_train, y_test = y.iloc[idx[:train_size]], y.iloc[idx[train_size:]]

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(204, 7)
(52, 7)
(204,)
(52,)


In [56]:
# print(type(x_train))
# print(type(x_test))
# print(type(y_train))
# print(type(y_test))

In [57]:
# from sklearn.tree import DecisionTreeClassifier

In [58]:
# model = DecisionTreeClassifier(max_depth=5, criterion='gini', splitter='best')

In [59]:
# model.fit(x_train, y_train)

In [60]:
# y_pred = model.predict(x_test)

In [61]:
from collections import Counter

In [62]:
class Node:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

def gini_index(y):
    m = len(y)
    return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in np.unique(y))

def entropy(y):
    m = len(y)
    return -sum((np.sum(y == c) / m) * np.log2(np.sum(y == c) / m) for c in np.unique(y))

def grow_tree(X, y, depth=0, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion='gini'):
    num_samples_per_class = [np.sum(y == i) for i in np.unique(y)] #jumlah per kelas
    predicted_class = np.argmax(num_samples_per_class) #kelas yang paling banyak
    node = Node(
        gini=gini_index(y) if criterion == 'gini' else entropy(y),
        num_samples=len(y),
        num_samples_per_class=num_samples_per_class,
        predicted_class=predicted_class,
    )

    if depth < max_depth and len(y) >= min_samples_split:
        idx, thr = best_split(X, y, min_samples_leaf, criterion)
        if idx is not None:
            indices_left = X[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = grow_tree(X_left, y_left, depth + 1, max_depth, min_samples_split, min_samples_leaf, criterion)
            node.right = grow_tree(X_right, y_right, depth + 1, max_depth, min_samples_split, min_samples_leaf, criterion)
    return node

def best_split(X, y, min_samples_leaf, criterion):
    m, n = X.shape
    if m <= 1:
        return None, None

    num_parent = [np.sum(y == c) for c in np.unique(y)]
    best_gini = 1.0 - sum((num / m) ** 2 for num in num_parent) if criterion == 'gini' else entropy(y)
    best_idx, best_thr = None, None

    for idx in range(n):
        thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
        num_left = Counter()
        num_right = Counter(num_parent)
        for i in range(1, m):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            if i < min_samples_leaf or m - i < min_samples_leaf:
                continue
            gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in num_left) if criterion == 'gini' else entropy(list(num_left.elements()))
            gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in num_right) if criterion == 'gini' else entropy(list(num_right.elements()))
            gini = (i * gini_left + (m - i) * gini_right) / m

            if thresholds[i] == thresholds[i - 1]:
                continue

            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2
    return best_idx, best_thr

In [63]:
class DecisionTreeClassifierFromScratch:
    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion='gini'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        self.tree = grow_tree(X, y, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, criterion=self.criterion)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        node = self.tree
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [64]:
model = DecisionTreeClassifierFromScratch(max_depth=6, min_samples_split=5, min_samples_leaf=4, criterion='gini')

In [65]:
model.fit(x_train.values, y_train.values)

In [66]:
y_pred = model.predict(x_test.values)

In [67]:
y_pred

[np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(0)]

In [68]:
accuracy = np.mean(y_test == y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.4807692307692308


In [69]:
from customtkinter import *

In [86]:
x_test.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year
214,22.0,6,225.0,100,3233.0,15,1977
130,26.0,4,97.0,78,2300.0,15,1975
190,25.8,4,156.0,92,2620.0,14,1982
231,28.0,4,120.0,79,2625.0,19,1983
242,16.0,8,400.0,180,4220.0,11,1978


In [88]:
y_test

214    0
130    1
190    0
231    0
242    0
34     2
170    2
145    1
218    0
150    2
30     2
182    0
246    0
102    0
85     2
257    2
179    2
82     0
202    0
213    1
118    0
151    2
253    0
75     0
80     0
26     2
168    2
84     1
245    0
178    0
248    2
42     0
235    1
198    1
61     2
143    2
91     2
221    1
73     2
90     2
38     0
247    1
22     0
216    2
9      2
106    1
200    1
70     0
197    1
120    1
50     0
176    0
Name:  brand, dtype: int64

In [77]:
mappings[' brand']

{' US.': 0, ' Europe.': 1, ' Japan.': 2}

In [84]:
def gui_call():
    baseplate = CTk()
    baseplate.title('Decision tree GUI')
    set_appearance_mode('dark')
    baseplate.geometry('650x550')
    
    def answer():
        try:
            raw_data = np.array([
                float(entry_input1.get()),
                int(entry_input2.get()),
                float(entry_input3.get()),
                float(entry_input4.get()),
                float(entry_input5.get()),
                float(entry_input6.get()),
                int(entry_input7.get()),
            ]).reshape(1, -1)
            print(raw_data)
            predict = model.predict(raw_data)[0]
            pred_dict = {0 : 'US', 1 :' Europe.', 2:' Japan.'}
            print(pred_dict[predict])
            label_output.configure(text=f'Prediksi {pred_dict[predict]}', text_color='light green',font=('Arial',14))
            
        except Exception as e:
            print(f'Error occured {e}')
            label_output.configure(text=f'Error occured {e}', text_color='red',font=('Arial',14))
            
    
    label_title = CTkLabel(text='Car Country Checker',master=baseplate, font=('Arial', 14))
    label_title.place(relx=0.4,rely=0.1)
    
    label_input1 = CTkLabel(text='Masukkan nilai MPG',master=baseplate, font=('Arial', 12))
    label_input1.place(relx=0.05,rely=0.2)
    entry_input1 = CTkEntry(placeholder_text='Masukkan nilai MPG',master=baseplate, font=('Arial', 12), width=300)
    entry_input1.place(relx=0.30,rely=0.2) 
    
    label_input2 = CTkLabel(text='Masukkan jumlah cylinders',master=baseplate, font=('Arial', 12))
    label_input2.place(relx=0.05,rely=0.28)
    entry_input2 = CTkEntry(placeholder_text='Masukkan jumlah cylinders mobil',master=baseplate, font=('Arial', 12), width=300)
    entry_input2.place(relx=0.30,rely=0.28)
    
    label_input3 = CTkLabel(text='Masukkan jumlah in^3',master=baseplate, font=('Arial', 12))
    label_input3.place(relx=0.05,rely=0.36)
    entry_input3 = CTkEntry(placeholder_text='Masukkan jumlah in^3',master=baseplate, font=('Arial', 12), width=300)
    entry_input3.place(relx=0.30,rely=0.36)
    
    label_input4 = CTkLabel(text='Masukkan jumlah horsepower',master=baseplate, font=('Arial', 12))
    label_input4.place(relx=0.05,rely=0.44)
    entry_input4 = CTkEntry(placeholder_text='Masukkan jumlah horsepower',master=baseplate, font=('Arial', 12), width=300)
    entry_input4.place(relx=0.30,rely=0.44)
    
    label_input5 = CTkLabel(text='Masukkan berat mobil lbs',master=baseplate, font=('Arial', 12))
    label_input5.place(relx=0.05,rely=0.52)
    entry_input5 = CTkEntry(placeholder_text='Masukkan berat mobil lbs',master=baseplate, font=('Arial', 12), width=300)
    entry_input5.place(relx=0.30,rely=0.52)
    
    label_input6 = CTkLabel(text='Masukkan time to 60',master=baseplate, font=('Arial', 12))
    label_input6.place(relx=0.05,rely=0.60)
    entry_input6 = CTkEntry(placeholder_text='Masukkan time to 60',master=baseplate, font=('Arial', 12), width=300)
    entry_input6.place(relx=0.30,rely=0.60)
    
    label_input7 = CTkLabel(text='Masukkan tahun mobil dibuat',master=baseplate, font=('Arial', 12))
    label_input7.place(relx=0.05,rely=0.68)
    entry_input7 = CTkEntry(placeholder_text='Masukkan tahun mobil dibuat',master=baseplate, font=('Arial', 12), width=300)
    entry_input7.place(relx=0.30,rely=0.68)
    
    btn = CTkButton(text='Enter' ,master=baseplate, fg_color='green', command=answer)
    btn.place(relx=0.3,rely=0.76)
    
    label_output = CTkLabel(master=baseplate, text='', font=('Arial',14), text_color='red')
    label_output.place(relx=0.3,rely=0.85)

    baseplate.mainloop()

In [87]:
gui_call()

[[  22.    6.  225.  100. 3233.   15. 1977.]]
US
