In [15]:
import pandas as pd
from graphviz import Digraph

In [16]:
# Feature columns considered for splitting, and the name of the class-label column.
column_names = ['RBC', 'HGB', 'HCT', 'MCV', 'MCH', 'MCHC']
Target_name = 'SONUÇ'

# Load the training and test spreadsheets (paths are relative to the working directory).
veri = pd.read_excel('yeniEğitim3.xlsx')
test_data = pd.read_excel('yeniTest3.xlsx')

# Keep a handle on the label column, then remove 'Record' from the training frame.
# A non-mutating drop is used so the frame that SONUÇ_column was taken from is
# never modified behind its back; `veri` is simply rebound to the reduced frame.
SONUÇ_column = veri[Target_name]
veri = veri.drop(columns=['Record'])

print(veri)


       RBC    HGB    HCT    MCV    MCH   MCHC SONUÇ
0     2.60   3.60  15.70  59.20  13.60  22.90   yes
1     2.83   4.72  24.32  85.92  26.10  30.52   yes
2     4.61   6.62  24.22  52.52  14.32  27.32   yes
3     3.90   7.02  24.82  63.62  17.92  28.22   yes
4     2.07   7.32  22.92  10.61  35.30  31.90   yes
...    ...    ...    ...    ...    ...    ...   ...
1728  5.27  16.00  46.50  88.20  30.40  34.40    no
1729  5.10  16.00  45.50  89.70  31.60  35.20    no
1730  5.30  16.10  46.70  88.10  30.40  34.50    no
1731  5.60  16.30  46.20  81.90  28.90  35.30    no
1732  5.20  16.50  46.80  89.50  31.50  35.30    no

[1733 rows x 7 columns]


In [17]:
def calculate_column_means(data_dict, column_names):
    """Return the arithmetic mean of each requested column.

    Args:
        data_dict: Mapping-like container (a dict or a DataFrame) that supports
            ``name in data_dict`` and ``data_dict[name]``; each entry is a sized
            iterable of values convertible with ``float``.
        column_names: Names of the columns to average. Names that are not
            present in ``data_dict`` are silently skipped.

    Returns:
        dict mapping each present column name to ``sum(values) / len(values)``,
        in the order the names appear in ``column_names``.

    Raises:
        ZeroDivisionError: If a present column has no values.
    """
    available = (name for name in column_names if name in data_dict)
    return {
        name: sum(float(item) for item in data_dict[name]) / len(data_dict[name])
        for name in available
    }

# Per-feature means over the whole training frame (the same means are later
# recomputed per subset inside TurGerekliColumnHesap and used as split thresholds).
column_means = calculate_column_means(veri,column_names)
print(column_means)

{'RBC': 4.485683785343335, 'HGB': 12.450109636468552, 'HCT': 37.98213502596653, 'MCV': 84.62094633583382, 'MCH': 27.836497403346797, 'MCHC': 32.70079630698211}


In [18]:
def calculate_gini(data):
    """Compute the Gini impurity of ``data`` over the 'yes' / 'no' classes.

    The class label is read from the column named by the module-level
    ``Target_name``. Counting is done with vectorised comparisons instead of a
    per-row ``iterrows()`` loop; this function is called for every feature at
    every tree node, so the row loop dominated training time.

    Args:
        data: DataFrame containing the ``Target_name`` column.

    Returns:
        Tuple ``(gini, labelled_count)`` where ``gini`` is
        ``1 - p_yes**2 - p_no**2`` with probabilities taken over *all* rows of
        ``data``, and ``labelled_count`` is the number of rows whose label is
        exactly 'yes' or 'no'. For an empty frame the result is ``(0.0, 0)``.
    """
    total_samples = len(data)
    if total_samples == 0:
        # No rows: report zero impurity and zero labelled samples.
        return 0.0, 0

    labels = data[Target_name]
    # int() keeps the returned count a plain Python int, as before.
    yes_count = int((labels == 'yes').sum())
    no_count = int((labels == 'no').sum())

    class_prob_yes = yes_count / total_samples
    class_prob_no = no_count / total_samples
    gini = 1.0 - (class_prob_yes ** 2 + class_prob_no ** 2)

    return gini, yes_count + no_count


def gini_impurity(data, threshold, column):
    """Split ``data`` on ``column`` at ``threshold`` and score both halves.

    Rows with ``data[column] <= threshold`` form the left part and rows with
    ``data[column] > threshold`` form the right part; each part is passed to
    ``calculate_gini``.

    Returns:
        Tuple ``(gini_left, gini_right, count_left, count_right)`` where the
        counts are the second values returned by ``calculate_gini``.
    """
    feature = data[column]
    lower_part = data[feature <= threshold]
    upper_part = data[feature > threshold]

    lower_gini, lower_count = calculate_gini(lower_part)
    upper_gini, upper_count = calculate_gini(upper_part)

    return lower_gini, upper_gini, lower_count, upper_count

def calculate_combined_gini(gini_sol, gini_sag, sol_samples, sag_samples):
    """Return the sample-weighted average of the left and right Gini values.

    Args:
        gini_sol: Gini impurity of the left part.
        gini_sag: Gini impurity of the right part.
        sol_samples: Number of samples in the left part.
        sag_samples: Number of samples in the right part.

    Raises:
        ZeroDivisionError: If both sample counts are zero.
    """
    n_total = sol_samples + sag_samples
    left_weight = sol_samples / n_total
    right_weight = sag_samples / n_total
    return left_weight * gini_sol + right_weight * gini_sag

    

In [19]:
def TurGerekliColumnHesap(yVeri):
    """Pick the feature whose mean-threshold split has the lowest combined Gini.

    For every name in the module-level ``column_names`` the column mean of
    ``yVeri`` is used as the split threshold, and the split is scored with
    ``gini_impurity`` + ``calculate_combined_gini``.

    Returns:
        Tuple ``(column_name, column_mean)`` for the lowest-scoring feature;
        on ties the feature that comes first in ``column_names`` wins.
    """
    means = calculate_column_means(yVeri, column_names)

    scores = {}
    for name in column_names:
        split_stats = gini_impurity(yVeri, means[name], name)
        # split_stats is (gini_left, gini_right, count_left, count_right).
        scores[name] = calculate_combined_gini(*split_stats)

    best_column = min(scores, key=scores.get)
    return best_column, means[best_column]

# Sanity check: best split feature and its threshold on the full training frame.
TurGerekliColumnHesap(veri)


('HGB', 12.450109636468552)

In [21]:
class Node:
    """A node of the binary decision tree.

    Internal nodes carry a split (``column`` / ``threshold``) and two children;
    leaf nodes additionally carry a class ``label``.
    """

    def __init__(self, column=None, threshold=None, left=None, right=None, label=None):
        # Split definition: feature name and the threshold (mean) value.
        self.column, self.threshold = column, threshold
        # Child branches: left and right.
        self.left, self.right = left, right
        # Class label for leaf nodes; None for internal nodes.
        self.label = label

    def is_leaf(self):
        """Return True when this node has a class label assigned."""
        return not (self.label is None)


In [22]:
import random

def _add_leaf(data, column, threshold, dot, node_ids, side=None):
    """Create a leaf Node for ``data``, draw it in ``dot`` and register its id."""
    # Majority class of the subset; for a pure subset this is its only class.
    leaf_label = data[Target_name].value_counts().idxmax()
    leaf = Node(column=column, threshold=threshold, label=leaf_label)

    suffix = f'_{side}' if side else ''
    # len(node_ids) is a running index, so every id is unique and reproducible.
    leaf_id = f'{leaf_label}_{threshold:.2f}{suffix}_{len(node_ids)}'
    dot.node(leaf_id, str(leaf_label))
    node_ids[leaf] = leaf_id
    return leaf


def build_tree(data, dot=None, node_ids=None):
    """Recursively build the decision tree and its Graphviz drawing.

    At each node the feature/threshold pair returned by
    ``TurGerekliColumnHesap`` is used: rows with ``value <= threshold`` go left,
    rows with ``value > threshold`` go right. A side whose rows all share one
    class becomes a leaf; otherwise the side is built recursively.

    Differences from a naive recursion, all of which only affect cases that
    previously failed or drew a wrong graph:
      * If the chosen split leaves one side empty (e.g. the feature is constant
        in ``data``), no progress is possible, so a majority-class leaf is
        returned instead of recursing forever on the same rows or indexing into
        an empty subset.
      * If ``data`` already contains a single class, a leaf is returned.
      * Graph node ids use a running index instead of a random number, so two
        nodes can never share an id (which would merge them in the drawing)
        and the output is reproducible.

    Args:
        data: Non-empty DataFrame with the feature columns and ``Target_name``.
        dot: Digraph to draw into; a new one is created when None.
        node_ids: Dict mapping Node -> graph node id; created when None.

    Returns:
        Tuple ``(node, dot, node_ids)`` where ``node`` is the root of the
        (sub)tree built for ``data``.
    """
    if dot is None:
        dot = Digraph(comment='Decision Tree')

    if node_ids is None:
        node_ids = {}

    # Feature with the lowest combined Gini and its mean, used as threshold.
    min_gini_column, ortalama_deger = TurGerekliColumnHesap(data)

    left_data = data[data[min_gini_column] <= ortalama_deger]
    right_data = data[data[min_gini_column] > ortalama_deger]

    # Stop when there is nothing to separate or the split separates nothing.
    is_pure = len(set(data[Target_name])) <= 1
    if is_pure or left_data.empty or right_data.empty:
        leaf = _add_leaf(data, min_gini_column, ortalama_deger, dot, node_ids)
        return leaf, dot, node_ids

    root_node = Node(column=min_gini_column, threshold=ortalama_deger)
    node_id = f'{min_gini_column}_{ortalama_deger:.2f}_{len(node_ids)}'
    node_ids[root_node] = node_id
    dot.node(node_id, f'{min_gini_column}')

    # Both sides are non-empty here, so each child is strictly smaller than
    # ``data`` and the recursion is guaranteed to terminate.
    branches = (('left', left_data, '<='), ('right', right_data, '>'))
    for side, subset, operator in branches:
        if len(set(subset[Target_name])) > 1:
            child, dot, node_ids = build_tree(subset, dot=dot, node_ids=node_ids)
        else:
            child = _add_leaf(subset, min_gini_column, ortalama_deger,
                              dot, node_ids, side=side)
        setattr(root_node, side, child)
        dot.edge(node_id, node_ids[child], label=f'{operator} {ortalama_deger:.2f}')

    return root_node, dot, node_ids

# Train on the full training frame; the returned node-id map is not needed here.
tree, dot, _ = build_tree(veri)

# Write the Graphviz drawing to 'binary_tree' (the recorded output shows
# 'binary_tree.pdf'); view=True asks graphviz to open the rendered file.
dot.render('binary_tree', view=True)


'binary_tree.pdf'

In [23]:
def predict_tree(node, data_point):
    """Walk the tree from ``node`` down to a leaf and return that leaf's label.

    At each internal node the value ``data_point[node.column]`` is compared with
    ``node.threshold``: ``<=`` follows the left child, anything else the right.

    Args:
        node: Root of the (sub)tree; must expose ``is_leaf()``, ``column``,
            ``threshold``, ``left``, ``right`` and ``label``.
        data_point: Mapping-like record indexable by column name.
    """
    current = node
    while True:
        if current.is_leaf():
            return current.label
        goes_left = data_point[current.column] <= current.threshold
        current = current.left if goes_left else current.right

def evaluate_tree(tree, test_data):
    """Count correct and incorrect predictions of ``tree`` on ``test_data``.

    Every row of ``test_data`` is classified with ``predict_tree`` and compared
    with the row's value in the ``Target_name`` column.

    Returns:
        Tuple ``(correct_predictions, incorrect_predictions)``.
    """
    hits = 0
    seen = 0

    for _, sample in test_data.iterrows():
        seen += 1
        if predict_tree(tree, sample) == sample[Target_name]:
            hits += 1

    return hits, seen - hits



# Score the trained tree on the held-out test spreadsheet and report raw counts.
correct_predictions, incorrect_predictions = evaluate_tree(tree, test_data)

print(f"Correct Predictions: {correct_predictions}")
print(f"Incorrect Predictions: {incorrect_predictions}")


Correct Predictions: 854
Incorrect Predictions: 12
