# **Cài đặt cây quyết định sử dụng Gini**

In [None]:
import numpy as np
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of row collections (for example the ``(left, right)``
            pair returned by ``split_data``). The last element of every row
            is read as its class label.
        classes: Class labels whose proportions are measured in each group.

    Returns:
        The sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
        the group's share of all rows; ``0.0`` when every group is empty.
    """
    n_total = float(sum(len(part) for part in groups))
    weighted_impurity = 0.0
    for part in groups:
        n_part = float(len(part))
        if n_part != 0:
            # Sum of squared class shares inside this group.
            purity = 0
            for label in classes:
                share = [record[-1] for record in part].count(label) / n_part
                purity += share ** 2
            # Empty groups never reach this line, so they add nothing.
            weighted_impurity += (1 - purity) * (n_part / n_total)
    return weighted_impurity

def split_data(dataset, feature_index, threshold):
    """Partition rows by comparing one feature against a threshold.

    Args:
        dataset: Rows to partition; each row is indexed with ``feature_index``.
        feature_index: Position of the feature to compare.
        threshold: Cut value.

    Returns:
        A ``(left, right)`` tuple of lists: ``left`` holds the rows whose
        feature is ``< threshold`` and ``right`` those ``>= threshold``,
        both in their original order.
    """
    below = list(filter(lambda record: record[feature_index] < threshold, dataset))
    at_or_above = list(filter(lambda record: record[feature_index] >= threshold, dataset))
    return below, at_or_above

In [None]:

# Test case: split a one-feature dataset at 3.0 and print the Gini index.

dataset = [
    [2.8, 'Yes'],
    [1.2, 'No'],
    [3.6, 'Yes'],
    [4.5, 'No'],
    [5.1, 'Yes'],
]

classes = ['Yes', 'No']
# Rows with feature 0 below 3.0 go left, all others go right.
groups = split_data(dataset, feature_index=0, threshold=3.0)
gini = gini_index(groups, classes)
print(f"Gini Index: {gini:.4f}")

Gini Index: 0.4667


# Xây dựng cây quyết định sử dụng OOP

## Lớp TreeNode - Biểu diễn một nút trong cây

In [None]:
class TreeNode:
    """One node of a decision tree.

    A node built with ``label`` set is treated as a leaf by the tree printer;
    otherwise ``feature_index`` / ``threshold`` describe the split and
    ``left`` / ``right`` reference the child nodes.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, label=None):
        # Split description.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes.
        self.left, self.right = left, right
        # Class label carried by the node.
        self.label = label

## Lớp DecisionTree - Xây dựng cây quyết định

In [None]:
import numpy as np
class DecisionTree:
    """Binary classification tree grown by minimising the Gini index.

    Every dataset row is a sequence whose last element is the class label.
    The preceding elements are feature values; a split sends rows whose
    feature is ``< threshold`` to the left child and rows ``>= threshold``
    to the right child.
    """

    def __init__(self, max_depth=3):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which ``build_tree`` stops splitting and
                emits a leaf instead.
        """
        self.max_depth = max_depth
        self.root = None  # Assigned by fit().

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the label seen first."""
        labels = list(labels)
        # dict.fromkeys keeps first-seen order and max() returns the first
        # maximal candidate, so the result does not depend on set ordering.
        return max(dict.fromkeys(labels), key=labels.count)

    def gini_index(self, groups, classes):
        """Return the size-weighted Gini impurity of ``groups``.

        Args:
            groups: Iterable of row collections; the last element of each row
                is read as its class label.
            classes: Class labels whose proportions are measured.

        Returns:
            Sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
            the group's share of all rows; ``0.0`` if every group is empty.
        """
        total_samples = float(sum(len(group) for group in groups))
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Extract the labels once per group instead of once per class.
            labels = [row[-1] for row in group]
            score = 0.0
            for class_val in classes:
                proportion = labels.count(class_val) / size
                score += proportion ** 2
            gini += (1.0 - score) * (size / total_samples)
        return gini

    def split_data(self, dataset, feature_index, threshold):
        """Partition ``dataset`` on one feature.

        Returns:
            ``(left, right)`` where ``left`` holds the rows with
            ``row[feature_index] < threshold`` and ``right`` the rows with
            ``row[feature_index] >= threshold``.
        """
        left = [row for row in dataset if row[feature_index] < threshold]
        right = [row for row in dataset if row[feature_index] >= threshold]
        return left, right

    def spilit_data(self, dataset, feature_index, threshold):
        """Misspelled original name of ``split_data``, kept for existing callers."""
        return self.split_data(dataset, feature_index, threshold)

    def best_split(self, dataset):
        """Find the split with the lowest Gini index.

        Every value that occurs in every feature column is tried as a
        threshold. On equal Gini the earliest candidate wins (feature order,
        then row order).

        Args:
            dataset: Non-empty sequence of rows.

        Returns:
            ``(feature_index, threshold, (left, right))``; all three are
            ``None`` when the rows contain no feature columns.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if len(dataset) == 0:
            raise ValueError("dataset must contain at least one row")
        # Unique labels in first-seen order (deterministic, unlike a set).
        class_values = list(dict.fromkeys(row[-1] for row in dataset))
        best_index, best_threshold, best_score, best_groups = None, None, float('inf'), None
        for index in range(len(dataset[0]) - 1):
            for row in dataset:
                groups = self.split_data(dataset, index, row[index])
                gini = self.gini_index(groups, class_values)
                if gini < best_score:
                    best_index, best_threshold, best_score, best_groups = index, row[index], gini, groups
        return best_index, best_threshold, best_groups

    def build_tree(self, dataset, depth=0):
        """Recursively grow the subtree for ``dataset``.

        A leaf carrying the majority label is produced when all rows share
        one label, when ``depth`` has reached ``max_depth``, when no split
        candidate exists, or when the best split leaves one side empty.

        Args:
            dataset: Non-empty sequence of rows.
            depth: Depth of the node being built (0 for the root).

        Returns:
            The ``TreeNode`` at the root of the subtree.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if len(dataset) == 0:
            raise ValueError("dataset must contain at least one row")
        class_values = [row[-1] for row in dataset]
        if len(set(class_values)) == 1 or depth >= self.max_depth:
            return TreeNode(label=self._majority_label(class_values))
        feature_index, threshold, groups = self.best_split(dataset)
        # groups is None when the rows hold nothing but a label.
        if groups is None or not groups[0] or not groups[1]:
            return TreeNode(label=self._majority_label(class_values))
        left, right = groups
        left_node = self.build_tree(left, depth + 1)
        right_node = self.build_tree(right, depth + 1)
        return TreeNode(feature_index, threshold, left_node, right_node)

    def fit(self, dataset):
        """Build the tree from ``dataset`` and store it in ``self.root``.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        self.root = self.build_tree(dataset)

    def print_tree(self, node=None, depth=0):
        """Print the subtree under ``node``, one node per line.

        Args:
            node: Node to start from; ``None`` means the fitted root.
            depth: Nesting level, rendered as that many leading spaces.

        Raises:
            RuntimeError: If no node is given and the tree is not fitted.
        """
        if node is None:
            node = self.root
        if node is None:
            raise RuntimeError("print_tree() called before fit()")
        if node.label is not None:
            print(f"{' ' * depth} [Leaf] Label: {node.label}")
        else:
            # The left child holds rows strictly below the threshold, so the
            # condition is shown as "<" to match split_data.
            print(f"{' ' * depth} [Node] Feature: {node.feature_index} < {node.threshold}")
            self.print_tree(node.left, depth + 1)
            self.print_tree(node.right, depth + 1)


In [None]:
# Test: fit a depth-3 tree on a one-feature dataset and print its structure.

dataset = [
    [2.8, 'Yes'],
    [1.2, 'No'],
    [3.6, 'Yes'],
    [4.5, 'No'],
    [5.1, 'Yes'],
]

tree = DecisionTree(max_depth=3)
tree.fit(dataset)

print("Cây quyết định được xây dựng:")
tree.print_tree()


Cây quyết định được xây dựng:
 [Node] Feature: 0 <= 2.8
  [Leaf] Label: No
  [Node] Feature: 0 <= 4.5
   [Leaf] Label: Yes
   [Node] Feature: 0 <= 5.1
    [Leaf] Label: No
    [Leaf] Label: Yes


# **Tính chỉ số Gini cho một tập dữ liệu**

In [None]:
import numpy as np
def gini_index(groups, classes):
    """Compute the Gini index of a split, weighting each group by its size.

    Args:
        groups: Iterable of row collections; each row's last element is
            taken as its class label.
        classes: Class labels to account for.

    Returns:
        The weighted sum of per-group impurities ``1 - sum(p_c ** 2)``.
        Empty groups are skipped, so the result is ``0.0`` when all groups
        are empty.
    """

    def impurity(members, member_count):
        # One minus the sum of squared class proportions within ``members``.
        squared_sum = 0
        for c in classes:
            fraction = [entry[-1] for entry in members].count(c) / member_count
            squared_sum += fraction ** 2
        return 1.0 - squared_sum

    group_sizes = [len(members) for members in groups]
    grand_total = float(sum(group_sizes))

    result = 0.0
    for members, raw_size in zip(groups, group_sizes):
        member_count = float(raw_size)
        if member_count == 0:
            continue
        result += impurity(members, member_count) * (member_count / grand_total)
    return result


def split_data(dataset, feature_index, threshold):
    """Split rows into those below and those at or above a threshold.

    Args:
        dataset: Rows to split; each row is indexed with ``feature_index``.
        feature_index: Position of the feature to compare.
        threshold: Cut value.

    Returns:
        ``(left, right)``: rows with the feature ``< threshold`` and rows
        with the feature ``>= threshold``, each in input order.
    """
    below = []
    for record in dataset:
        if record[feature_index] < threshold:
            below.append(record)

    at_or_above = []
    for record in dataset:
        if record[feature_index] >= threshold:
            at_or_above.append(record)

    return below, at_or_above

In [None]:
dataset = [
    [50, 'Yes'],
    [20, 'No'],
    [30, 'No'],
    [70, 'Yes'],
    [40, 'No'],
    [60, 'Yes'],
]

classes = ['Yes', 'No']
# Gini of the whole dataset: with threshold 0 every row lands in the right group.
groups = split_data(dataset, feature_index=0, threshold=0)
gini = gini_index(groups, classes)
print(f"Gini Index: {gini:.4f}")

Gini Index: 0.5000


In [None]:
dataset = [
    [50, 'Yes'],
    [20, 'No'],
    [30, 'No'],
    [70, 'Yes'],
    [40, 'No'],
    [60, 'Yes'],
]

classes = ['Yes', 'No']
# Gini after splitting the rows into two groups at threshold 30.
groups = split_data(dataset, feature_index=0, threshold=30)
gini = gini_index(groups, classes)
print(f"Gini Index: {gini:.4f}")

Gini Index: 0.4000


# Mở rộng cây quyết định

In [None]:
import numpy as np

class TreeNode:
    """Node of a decision tree: either a split or a labelled leaf.

    The constructor stores its five arguments unchanged; all default to
    ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, label=None):
        # Which feature is compared, and against what value.
        self.feature_index = feature_index
        self.threshold = threshold
        # Child nodes of a split.
        self.left = left
        self.right = right
        # Class label; the tree printer treats a non-None label as a leaf.
        self.label = label

class DecisionTree:
    """Binary classification tree over multi-feature rows, split by Gini index.

    Every dataset row is a sequence whose last element is the class label.
    The preceding elements are feature values; a split sends rows whose
    feature is ``< threshold`` to the left child and rows ``>= threshold``
    to the right child.
    """

    def __init__(self, max_depth=3):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which ``build_tree`` stops splitting and
                emits a leaf instead.
        """
        self.max_depth = max_depth
        self.root = None  # Assigned by fit().

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the label seen first."""
        labels = list(labels)
        # dict.fromkeys keeps first-seen order and max() returns the first
        # maximal candidate, so the result does not depend on set ordering.
        return max(dict.fromkeys(labels), key=labels.count)

    def gini_index(self, groups, classes):
        """Return the size-weighted Gini impurity of ``groups``.

        Args:
            groups: Iterable of row collections; the last element of each row
                is read as its class label.
            classes: Class labels whose proportions are measured.

        Returns:
            Sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
            the group's share of all rows; ``0.0`` if every group is empty.
        """
        total_samples = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Extract the labels once per group instead of once per class.
            labels = [row[-1] for row in group]
            score = 0.0
            for class_val in classes:
                proportion = labels.count(class_val) / size
                score += proportion ** 2
            gini += (1.0 - score) * (size / total_samples)
        return gini

    def split_data(self, dataset, feature_index, threshold):
        """Partition ``dataset`` on one feature.

        Returns:
            ``(left, right)`` where ``left`` holds the rows with
            ``row[feature_index] < threshold`` and ``right`` the rows with
            ``row[feature_index] >= threshold``.
        """
        left = [row for row in dataset if row[feature_index] < threshold]
        right = [row for row in dataset if row[feature_index] >= threshold]
        return left, right

    def best_split(self, dataset):
        """Find the split with the lowest Gini index.

        Every value that occurs in every feature column is tried as a
        threshold. On equal Gini the earliest candidate wins (feature order,
        then row order).

        Args:
            dataset: Non-empty sequence of rows.

        Returns:
            ``(feature_index, threshold, (left, right))``; all three are
            ``None`` when the rows contain no feature columns.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if len(dataset) == 0:
            raise ValueError("dataset must contain at least one row")
        # Unique labels in first-seen order (deterministic, unlike a set).
        class_values = list(dict.fromkeys(row[-1] for row in dataset))
        best_index, best_threshold, best_score, best_groups = None, None, float('inf'), None

        for index in range(len(dataset[0]) - 1):
            for row in dataset:
                groups = self.split_data(dataset, index, row[index])
                gini = self.gini_index(groups, class_values)
                if gini < best_score:
                    best_index, best_threshold, best_score, best_groups = index, row[index], gini, groups
        return best_index, best_threshold, best_groups

    def build_tree(self, dataset, depth=0):
        """Recursively grow the subtree for ``dataset``.

        A leaf carrying the majority label is produced when all rows share
        one label, when ``depth`` has reached ``max_depth``, when no split
        candidate exists, or when the best split leaves one side empty.

        Args:
            dataset: Non-empty sequence of rows.
            depth: Depth of the node being built (0 for the root).

        Returns:
            The ``TreeNode`` at the root of the subtree.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if len(dataset) == 0:
            raise ValueError("dataset must contain at least one row")
        class_values = [row[-1] for row in dataset]
        if len(set(class_values)) == 1 or depth >= self.max_depth:
            return TreeNode(label=self._majority_label(class_values))
        feature_index, threshold, groups = self.best_split(dataset)
        # groups is None when the rows hold nothing but a label.
        if groups is None or not groups[0] or not groups[1]:
            return TreeNode(label=self._majority_label(class_values))
        left, right = groups
        left_node = self.build_tree(left, depth + 1)
        right_node = self.build_tree(right, depth + 1)
        return TreeNode(feature_index, threshold, left_node, right_node)

    def fit(self, dataset):
        """Build the tree from ``dataset`` and store it in ``self.root``.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        self.root = self.build_tree(dataset)

    def print_tree(self, node=None, depth=0, prefix=""):
        """Print the subtree under ``node`` using box-drawing connectors.

        Args:
            node: Node to start from; ``None`` means the fitted root.
            depth: Nesting level; passed down the recursion but not used
                for rendering (kept for interface compatibility).
            prefix: Text printed before the connector on each line.

        Raises:
            RuntimeError: If no node is given and the tree is not fitted.
        """
        if node is None:
            node = self.root
        if node is None:
            raise RuntimeError("print_tree() called before fit()")
        if node.label is not None:
            print(f"{prefix}└── [Leaf] Label: {node.label}")
        else:
            # The left child holds rows strictly below the threshold, so the
            # condition is shown as "<" to match split_data.
            print(f"{prefix}├── [Node] Feature {node.feature_index} < {node.threshold}")
            new_prefix = prefix + "│   "
            self.print_tree(node.left, depth + 1, new_prefix)
            self.print_tree(node.right, depth + 1, new_prefix)


In [None]:
# Demo: three numeric features per row, class label in the last column.
dataset2 = [
    [50, 30, 2.8, 'Yes'],
    [40, 70, 1.2, 'No'],
    [30, 50, 3.6, 'Yes'],
    [24, 10, 4.5, 'No'],
    [70, 25, 5.1, 'Yes'],
]

tree = DecisionTree()
tree.fit(dataset2)
tree.print_tree()

├── [Node] Feature 0 <= 50
│   ├── [Node] Feature 0 <= 40
│   │   ├── [Node] Feature 0 <= 30
│   │   │   └── [Leaf] Label: No
│   │   │   └── [Leaf] Label: Yes
│   │   └── [Leaf] Label: No
│   └── [Leaf] Label: Yes


In [None]:
# Demo: the one-feature dataset again, this time with the extended tree class.
dataset = [
    [2.8, 'Yes'],
    [1.2, 'No'],
    [3.6, 'Yes'],
    [4.5, 'No'],
    [5.1, 'Yes'],
]

tree = DecisionTree()
tree.fit(dataset)
tree.print_tree()

├── [Node] Feature 0 <= 2.8
│   └── [Leaf] Label: No
│   ├── [Node] Feature 0 <= 4.5
│   │   └── [Leaf] Label: Yes
│   │   ├── [Node] Feature 0 <= 5.1
│   │   │   └── [Leaf] Label: No
│   │   │   └── [Leaf] Label: Yes
