In [1]:
import pandas as pd
import math
from collections import Counter

In [9]:
def entropy(data):
    label_counts = Counter(data["class"])
    total_samples = len(data)
    return -sum((count/total_samples) * math.log2(count/total_samples) for count in label_counts.values())
def info_gain(data, feature):
    total_entropy = entropy(data)
    values = data[feature].unique()
    weighted_entropy = sum((len(subset)/len(data)) * entropy(subset) for value in values if (subset := data[data[feature] == value]).any().any())
    return total_entropy - weighted_entropy

In [15]:
data=pd.DataFrame(pd.read_csv('adult2.csv'))
columns=["Age", "Workclass", "Education", "Marital-Status", "Occupation", "Relationship", "Race", "Sex", "Class"]
print(data)


    age          workclass  fnlwgt   education  education-num  \
0    39          State-gov   77516   Bachelors             13   
1    50   Self-emp-not-inc   83311   Bachelors             13   
2    38            Private  215646     HS-grad              9   
3    53            Private  234721        11th              7   
4    28            Private  338409   Bachelors             13   
..  ...                ...     ...         ...            ...   
79   31          Local-gov  125927     7th-8th              4   
80   18            Private  446839     HS-grad              9   
81   52            Private  276515   Bachelors             13   
82   46            Private   51618     HS-grad              9   
83   59            Private  159937     HS-grad              9   

         marital-status          occupation    relationship    race      sex  \
0         Never-married        Adm-clerical   Not-in-family   White     Male   
1    Married-civ-spouse     Exec-managerial         Husband

In [16]:
def id3(data, features):
    if len(set(data["class"])) == 1:
        return data["class"].iloc[0]
    if not features:
        return Counter(data["class"]).most_common(1)[0][0]

    best_feature = max(features, key=lambda feature: info_gain(data, feature))
    tree = {best_feature: {}}
    features = [f for f in features if f != best_feature]  # Remove used feature

    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = id3(subset, features)  # Backtracking step

    return tree
# Train the ID3 decision tree
features = list(data.columns[:-1])  # All columns except "Class"
decision_tree = id3(data, features)

# Function to classify a new instance
def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree  # If it's a leaf node, return the class
    feature = next(iter(tree))  # Get the first feature in the tree
    value = sample.get(feature)
    return classify(tree[feature].get(value, "Unknown"), sample)  # Recursively classify

# Print the trained decision tree
print("Decision Tree:", decision_tree)


Decision Tree: {'fnlwgt': {77516: ' <=50K', 83311: ' <=50K', 215646: ' <=50K', 234721: ' <=50K', 338409: ' <=50K', 284582: ' <=50K', 160187: ' <=50K', 209642: ' >50K', 45781: ' >50K', 159449: ' >50K', 280464: ' >50K', 141297: ' >50K', 122272: ' <=50K', 205019: ' <=50K', 121772: ' >50K', 245487: ' <=50K', 176756: ' <=50K', 186824: ' <=50K', 28887: ' <=50K', 292175: ' >50K', 193524: ' >50K', 302146: ' <=50K', 76845: ' <=50K', 117037: ' <=50K', 109015: ' <=50K', 216851: ' >50K', 168294: ' <=50K', 180211: ' >50K', 367260: ' <=50K', 193366: ' <=50K', 190709: ' <=50K', 266015: ' <=50K', 386940: ' <=50K', 59951: ' <=50K', 311512: ' <=50K', 242406: ' <=50K', 197200: ' <=50K', 544091: ' <=50K', 84154: ' >50K', 265477: ' <=50K', 507875: ' <=50K', 88506: ' <=50K', 172987: ' <=50K', 94638: ' <=50K', 289980: ' <=50K', 337895: ' >50K', 144361: ' <=50K', 128354: ' <=50K', 101603: ' <=50K', 271466: ' <=50K', 32275: ' <=50K', 226956: ' <=50K', 51835: ' >50K', 251585: ' >50K', 109832: ' <=50K', 237993: 

In [23]:
def entropy(data):
    label_counts = Counter(data["class"])
    total_samples = len(data)
    return -sum((count / total_samples) * math.log2(count / total_samples) for count in label_counts.values())

# Function to calculate information gain
def info_gain(data, feature):
    total_entropy = entropy(data)
    values = data[feature].unique()
    weighted_entropy = 0

    for value in values:
        subset = data[data[feature] == value]  # Now defined correctly
        if len(subset) > 0:
            weighted_entropy += (len(subset) / len(data)) * entropy(subset)
    
    return total_entropy - weighted_entropy

# Function to build decision tree using ID3
def id3(data, features):
    # Base case: if all samples belong to one class, return that class
    if len(set(data["class"])) == 1:
        return data["class"].iloc[0]
    # Base case: if no features left, return the most common class
    if not features:
        return Counter(data["class"]).most_common(1)[0][0]

    # Find the best feature to split on
    best_feature = max(features, key=lambda feature: info_gain(data, feature))
    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]  # Remove used feature

    # Recursively build the tree
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]  # Define subset outside
        tree[best_feature][value] = id3(subset, remaining_features)  # Backtracking step

    return tree

# Train the ID3 decision tree (keeping "fnlwgt" in DataFrame but excluding it from features)
features = [col for col in data.columns if col not in ["Class", "fnlwgt"]]  # Exclude "Class" and "fnlwgt"
decision_tree = id3(data, features)

# Function to print the tree in structured format
def print_tree(tree, depth=0):
    if not isinstance(tree, dict):  # If it's a leaf node, print the decision
        print("  " * depth + f"--> {tree}")
        return
    feature = next(iter(tree))  # Get the first (root) feature
    print("  " * depth + f"[{feature}]")  # Print feature name
    for value, subtree in tree[feature].items():
        print("  " * (depth + 1) + f"({value})")
        print_tree(subtree, depth + 2)  # Recursively print sub-nodes

# Print the structured decision tree
print_tree(decision_tree)


[class]
  ( <=50K)
    -->  <=50K
  ( >50K)
    -->  >50K


In [24]:
test_sample = {"Age": 37, "Workclass": "Private", "Education": "Masters", "Marital-Status": "Married-civ-spouse", "Occupation": "Exec-managerial", "Relationship": "Wife", "Race": "White", "Sex": "Female"}
print("Predicted Class:", classify(decision_tree, test_sample))

Predicted Class: Unknown


In [4]:
import pandas as pd
from collections import Counter
import math

class ID3TreeClassifier:

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def _entropy(self, y):
        counts = Counter(y)
        total = len(y)
        entropy = 0
        for count in counts.values():
            if total > 0:
                probability = count / total
                if probability > 0 and probability < 1: # Check for probability to avoid math domain error
                    entropy -= probability * math.log(probability, 2)
        return entropy

    def _information_gain(self, X, y, attribute):
        parent_entropy = self._entropy(y)
        values = X[attribute].unique()
        weighted_child_entropy = 0
        for value in values:
            subset_y = y[X[attribute] == value]
            subset_size = len(subset_y)
            if subset_size > 0:  # Check to avoid division by zero
                probability = subset_size / len(y)
                weighted_child_entropy += probability * self._entropy(subset_y)
        information_gain = parent_entropy - weighted_child_entropy
        return information_gain

    def _best_attribute(self, X, y):
        best_gain = -1
        best_attribute = None
        for attribute in X.columns:
            gain = self._information_gain(X, y, attribute)
            if gain > best_gain:
                best_gain = gain
                best_attribute = attribute
        return best_attribute

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(y.unique()) == 1 or len(X) == 0:  #added a condition to stop if all examples have same class or X is empty
            return Counter(y).most_common(1)  # Return majority class

        best_attribute = self._best_attribute(X, y)

        if best_attribute is None: # Handle cases where information gain is zero for all attributes
            return Counter(y).most_common(1)

        tree = {best_attribute: {}}

        for value in X[best_attribute].unique():
            subset_X = X[X[best_attribute] == value]
            subset_y = y[X[best_attribute] == value]

            # Drop the current attribute to avoid infinite loops
            subset_X = subset_X.drop(columns=[best_attribute])

            tree[best_attribute][value] = self._build_tree(subset_X, subset_y, depth + 1)
        return tree

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, 0)

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict): # if leaf node
            return tree
        attribute = list(tree.keys())
        value = x[attribute]
        if value in tree[attribute]: # Handling unseen values
            subtree = tree[attribute][value]
            return self._predict_one(x, subtree)
        else:
            # Handle unseen values by returning the majority class of the parent node
            parent_counts = Counter()
            for val in tree[attribute].keys():
                parent_counts.update({tree[attribute][val]: sum(1 for v in X[attribute] if v == val) })
            return parent_counts.most_common(1) if parent_counts else Counter(y).most_common(1) # Return majority of parent or entire dataset if parent is empty


    def predict(self, X):
        predictions =""
        for _, x in X.iterrows():
            predictions.append(self._predict_one(x, self.tree))
        return predictions


# Load your data (replace 'your_data.csv' with the actual file name)
data = """age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners,28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K
42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K
37, Private, 280464, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K"""
data = pd.read_csv('adult2.csv')


X = data.drop(columns=['class','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']) # Exclude int columns and target
y = data['class']

classifier = ID3TreeClassifier(max_depth=5)  # You can adjust max_depth
classifier.fit(X, y)

X_test = X.iloc[[0,1,5]] # Example test data
predictions = classifier.predict(X_test)
print(predictions)

AttributeError: 'str' object has no attribute 'append'

In [1]:
import csv
import math

# Function to load CSV data
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Read the header
        for row in reader:
            dataset.append(row)
    return headers, dataset

# Function to calculate entropy
def calculate_entropy(data, target_index):
    target_values = [row[target_index] for row in data]
    unique_classes = set(target_values)
    entropy = 0
    for cls in unique_classes:
        prob = target_values.count(cls) / len(target_values)
        entropy -= prob * math.log2(prob)
    return entropy

# Function to calculate information gain
def information_gain(data, attr_index, target_index):
    total_entropy = calculate_entropy(data, target_index)
    values = set(row[attr_index] for row in data)
    weighted_entropy = 0

    for value in values:
        subset = [row for row in data if row[attr_index] == value]
        prob = len(subset) / len(data)
        weighted_entropy += prob * calculate_entropy(subset, target_index)

    return total_entropy - weighted_entropy

# Function to find the best attribute for splitting
def best_attribute(data, attributes, target_index):
    best_gain = -1
    best_attr = None

    for attr in attributes:
        gain = information_gain(data, attr, target_index)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr

    return best_attr

# Function to build the decision tree
def build_tree(data, attributes, target_index):
    target_values = [row[target_index] for row in data]

    # If all examples have the same class, return that class
    if len(set(target_values)) == 1:
        return target_values[0]

    # If no attributes left, return the most common class
    if not attributes:
        return max(set(target_values), key=target_values.count)

    # Find the best attribute to split
    best_attr = best_attribute(data, attributes, target_index)
    tree = {best_attr: {}}

    # Get unique values of the best attribute
    attr_values = set(row[best_attr] for row in data)

    # Split dataset and recursively build subtrees
    for value in attr_values:
        subset = [row for row in data if row[best_attr] == value]
        new_attributes = attributes[:]
        new_attributes.remove(best_attr)
        tree[best_attr][value] = build_tree(subset, new_attributes, target_index)

    return tree

# Function to print the tree structure
def print_tree(tree, indent=""):
    if isinstance(tree, dict):
        for key, value in tree.items():
            print(f"{indent}{key}:")
            print_tree(value, indent + "  ")
    else:
        print(f"{indent}-> {tree}")

# Load dataset
filename = "modified_dataset2.csv"
headers, dataset = load_csv(filename)

# Convert categorical values into numbers
for col in range(len(headers)):
    unique_values = list(set(row[col] for row in dataset))
    for row in dataset:
        row[col] = unique_values.index(row[col])  # Encode category as index

# Define attributes (excluding the target column)
target_index = len(headers) - 1
attributes = list(range(len(headers) - 1))

# Build the decision tree
decision_tree = build_tree(dataset, attributes, target_index)

# Print the decision tree
print("Decision Tree:")
print_tree(decision_tree)


Decision Tree:
6:
  0:
    5:
      0:
        -> 1
      1:
        7:
          0:
            -> 1
          1:
            4:
              1:
                -> 1
              3:
                2:
                  0:
                    -> 1
                  1:
                    -> 1
                  5:
                    0:
                      0:
                        -> 1
                      1:
                        -> 0
                  6:
                    -> 1
                  7:
                    -> 1
                  11:
                    -> 1
              4:
                -> 1
              5:
                2:
                  0:
                    -> 1
                  1:
                    -> 1
                  2:
                    -> 1
                  5:
                    -> 1
                  7:
                    1:
                      1:
                        0:
                          0:
                            3:

In [2]:
import csv
import math

# Function to load CSV data
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Read the header
        for row in reader:
            dataset.append(row)
    return headers, dataset

# Function to calculate entropy
def calculate_entropy(data, target_index):
    target_values = [row[target_index] for row in data]
    unique_classes = set(target_values)
    entropy = 0
    for cls in unique_classes:
        prob = target_values.count(cls) / len(target_values)
        entropy -= prob * math.log2(prob)
    return entropy

# Function to calculate information gain
def information_gain(data, attr_index, target_index):
    total_entropy = calculate_entropy(data, target_index)
    values = set(row[attr_index] for row in data)
    weighted_entropy = 0

    for value in values:
        subset = [row for row in data if row[attr_index] == value]
        prob = len(subset) / len(data)
        weighted_entropy += prob * calculate_entropy(subset, target_index)

    return total_entropy - weighted_entropy

# Function to find the best attribute for splitting
def best_attribute(data, attributes, target_index):
    best_gain = -1
    best_attr = None

    for attr in attributes:
        gain = information_gain(data, attr, target_index)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr

    return best_attr

# Function to build the decision tree
def build_tree(data, attributes, target_index):
    target_values = [row[target_index] for row in data]

    # If all examples have the same class, return that class
    if len(set(target_values)) == 1:
        return target_values[0]

    # If no attributes left, return the most common class
    if not attributes:
        return max(set(target_values), key=target_values.count)

    # Find the best attribute to split
    best_attr = best_attribute(data, attributes, target_index)
    tree = {best_attr: {}}

    # Get unique values of the best attribute
    attr_values = set(row[best_attr] for row in data)

    # Split dataset and recursively build subtrees
    for value in attr_values:
        subset = [row for row in data if row[best_attr] == value]
        new_attributes = attributes[:]
        new_attributes.remove(best_attr)
        tree[best_attr][value] = build_tree(subset, new_attributes, target_index)

    return tree

# Function to print the tree structure with readable labels
def print_tree(tree, headers, indent=""):
    if isinstance(tree, dict):
        for key, value in tree.items():
            print(f"{indent}{headers[key]}:")
            print_tree(value, headers, indent + "  ")
    else:
        print(f"{indent}-> {tree}")

# Load dataset
filename = "modified_dataset2.csv"
headers, dataset = load_csv(filename)

# Convert categorical values into numerical labels and store mapping
attribute_mappings = {}
for col in range(len(headers)):
    unique_values = list(set(row[col] for row in dataset))
    attribute_mappings[col] = {v: i for i, v in enumerate(unique_values)}
    
    # Replace categories with numerical labels
    for row in dataset:
        row[col] = attribute_mappings[col][row[col]]

# Define attributes (excluding the target column)
target_index = len(headers) - 1
attributes = list(range(len(headers) - 1))

# Build the decision tree
decision_tree = build_tree(dataset, attributes, target_index)

# Print the decision tree with original category labels
print("Decision Tree:")
print_tree(decision_tree, headers)


Decision Tree:
relationship:
  age:
    occupation:
      age:
        -> 1
      workclass:
        race:
          age:
            -> 1
          workclass:
            marital-status:
              workclass:
                -> 1
              education-num:
                education:
                  age:
                    -> 1
                  workclass:
                    -> 1
                  occupation:
                    age:
                      age:
                        -> 1
                      workclass:
                        -> 0
                  relationship:
                    -> 1
                  race:
                    -> 1
                  hours-per-week:
                    -> 1
              marital-status:
                -> 1
              occupation:
                education:
                  age:
                    -> 1
                  workclass:
                    -> 1
                  education:
                    -> 1
          

IndexError: list index out of range

In [4]:
import csv
import math
from collections import defaultdict

# Function to load CSV data safely
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Read the header
        for row in reader:
            if len(row) == len(headers):  # Ensure correct row length
                dataset.append(row)
    return headers, dataset

# Function to calculate entropy
def calculate_entropy(data, target_index):
    target_values = [row[target_index] for row in data]
    unique_classes = set(target_values)
    entropy = 0
    for cls in unique_classes:
        prob = target_values.count(cls) / len(target_values)
        entropy -= prob * math.log2(prob)
    return entropy

# Function to calculate information gain
def information_gain(data, attr_index, target_index):
    total_entropy = calculate_entropy(data, target_index)
    values = set(row[attr_index] for row in data)
    weighted_entropy = 0

    for value in values:
        subset = [row for row in data if row[attr_index] == value]
        prob = len(subset) / len(data)
        weighted_entropy += prob * calculate_entropy(subset, target_index)

    return total_entropy - weighted_entropy

# Function to find the best attribute for splitting
def best_attribute(data, attributes, target_index):
    best_gain = -1
    best_attr = None

    for attr in attributes:
        gain = information_gain(data, attr, target_index)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr

    return best_attr

# Function to build the decision tree
def build_tree(data, attributes, target_index):
    target_values = [row[target_index] for row in data]

    # If all examples have the same class, return that class
    if len(set(target_values)) == 1:
        return target_values[0]

    # If no attributes left, return the most common class
    if not attributes:
        return max(set(target_values), key=target_values.count)

    # Find the best attribute to split
    best_attr = best_attribute(data, attributes, target_index)
    tree = {best_attr: {}}

    # Get unique values of the best attribute
    attr_values = set(row[best_attr] for row in data)

    # Split dataset and recursively build subtrees
    for value in attr_values:
        subset = [row for row in data if row[best_attr] == value]
        new_attributes = attributes[:]
        new_attributes.remove(best_attr)
        tree[best_attr][value] = build_tree(subset, new_attributes, target_index)

    return tree

# Function to print the tree structure with readable labels
def print_tree(tree, headers, mappings, indent=""):
    if isinstance(tree, dict):
        for key, value in tree.items():
            print(f"{indent}{headers[key]}:")
            print_tree(value, headers, mappings, indent + "  ")
    else:
        print(f"{indent}-> {tree}")

# Load dataset
filename = "modified_dataset.csv"
headers, dataset = load_csv(filename)

# Convert categorical values into numerical labels and store mapping
attribute_mappings = defaultdict(dict)
for col in range(len(headers)):
    unique_values = list(set(row[col] for row in dataset))
    attribute_mappings[col] = {v: i for i, v in enumerate(unique_values)}

    # Replace categories with numerical labels
    for row in dataset:
        row[col] = attribute_mappings[col][row[col]]

# Define attributes (excluding the target column)
target_index = len(headers) - 1
attributes = list(range(len(headers) - 1))

# Build the decision tree
decision_tree = build_tree(dataset, attributes, target_index)

# Print the decision tree with original category labels
print("Decision Tree:")
print_tree(decision_tree, headers, attribute_mappings)


Decision Tree:
relationship:
  age:
    hours-per-week:
      age:
        marital-status:
          age:
            -> 1
          workclass:
            workclass:
              age:
                -> 1
              workclass:
                -> 1
              education:
                education:
                  age:
                    -> 0
                  workclass:
                    -> 1
                  education-num:
                    -> 1
              marital-status:
                -> 1
              occupation:
                -> 1
              relationship:
                -> 1
              sex:
                education:
                  workclass:
                    -> 1
                  occupation:
                    -> 1
                  relationship:
                    -> 1
                  race:
                    -> 1
                  capital-loss:
                    -> 1
                  hours-per-week:
                    -> 1
           

IndexError: list index out of range

In [1]:
import pandas as pd
import numpy as np
import math
import csv
import random

# Load the dataset
def load_csv(filename):
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
    headers = data[0]  # Extract headers
    return pd.DataFrame(data[1:], columns=headers)

df = load_csv("modified_dataset2.csv")

# Label Encoding (manual conversion of categorical attributes to integers)
def label_encode(df):
    encoders = {}
    for column in df.columns:
        unique_values = list(set(df[column]))
        mapping = {val: idx for idx, val in enumerate(unique_values)}
        df[column] = df[column].map(mapping)
        encoders[column] = mapping
    return df, encoders

df, label_encoders = label_encode(df)

# Split dataset into training and testing sets
def train_test_split_manual(df, test_size=0.2, random_seed=42):
    np.random.seed(random_seed)
    shuffled_indices = np.random.permutation(len(df))
    test_set_size = int(len(df) * test_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]

X = df.drop(columns=["class"])
y = df["class"]
df_train, df_test = train_test_split_manual(df)
X_train, y_train = df_train.drop(columns=["class"]), df_train["class"]
X_test, y_test = df_test.drop(columns=["class"]), df_test["class"]

# Compute entropy
def entropy(y):
    values, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

# Find the best split
def best_split(X, y):
    best_feature = None
    best_gain = -1
    base_entropy = entropy(y)
    
    for feature in X.columns:
        values = X[feature].unique()
        split_entropy = 0
        
        for value in values:
            subset_y = y[X[feature] == value]
            split_entropy += (len(subset_y) / len(y)) * entropy(subset_y)
        
        info_gain = base_entropy - split_entropy
        if info_gain > best_gain:
            best_gain = info_gain
            best_feature = feature
    
    return best_feature

# Build Decision Tree using ID3 algorithm
def build_tree(X, y):
    if len(np.unique(y)) == 1:
        return y.iloc[0]  # Pure node, return class label
    
    if X.empty:
        return y.mode()[0]  # Return most common label
    
    best_feature = best_split(X, y)
    if best_feature is None:
        return y.mode()[0]
    
    tree = {best_feature: {}}
    
    for value in X[best_feature].unique():
        subset_X = X[X[best_feature] == value].drop(columns=[best_feature])
        subset_y = y[X[best_feature] == value]
        tree[best_feature][value] = build_tree(subset_X, subset_y)
    
    return tree

decision_tree = build_tree(X_train, y_train)
print("Decision Tree:", decision_tree)

# Predict function
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree  # Leaf node
    
    feature = next(iter(tree))
    value = sample[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], sample)
    else:
        return None  # Default case if value not found

# Evaluate accuracy
correct = 0
for i in range(len(X_test)):
    prediction = predict(decision_tree, X_test.iloc[i])
    if prediction == y_test.iloc[i]:
        correct += 1

accuracy = correct / len(y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Decision Tree: {'relationship': {3: {'education': {11: {'occupation': {1: {'workclass': {6: {'hours-per-week': {2: {'marital-status': {2: 1, 0: 1, 1: {'sex': {1: 1, 0: {'age': {0: {'race': {4: {'education-num': {1: {'capital-gain': {0: {'capital-loss': {0: {'native-country': {40: 1}}}}}}}}, 1: 1}}, 2: 1}}}}, 4: 1, 5: 1}}, 1: 1, 0: {'sex': {1: {'marital-status': {2: {'age': {1: 1, 0: {'race': {4: {'native-country': {2: 1, 40: {'education-num': {1: {'capital-gain': {0: {'capital-loss': {0: 1}}}}}}}}, 1: 1}}}}, 4: 1, 1: 1, 5: 1, 0: 1}}, 0: 1}}}}, 4: 1, 1: 1, 8: 1, 0: {'race': {4: 1, 1: 0}}, 7: 1}}, 2: {'workclass': {6: 1, 4: {'age': {2: 1, 0: 0}}, 8: 1}}, 5: {'age': {0: 1, 1: 1, 2: {'marital-status': {1: {'race': {4: {'sex': {1: {'native-country': {40: {'workclass': {6: {'education-num': {1: {'capital-gain': {0: {'capital-loss': {0: {'hours-per-week': {2: 1}}}}}}}}}}, 10: 1}}, 0: 1}}, 1: 0}}, 4: 1, 2: 1, 5: 1, 0: 1}}}}, 10: {'workclass': {6: {'age': {0: {'hours-per-week': {2: 1, 0: {'mari

In [2]:
import pandas as pd
import numpy as np
import math
import csv
import random

# Load the dataset
def load_csv(filename):
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
    headers = data[0]  # Extract headers
    return pd.DataFrame(data[1:], columns=headers)

df = load_csv("modified_dataset2.csv")

# Label Encoding (manual conversion of categorical attributes to integers)
def label_encode(df):
    encoders = {}
    for column in df.columns:
        unique_values = list(set(df[column]))
        mapping = {val: idx for idx, val in enumerate(unique_values)}
        df[column] = df[column].map(mapping)
        encoders[column] = mapping
    return df, encoders

df, label_encoders = label_encode(df)

# Split dataset into training and testing sets
def train_test_split_manual(df, test_size=0.2, random_seed=42):
    np.random.seed(random_seed)
    shuffled_indices = np.random.permutation(len(df))
    test_set_size = int(len(df) * test_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]

X = df.drop(columns=["class"])
y = df["class"]
df_train, df_test = train_test_split_manual(df)
X_train, y_train = df_train.drop(columns=["class"]), df_train["class"]
X_test, y_test = df_test.drop(columns=["class"]), df_test["class"]

# Compute entropy
def entropy(y):
    values, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

# Find the best split
def best_split(X, y):
    best_feature = None
    best_gain = -1
    base_entropy = entropy(y)
    
    for feature in X.columns:
        values = X[feature].unique()
        split_entropy = 0
        
        for value in values:
            subset_y = y[X[feature] == value]
            split_entropy += (len(subset_y) / len(y)) * entropy(subset_y)
        
        info_gain = base_entropy - split_entropy
        if info_gain > best_gain:
            best_gain = info_gain
            best_feature = feature
    
    return best_feature

# Build Decision Tree using ID3 algorithm
def build_tree(X, y):
    if len(np.unique(y)) == 1:
        return y.iloc[0]  # Pure node, return class label
    
    if X.empty:
        return y.mode()[0]  # Return most common label
    
    best_feature = best_split(X, y)
    if best_feature is None:
        return y.mode()[0]
    
    tree = {best_feature: {}}
    
    for value in X[best_feature].unique():
        subset_X = X[X[best_feature] == value].drop(columns=[best_feature])
        subset_y = y[X[best_feature] == value]
        tree[best_feature][value] = build_tree(subset_X, subset_y)
    
    return tree

def print_tree(tree, indent=""):
    if not isinstance(tree, dict):
        print(indent + "-> " + str(tree))
        return
    
    for feature, branches in tree.items():
        print(indent + str(feature))
        for value, subtree in branches.items():
            print(indent + f"  [{value}]")
            print_tree(subtree, indent + "    ")

decision_tree = build_tree(X_train, y_train)
print("Decision Tree:")
print_tree(decision_tree)

# Predict function
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree  # Leaf node
    
    feature = next(iter(tree))
    value = sample[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], sample)
    else:
        return None  # Default case if value not found

# Evaluate accuracy
correct = 0
for i in range(len(X_test)):
    prediction = predict(decision_tree, X_test.iloc[i])
    if prediction == y_test.iloc[i]:
        correct += 1

accuracy = correct / len(y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Decision Tree:
relationship
  [3]
    education
      [11]
        occupation
          [1]
            workclass
              [6]
                hours-per-week
                  [2]
                    marital-status
                      [2]
                        -> 1
                      [0]
                        -> 1
                      [1]
                        sex
                          [1]
                            -> 1
                          [0]
                            age
                              [0]
                                race
                                  [4]
                                    education-num
                                      [1]
                                        capital-gain
                                          [0]
                                            capital-loss
                                              [0]
                                                native-country
                     