In [1]:
import pandas as pd

# Define age categories
def categorize_age(age):
    """Map a numeric age onto one of four coarse life-stage labels.

    Upper bounds are inclusive: ``<= 12`` is "Child", ``<= 19`` is "Teen",
    ``<= 50`` is "Middle-aged". Any value that satisfies none of those
    comparisons is labelled "Old-aged".
    """
    # Listed in ascending order so the first bound that holds decides the label.
    upper_bounds = ((12, "Child"), (19, "Teen"), (50, "Middle-aged"))
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return "Old-aged"

# Convert numeric education levels into categorical descriptions
def categorize_education(num):
    """Collapse a numeric education level into "Higher", "Medium" or "Lower".

    Lower bounds are inclusive: ``>= 13`` is "Higher", ``>= 9`` is "Medium".
    Any value that satisfies neither comparison is labelled "Lower".
    """
    # Listed in descending order so the highest satisfied bound decides the label.
    lower_bounds = ((13, "Higher"), (9, "Medium"))
    for floor, label in lower_bounds:
        if num >= floor:
            return label
    return "Lower"

# Convert capital gain/loss into categories
def categorize_capital(value, threshold=250000):
    """Label a capital gain/loss amount as "High" or "Low".

    Args:
        value: Numeric amount to classify.
        threshold: Amounts strictly greater than this are "High".
            Defaults to 250000, the cut-off this notebook has always used.

    Returns:
        "High" if ``value > threshold``, otherwise "Low".

    The fallback label is "Low" rather than the literal string "None":
    recent pandas versions treat "None" as a missing-value marker when a CSV
    is read back with default settings, and "Low" matches the label used by
    the later redefinition of this function in the notebook.
    """
    # NOTE(review): if adult.csv is the UCI Adult extract, capital-gain appears
    # to top out well below 250000, so the default would never yield "High" --
    # confirm the intended cut-off against the data.
    return "High" if value > threshold else "Low"

# Load the raw data
df = pd.read_csv("adult.csv")

# Replace each numeric column with its bucket label, one column at a time
for column, bucket in (
    ("age", categorize_age),
    ("education-num", categorize_education),
    ("capital-gain", categorize_capital),
    ("capital-loss", categorize_capital),
):
    df[column] = df[column].apply(bucket)

# Relabel the target values as No / Yes
df["class"] = df["class"].replace({"<=50K": "No", ">50K": "Yes"})

# Remove the fnlwgt column, then write the result without the index
df.drop(columns=["fnlwgt"], inplace=True)
df.to_csv("modified_dataset.csv", index=False)

print("Dataset modified and saved as modified_dataset.csv")


Dataset modified and saved as modified_dataset.csv


In [2]:
import pandas as pd

# Define age categories
def categorize_age(age):
    """Return the life-stage label for a numeric age.

    "Child" for ``age <= 12``, "Teen" for ``age <= 19``, "Middle-aged" for
    ``age <= 50``; "Old-aged" when none of those comparisons holds.
    """
    bands = ((12, "Child"), (19, "Teen"), (50, "Middle-aged"))
    # First band whose inclusive ceiling covers the age; otherwise the fallback.
    return next((name for ceiling, name in bands if age <= ceiling), "Old-aged")

# Convert numeric education levels into categorical descriptions
def categorize_education(num):
    """Return the education tier for a numeric education level.

    "Higher" for ``num >= 13``, "Medium" for ``num >= 9``; "Lower" when
    neither comparison holds.
    """
    tiers = ((13, "Higher"), (9, "Medium"))
    # First tier whose inclusive floor is met; otherwise the fallback.
    return next((name for floor, name in tiers if num >= floor), "Lower")

# Convert capital gain/loss into categories
def categorize_capital(value, threshold=250000):
    """Label a capital gain/loss amount as "High" or "Low".

    Args:
        value: Numeric amount to classify.
        threshold: Amounts strictly greater than this are "High".
            Defaults to 250000, the cut-off this notebook has always used,
            so existing ``.apply(categorize_capital)`` calls are unaffected.

    Returns:
        "High" if ``value > threshold``, otherwise "Low".
    """
    # NOTE(review): if adult.csv is the UCI Adult extract, capital-gain appears
    # to top out well below 250000, so the default would never yield "High" --
    # confirm the intended cut-off against the data.
    return "High" if value > threshold else "Low"

# Categorize hours per week
def categorize_hours(hours):
    """Classify weekly working hours.

    "Part-time" for ``hours <= 20``, "Full-time" for ``hours <= 40``, and
    "Overtime" when neither comparison holds.
    """
    return (
        "Part-time" if hours <= 20
        else "Full-time" if hours <= 40
        else "Overtime"
    )

# Read the CSV file
df = pd.read_csv("adult.csv")

# Apply transformations: each numeric column is replaced by its bucket label
df["age"] = df["age"].apply(categorize_age)
df["education-num"] = df["education-num"].apply(categorize_education)
df["capital-gain"] = df["capital-gain"].apply(categorize_capital)
df["capital-loss"] = df["capital-loss"].apply(categorize_capital)
df["hours-per-week"] = df["hours-per-week"].apply(categorize_hours)
df["class"] = df["class"].replace({"<=50K": "No", ">50K": "Yes"})

# Drop the fnlwgt column
df = df.drop(columns=["fnlwgt"])

# Save the modified dataset. The path is held in one variable so the
# confirmation message below always names the file that was really written
# (it previously reported "modified_dataset.csv" while writing
# "modified_dataset2.csv").
output_path = "modified_dataset2.csv"
df.to_csv(output_path, index=False)

print(f"Dataset modified and saved as {output_path}")


Dataset modified and saved as modified_dataset.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder

# Load the bucketed dataset
df = pd.read_csv("modified_dataset2.csv")

# Integer-encode every column (features and target alike), keeping one fitted
# encoder per column so codes can be mapped back to labels if needed
label_encoders = {name: LabelEncoder() for name in df.columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Target is the "class" column; every other column is a feature
y = df["class"]
X = df.drop(columns=["class"])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a tree that chooses splits by entropy. Note that the printed rules are
# binary threshold tests on the integer codes (e.g. "relationship <= 0.50"),
# not the one-branch-per-category splits of a textbook ID3 tree.
clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
clf.fit(X_train, y_train)

# Render the fitted tree as indented text rules
tree_rules = export_text(clf, feature_names=list(X.columns))
print("Decision Tree:\n", tree_rules)

# Accuracy on the held-out rows
accuracy = clf.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Decision Tree:
 |--- relationship <= 0.50
|   |--- education-num <= 0.50
|   |   |--- hours-per-week <= 1.50
|   |   |   |--- education <= 9.50
|   |   |   |   |--- hours-per-week <= 0.50
|   |   |   |   |   |--- occupation <= 3.50
|   |   |   |   |   |   |--- native-country <= 1.00
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- native-country >  1.00
|   |   |   |   |   |   |   |--- native-country <= 6.00
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- native-country >  6.00
|   |   |   |   |   |   |   |   |--- occupation <= 2.00
|   |   |   |   |   |   |   |   |   |--- native-country <= 28.00
|   |   |   |   |   |   |   |   |   |   |--- workclass <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |   |--- workclass >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- native-country >  28.00
|   |   |   |   |   |   |   |   |   

In [5]:
import csv
import math

# Function to load CSV data
def load_csv(filename):
    """Read a CSV file into a header row plus a list of data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(headers, dataset)`` tuple. ``headers`` is the first row of the
        file as a list of strings; ``dataset`` is a list holding every
        remaining row, each also a list of strings.

    Raises:
        StopIteration: If the file contains no rows at all (propagated from
            reading the header row).
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, "\r\n" inside a quoted field is
    # silently rewritten to "\n" before the parser sees it.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader)  # First row is the header
        dataset = list(reader)  # Everything after it is data
    return headers, dataset

# Function to calculate entropy
def calculate_entropy(data, target_index):
    """Return the base-2 entropy of the values found at ``target_index``.

    Args:
        data: Sequence of rows; each row is indexed with ``target_index``.
        target_index: Position of the class value within each row.

    Returns:
        The sum over distinct class values of ``-p * log2(p)``, where ``p`` is
        that value's share of the rows. Returns ``0`` when ``data`` is empty.
    """
    labels = [record[target_index] for record in data]
    total = len(labels)

    # Tally every label in a single pass instead of re-scanning the list once
    # per distinct label.
    tally = {}
    for label in labels:
        tally[label] = tally.get(label, 0) + 1

    result = 0
    # Iterate the distinct labels via set(labels) so the terms are accumulated
    # in the same order as before (floating-point sums are order-sensitive).
    for label in set(labels):
        share = tally[label] / total
        result -= share * math.log2(share)
    return result

# Function to calculate information gain
def information_gain(data, attr_index, target_index):
    """Return the entropy reduction obtained by splitting on one attribute.

    Args:
        data: Sequence of rows.
        attr_index: Position of the attribute to split on within each row.
        target_index: Position of the class value within each row.

    Returns:
        Entropy of the whole of ``data`` minus the size-weighted sum of the
        entropies of the subsets that share each distinct attribute value.
    """
    size = len(data)
    remainder = 0
    for observed in set(record[attr_index] for record in data):
        # Rows that take this particular value of the attribute.
        branch = [record for record in data if record[attr_index] == observed]
        remainder += (len(branch) / size) * calculate_entropy(branch, target_index)
    return calculate_entropy(data, target_index) - remainder

# Function to find the best attribute for splitting
def best_attribute(data, attributes, target_index):
    """Pick the attribute index with the largest information gain.

    Args:
        data: Sequence of rows.
        attributes: Candidate attribute indices, examined in the given order.
        target_index: Position of the class value within each row.

    Returns:
        The first candidate whose gain is strictly greater than every gain
        seen before it, starting from a floor of -1; ``None`` when
        ``attributes`` is empty.
    """
    winner, top_score = None, -1
    for candidate in attributes:
        score = information_gain(data, candidate, target_index)
        # Strict comparison: on a tie the earlier candidate is kept.
        if score > top_score:
            winner, top_score = candidate, score
    return winner

# Function to build the decision tree
def build_tree(data, attributes, target_index):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        data: Sequence of rows.
        attributes: List of attribute indices still available for splitting.
            It is copied, never modified.
        target_index: Position of the class value within each row.

    Returns:
        A class value when every row shares it, or when no attributes remain
        (then the most frequent class value). Otherwise a dict of the form
        ``{attr_index: {attr_value: subtree, ...}}``.
    """
    labels = [record[target_index] for record in data]
    distinct_labels = set(labels)

    # Pure node: every row has the same class.
    if len(distinct_labels) == 1:
        return labels[0]

    # Nothing left to split on: fall back to the majority class.
    if not attributes:
        return max(distinct_labels, key=labels.count)

    split_on = best_attribute(data, attributes, target_index)

    # Every branch receives the same reduced candidate list, so build it once.
    remaining = attributes[:]
    remaining.remove(split_on)

    # One branch per value of the chosen attribute that occurs in the data.
    return {
        split_on: {
            observed: build_tree(
                [record for record in data if record[split_on] == observed],
                remaining,
                target_index,
            )
            for observed in set(record[split_on] for record in data)
        }
    }

# Function to print the tree structure
def print_tree(tree, indent=""):
    """Print a nested-dict tree, one key or leaf per line.

    Args:
        tree: Either a dict (whose keys are printed followed by a colon, with
            their values printed recursively two spaces deeper) or any other
            value, which is printed as a ``->`` leaf.
        indent: Prefix written before every line at the current depth.
    """
    # Leaf: anything that is not a dict.
    if not isinstance(tree, dict):
        print(f"{indent}-> {tree}")
        return

    deeper = indent + "  "
    for key, subtree in tree.items():
        print(f"{indent}{key}:")
        print_tree(subtree, deeper)

# Load dataset
filename = "modified_dataset2.csv"
headers, dataset = load_csv(filename)

# Convert categorical values into numbers.
# Codes follow the sorted order of each column's distinct values. Taking them
# from the iteration order of a set (as before) gives codes that can change
# from one interpreter run to the next, because string hashing is randomized
# per process; that made the printed tree irreproducible.
for col in range(len(headers)):
    unique_values = sorted(set(row[col] for row in dataset))
    # Dict lookup instead of list.index(): constant time per cell.
    code_of = {value: code for code, value in enumerate(unique_values)}
    for row in dataset:
        row[col] = code_of[row[col]]  # Encode category as its sorted position

# Define attributes (excluding the target column, which is the last one)
target_index = len(headers) - 1
attributes = list(range(len(headers) - 1))

# Build the decision tree
decision_tree = build_tree(dataset, attributes, target_index)

# Print the decision tree (keys alternate between column indices and value codes)
print("Decision Tree:")
print_tree(decision_tree)


Decision Tree:
6:
  0:
    4:
      0:
        -> 1
      1:
        2:
          0:
            5:
              0:
                1:
                  0:
                    -> 0
                  1:
                    8:
                      0:
                        -> 1
                      1:
                        7:
                          0:
                            -> 1
                          1:
                            11:
                              0:
                                -> 1
                              1:
                                12:
                                  17:
                                    -> 1
                                  26:
                                    0:
                                      1:
                                        3:
                                          1:
                                            9:
                                              0:
                         