In [1]:
import pandas as pd
import math
from collections import Counter
import graphviz
from graphviz import Digraph

# ======================================================
# Data Preprocessing
# ======================================================
def preprocess_data(df):
    # Filter to USD and USA
    df = df[(df['currency'] == 'usd') & (df['country'] == 'usa')]
    
    # Convert salary to categories
    def salary_category(salary):
        if salary < 50000:
            return 'Low'
        elif 50000 <= salary <= 100000:
            return 'Medium'
        else:
            return 'High'
    df['salary_category'] = df['annual salary'].apply(salary_category)
    
    # Select features
    features = [
        'how old are you?',
        'industry',
        'overall years of professional experience',
        'highest level of education completed',
        'gender'
    ]
    target = 'salary_category'
    
    # Drop missing values
    df = df[features + [target]].dropna()
    return df, features, target

# ======================================================
# ID3 Algorithm Implementation
# ======================================================
class DecisionNode:
    def __init__(self, feature=None, branches=None, label=None):
        self.feature = feature
        self.branches = branches or {}
        self.label = label

def entropy(labels):
    counts = Counter(labels)
    total = len(labels)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())

def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values = data[feature].unique()
    weighted_entropy = 0
    
    for value in values:
        subset = data[data[feature] == value]
        subset_entropy = entropy(subset[target])
        weighted_entropy += (len(subset) / len(data)) * subset_entropy
    
    return total_entropy - weighted_entropy

def id3(data, features, target):
    labels = data[target]
    
    # Base cases
    if len(set(labels)) == 1:
        return DecisionNode(label=labels.iloc[0])
    if not features:
        majority_label = Counter(labels).most_common(1)[0][0]
        return DecisionNode(label=majority_label)
    
    # Select best feature
    gains = {feature: information_gain(data, feature, target) for feature in features}
    best_feature = max(gains, key=gains.get)
    
    # Recursively build tree
    branches = {}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        remaining_features = [f for f in features if f != best_feature]
        branches[value] = id3(subset, remaining_features, target)
    
    return DecisionNode(feature=best_feature, branches=branches)

# ======================================================
# Tree Visualization with Graphviz
# ======================================================
def visualize_tree(tree, feature_names=None):
    dot = Digraph(comment='Decision Tree')
    
    def add_node(node, parent=None, edge_label=None):
        node_id = str(id(node))
        if node.label is not None:
            dot.node(node_id, label=node.label, shape='box')
        else:
            dot.node(node_id, label=node.feature)
        
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)
        
        for value, child in node.branches.items():
            add_node(child, node_id, str(value))
    
    add_node(tree)
    return dot

# ======================================================
# Example Usage
# ======================================================
if __name__ == "__main__":
    # Load dataset (replace with your actual data)
    df = pd.read_excel('Ask A Manager Salary Survey 2021 (Responses).xlsx', sheet_name='Form Responses 1')
    
    # Preprocess data
    df, features, target = preprocess_data(df)
    
    # Build the tree
    tree = id3(df, features, target)
    
    # Visualize the tree
    dot = visualize_tree(tree)
    dot.render('salary_decision_tree', view=True, format='png')  # Saves as PNG
    print("Decision tree saved as 'salary_decision_tree.png'")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['salary_category'] = df['annual salary'].apply(salary_category)
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.444648 to fit


Decision tree saved as 'salary_decision_tree.png'
