In [1]:
import pandas as pd
import math
from collections import Counter

# ======================================================
# Data Preprocessing
# ======================================================
def preprocess_data(df):
    # Filter to USD and USA for consistency
    df = df[(df['currency'] == 'usd') & (df['country'] == 'usa')]
    
    # Convert salary to categories
    def salary_category(salary):
        if salary < 50000:
            return 'Low'
        elif 50000 <= salary <= 100000:
            return 'Medium'
        else:
            return 'High'
    df['salary_category'] = df['annual salary'].apply(salary_category)
    
    # Select relevant features
    features = [
        'how old are you?',
        'industry',
        'overall years of professional experience',
        'highest level of education completed',
        'gender'
    ]
    target = 'salary_category'
    
    # Drop rows with missing values
    df = df[features + [target]].dropna()
    return df, features, target

# ======================================================
# ID3 Algorithm Implementation
# ======================================================
class DecisionNode:
    def __init__(self, feature=None, branches=None, label=None):
        self.feature = feature    # Feature to split on
        self.branches = branches  # Dict: {feature_value: child_node}
        self.label = label        # Leaf node label (salary category)

def entropy(labels):
    counts = Counter(labels)
    total = len(labels)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())

def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values = data[feature].unique()
    weighted_entropy = 0
    
    for value in values:
        subset = data[data[feature] == value]
        subset_entropy = entropy(subset[target])
        weighted_entropy += (len(subset) / len(data)) * subset_entropy
    
    return total_entropy - weighted_entropy

def id3(data, features, target):
    # Base cases
    labels = data[target]
    if len(set(labels)) == 1:
        return DecisionNode(label=labels.iloc[0])
    if not features:
        majority_label = Counter(labels).most_common(1)[0][0]
        return DecisionNode(label=majority_label)
    
    # Select best feature (max information gain)
    gains = {feature: information_gain(data, feature, target) for feature in features}
    best_feature = max(gains, key=gains.get)
    
    # Recursively build the tree
    branches = {}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        remaining_features = [f for f in features if f != best_feature]
        branches[value] = id3(subset, remaining_features, target)
    
    return DecisionNode(feature=best_feature, branches=branches)

def predict(tree, sample):
    if tree.label is not None:
        return tree.label
    feature_value = sample[tree.feature]
    if feature_value not in tree.branches:
        return Counter(tree.branches.values()).most_common(1)[0][0].label
    return predict(tree.branches[feature_value], sample)

# ======================================================
# Example Usage
# ======================================================
if __name__ == "__main__":
    # Load and preprocess data (replace with your actual dataset)
    df = pd.DataFrame([your_dataset_here])  # Use the provided dataset
    df, features, target = preprocess_data(df)
    
    # Split data (train/test)
    train = df.sample(frac=0.8, random_state=42)
    test = df.drop(train.index)
    
    # Build the tree
    tree = id3(train, features, target)
    
    # Evaluate accuracy
    correct = 0
    for _, row in test.iterrows():
        prediction = predict(tree, row)
        if prediction == row[target]:
            correct += 1
    print(f"Accuracy: {correct / len(test):.2f}")

    # Example prediction
    sample = {
        'how old are you?': '25-34',
        'industry': 'computing or tech',
        'overall years of professional experience': '5-7 years',
        'highest level of education completed': 'master\'s degree',
        'gender': 'woman'
    }
    print(f"Predicted Salary Category: {predict(tree, sample)}")

NameError: name 'your_dataset_here' is not defined