<a href="https://colab.research.google.com/github/Ahmadrafifaqiri/Ahmadrafifaqiri/blob/main/HW_1%20Machine%20Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Decision Tree [40 points + 10 bonus]

In [69]:
import numpy as np
import pandas as pd

In [70]:
data = pd.DataFrame({
    'x1': [0, 0, 0, 1, 0, 1, 0],
    'x2': [0, 1, 0, 0, 1, 1, 1],
    'x3': [1, 0, 1, 0, 1, 0, 0],
    'x4': [0, 0, 1, 1, 0, 0, 1],
    'y': [0, 0, 1, 1, 0, 0, 0]
})

In [71]:
def entropy(y):
    probs = y.value_counts(normalize=True)
    return -np.sum(probs * np.log2(probs))

# Function to calculate information gain
def information_gain(data, attribute):
    # Calculate the entropy of the entire dataset
    total_entropy = entropy(data['y'])

    # Calculate the weighted entropy after splitting on the attribute
    values = data[attribute].unique()
    weighted_entropy = 0

    for value in values:
        subset = data[data[attribute] == value]
        prob = len(subset) / len(data)
        weighted_entropy += prob * entropy(subset['y'])

    # Information gain is the reduction in entropy
    return total_entropy - weighted_entropy

In [72]:
attributes = ['x1', 'x2', 'x3', 'x4']
for attr in attributes:
    gain = information_gain(data, attr)
    print(f'Information Gain for {attr}: {gain:.4f}')

Information Gain for x1: 0.0617
Information Gain for x2: 0.4696
Information Gain for x3: 0.0060
Information Gain for x4: 0.4696


In [73]:
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Mild', 'Cool', 'Mild', 'Mild', 'Cool', 'Cool', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong', 'Weak', 'Weak'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

def majority_error(df, column, target):
    """
    Calculate the Majority Error (ME) for a given attribute.
    """
    groups = df.groupby(column)[target].value_counts(normalize=True).unstack(fill_value=0)
    me = 1 - groups.max(axis=1)
    weighted_me = (df.groupby(column).size() / len(df)).dot(me)
    return weighted_me

def gini_index(df, column, target):
    """
    Calculate the Gini Index (GI) for a given attribute.
    """
    groups = df.groupby(column)[target].value_counts(normalize=True).unstack(fill_value=0)
    gini = 1 - (groups**2).sum(axis=1)
    weighted_gini = (df.groupby(column).size() / len(df)).dot(gini)
    return weighted_gini

In [74]:
# Calculate Majority Error for each attribute:
attributes = ['Outlook', 'Temperature', 'Humidity', 'Wind']
me_results = {attr: majority_error(df, attr, 'Play') for attr in attributes}

In [75]:
# Calculate Gini Index for each attribute:
gini_results = {attr: gini_index(df, attr, 'Play') for attr in attributes}

print("Majority Error for each attribute:")
for attr, me in me_results.items():
    print(f"{attr}: {me:.3f}")

print("\nGini Index for each attribute:")
for attr, gi in gini_results.items():
    print(f"{attr}: {gi:.3f}")

Majority Error for each attribute:
Outlook: 0.286
Temperature: 0.357
Humidity: 0.286
Wind: 0.357

Gini Index for each attribute:
Outlook: 0.343
Temperature: 0.388
Humidity: 0.367
Wind: 0.405


In [76]:
# Identify the best attribute based on ME and GI:
best_me_attr = min(me_results, key=me_results.get)
best_gini_attr = min(gini_results, key=gini_results.get)

print(f"\nBest attribute based on Majority Error: {best_me_attr}")
print(f"Best attribute based on Gini Index: {best_gini_attr}")


Best attribute based on Majority Error: Outlook
Best attribute based on Gini Index: Outlook


In [77]:
# Function to split dataset based on an attribute:
def split_dataset(df, column, value):
    """
    Split the dataset based on a given attribute and value.
    """
    return df[df[column] == value]

Split dataset based on the best attribute from Majority Error
For recursive splitting,repeat the process for each subset

In [78]:
subset = split_dataset(df, best_me_attr, df[best_me_attr].unique()[0])
print(f"\nSubset based on {best_me_attr} = {df[best_me_attr].unique()[0]}:")
print(subset)


Subset based on Outlook = Sunny:
   Outlook Temperature Humidity    Wind Play
0    Sunny         Hot     High    Weak   No
1    Sunny         Hot     High  Strong   No
7    Sunny        Mild     High    Weak   No
8    Sunny        Cool   Normal  Strong  Yes
10   Sunny        Mild   Normal    Weak  Yes


In [79]:
from collections import Counter
from math import log2

In [80]:
data = {
    'Outlook': ['Rain', 'Rain', 'Sunny', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Rain', 'Sunny', 'Rain', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Sunny'],
    'Temperature': ['Hot', 'Hot', 'Mild', 'Hot', 'Hot', 'Mild', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Cool', 'Mild', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'Normal', 'High', 'High', 'Normal', 'Normal', 'High', 'High', 'High'],
    'Wind': ['Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Strong', 'Weak'],
    'Play': ['No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No']
}

df = pd.DataFrame(data)

new_instance = {'Outlook': None, 'Temperature': 'Mild', 'Humidity': 'Normal', 'Wind': 'Weak', 'Play': 'Yes'}
most_common_outlook = df['Outlook'].mode()[0]
df.loc[len(df)] = [most_common_outlook, new_instance['Temperature'], new_instance['Humidity'], new_instance['Wind'], new_instance['Play']]

def entropy(attribute):
    values, counts = np.unique(attribute, return_counts=True)
    probs = counts / len(attribute)
    return -np.sum(probs * np.log2(probs))


def information_gain(df, feature, target):
    total_entropy = entropy(df[target])
    values, counts = np.unique(df[feature], return_counts=True)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) * entropy(df[df[feature] == values[i]][target]) for i in range(len(values))])
    return total_entropy - weighted_entropy

features = ['Outlook', 'Temperature', 'Humidity', 'Wind']
target = 'Play'
info_gains = {feature: information_gain(df, feature, target) for feature in features}

print("Information Gain for each feature:")
for feature, gain in info_gains.items():
    print(f"{feature}: {gain:.3f}")


best_feature = max(info_gains, key=info_gains.get)
print(f"\nThe best feature to split on is: {best_feature}")

Information Gain for each feature:
Outlook: 0.094
Temperature: 0.004
Humidity: 0.559
Wind: 0.041

The best feature to split on is: Humidity


In [81]:
data = {
    'Day': [1, 2, 7, 8, 9, 10, 11, 12, 14],
    'Outlook': ['Sunny', 'Sunny', 'Sunny', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Sunny', 'Sunny'],
    'Temperature': ['Hot', 'Hot', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild'],
    'Humidity': ['High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal'],
    'Wind': ['Weak', 'Strong', 'Strong', 'Weak', 'Strong', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes']
}

df = pd.DataFrame(data)

def entropy(y):
    """Calculate the entropy of labels."""
    proportions = y.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

def information_gain(df, feature, target):
    """Calculate the information gain of a feature."""
    # Calculate entropy of the target variable
    entropy_before = entropy(df[target])

    # Calculate weighted entropy after split
    values = df[feature].unique()
    entropy_after = 0
    for value in values:
        subset = df[df[feature] == value]
        entropy_after += (len(subset) / len(df)) * entropy(subset[target])

    # Information gain
    return entropy_before - entropy_after

# Calculate information gain for each feature
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']
target = 'PlayTennis'

info_gains = {}
for feature in features:
    info_gains[feature] = information_gain(df, feature, target)

# Find the feature with the highest information gain
best_feature = max(info_gains, key=info_gains.get)
print(f"The best feature to split on is: {best_feature} with an information gain of {info_gains[best_feature]}")

# Build the tree for the best feature
def build_tree(df, feature, target):
    """Build a simple decision tree based on the best feature."""
    tree = {}
    values = df[feature].unique()
    for value in values:
        subset = df[df[feature] == value]
        if subset[target].nunique() == 1:
            tree[value] = subset[target].iloc[0]
        else:
            # Recursively build the tree for each subset
            subtree = {value: build_tree(subset, feature, target)}
            tree.update(subtree)
    return tree

# Build and print the decision tree
decision_tree = build_tree(df, best_feature, target)
print("Decision Tree:")
print(decision_tree)

The best feature to split on is: Humidity with an information gain of 0.9182958340544896
Decision Tree:
{'High': 'No', 'Normal': 'Yes'}


In [82]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [84]:
import pandas as pd
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

train_file = '/content/drive/My Drive/bank/train.csv'
test_file = '/content/drive/My Drive/bank/test.csv'
train_file = '/content/drive/My Drive/car/train.csv'
test_file = '/content/drive/My Drive/car/test.csv'


In [85]:
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [86]:
# Display the first few rows of the data

In [114]:
import pandas as pd

# Define file paths
train_file = '/content/drive/My Drive/car/train.csv'
test_file = '/content/drive/My Drive/car/test.csv'

# Load the datasets using the specified structure
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Display the first few rows of the training DataFrame
train_df.head()

Unnamed: 0,low,vhigh,4,4.1,big,med,acc
0,low,high,5more,4,med,high,vgood
1,vhigh,med,2,2,big,high,unacc
2,high,high,2,2,small,high,unacc
3,vhigh,low,3,2,big,low,unacc
4,high,high,3,4,med,low,unacc


In [115]:

test_df.head()

Unnamed: 0,vhigh,high,5more,2,small,low,unacc
0,low,low,5more,2,small,med,unacc
1,low,vhigh,4,2,med,low,unacc
2,high,vhigh,3,4,med,med,unacc
3,vhigh,low,4,4,med,low,unacc
4,high,vhigh,5more,4,med,low,unacc


In [89]:
car_train_file = '/content/drive/My Drive/car/train.csv'
car_test_file = '/content/drive/My Drive/car/test.csv'
bank_train_file = '/content/drive/My Drive/bank/train.csv'
bank_test_file = '/content/drive/My Drive/bank/test.csv'

In [90]:
car_train_df.isnull().sum()
car_test_df.isnull().sum()

Unnamed: 0,0
vhigh,0
high,0
5more,0
2,0
small,0
low,0
unacc,0


In [91]:
print(car_train_df.columns)
print(car_test_df.columns)

Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')


In [92]:
car_X_train = pd.get_dummies(car_train_df.drop('acc', axis=1))
car_y_train = car_train_df['acc']

car_X_test = pd.get_dummies(car_test_df.drop('unacc', axis=1))
car_y_test = car_test_df['unacc']

In [93]:
car_X_train, car_X_test = car_X_train.align(car_X_test, join='left', axis=1, fill_value=0)

In [94]:
# Load the datasets
car_train_df = pd.read_csv(car_train_file)
car_test_df = pd.read_csv(car_test_file)
bank_train_df = pd.read_csv(bank_train_file)
bank_test_df = pd.read_csv(bank_test_file)

In [95]:
# Define a function to calculate entropy
def entropy(y):
    proportions = y.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

# Define a function to calculate information gain
def information_gain(X, y, feature):
    original_entropy = entropy(y)
    values = X[feature].unique()
    weighted_entropy = 0
    for value in values:
        subset_y = y[X[feature] == value]
        weighted_entropy += (len(subset_y) / len(y)) * entropy(subset_y)
    return original_entropy - weighted_entropy

# Define a function to calculate gini index
def gini_index(y):
    proportions = y.value_counts(normalize=True)
    return 1 - np.sum(proportions ** 2)

# Define a function to calculate gini gain
def gini_gain(X, y, feature):
    original_gini = gini_index(y)
    values = X[feature].unique()
    weighted_gini = 0
    for value in values:
        subset_y = y[X[feature] == value]
        weighted_gini += (len(subset_y) / len(y)) * gini_index(subset_y)
    return original_gini - weighted_gini

# Define the ID3 algorithm with heuristics
class DecisionTreeID3:
    def __init__(self, max_depth=None, criterion='information_gain'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        if len(y.unique()) == 1:
            return y.mode()[0]

        if self.max_depth and depth >= self.max_depth:
            return y.mode()[0]

        if X.empty:
            return y.mode()[0]

        best_feature = self._choose_best_feature(X, y)
        tree = {best_feature: {}}
        for value in X[best_feature].unique():
            subset_X = X[X[best_feature] == value].drop(best_feature, axis=1)
            subset_y = y[X[best_feature] == value]
            tree[best_feature][value] = self._build_tree(subset_X, subset_y, depth + 1)
        return tree

    def _choose_best_feature(self, X, y):
        if self.criterion == 'information_gain':
            return max(X.columns, key=lambda feature: information_gain(X, y, feature))
        elif self.criterion == 'gini_index':
            return max(X.columns, key=lambda feature: gini_gain(X, y, feature))
        else:
            raise ValueError("Unsupported criterion: {}".format(self.criterion))

    def predict(self, X):
        return X.apply(lambda row: self._predict_single(row, self.tree), axis=1)

    def _predict_single(self, row, tree):
        if not isinstance(tree, dict):
            return tree

        feature = list(tree.keys())[0]
        value = row[feature]
        subtree = tree[feature].get(value, None)

        if subtree is None:
            return None

        return self._predict_single(row, subtree)

In [96]:
# One-hot encode the training and test datasets
car_X_train = pd.get_dummies(car_train_df.drop('acc', axis=1))
car_y_train = car_train_df['acc']

car_X_test = pd.get_dummies(car_test_df.drop('unacc', axis=1))
car_y_test = car_test_df['unacc']

# Ensure that the training and test sets have the same columns
car_X_test = car_X_test.reindex(columns=car_X_train.columns, fill_value=0)

# Define a function to evaluate the model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    return train_accuracy, test_accuracy

# Training and testing decision tree on car dataset
depths = range(1, 7)
criteria = ['information_gain', 'gini_index']

results = []

for criterion in criteria:
    for depth in depths:
        model = DecisionTreeID3(max_depth=depth, criterion=criterion)
        train_accuracy, test_accuracy = evaluate_model(model, car_X_train, car_y_train, car_X_test, car_y_test)
        results.append({'Depth': depth, 'Criterion': criterion, 'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})

# Display results
results_df = pd.DataFrame(results)
print(results_df)

    Depth         Criterion  Train Accuracy  Test Accuracy
0       1  information_gain        0.698699       0.702889
1       2  information_gain        0.777778       0.222834
2       3  information_gain        0.807808       0.374140
3       4  information_gain        0.807808       0.257221
4       5  information_gain        0.865866       0.257221
5       6  information_gain        0.880881       0.257221
6       1        gini_index        0.698699       0.702889
7       2        gini_index        0.777778       0.222834
8       3        gini_index        0.807808       0.374140
9       4        gini_index        0.823824       0.331499
10      5        gini_index        0.858859       0.257221
11      6        gini_index        0.898899       0.299862


In [62]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Debugging print statements
    print("Train predictions:", train_predictions.unique())
    print("Test predictions:", test_predictions.unique())

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    return train_accuracy, test_accuracy

In [27]:
print(bank_train_df.columns)
print(bank_test_df.columns)

Index(['41', 'services', 'married', 'secondary', 'no', '0', 'yes', 'no.1',
       'unknown', '5', 'may', '114', '2', '-1', '0.1', 'unknown.1', 'no.2'],
      dtype='object')
Index(['41', 'management', 'single', 'secondary', 'no', '764', 'no.1', 'no.2',
       'cellular', '12', 'jun', '230', '2', '-1', '0', 'unknown', 'no.3'],
      dtype='object')


In [28]:
print(bank_train_df.head())
print(bank_test_df.head())

   41     services   married  secondary  no     0  yes no.1   unknown   5  \
0  48  blue-collar    single  secondary  no   312  yes  yes  cellular   3   
1  55   technician   married  secondary  no  1938   no  yes  cellular  18   
2  54       admin.   married   tertiary  no    59  yes   no  cellular  10   
3  34   management    single   tertiary  no  2646   no   no  cellular  14   
4  49       admin.  divorced  secondary  no  1709  yes   no   unknown  12   

   may  114  2   -1  0.1 unknown.1 no.2  
0  feb  369  2   -1    0   unknown   no  
1  aug  193  1  386    3   success  yes  
2  jul  268  1   -1    0   unknown   no  
3  apr  142  1   -1    0   unknown  yes  
4  jun  106  1   -1    0   unknown   no  
   41    management   single  secondary  no   764 no.1 no.2   cellular  12  \
0  39   blue-collar  married  secondary  no    49  yes   no   cellular  14   
1  60       retired  married    primary  no     0   no   no  telephone  30   
2  31  entrepreneur   single   tertiary  no   247  

In [63]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Clean None values from predictions and true values
    train_predictions = pd.Series(train_predictions).replace({None: 'unknown'})
    test_predictions = pd.Series(test_predictions).replace({None: 'unknown'})
    y_train = y_train.replace({None: 'unknown'})
    y_test = y_test.replace({None: 'unknown'})

    # Debugging print statements
    print("Train predictions:", train_predictions.unique())
    print("Test predictions:", test_predictions.unique())

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    return train_accuracy, test_accuracy

# Ensure there are no None values in y_train and y_test
bank_y_train = bank_y_train.replace({None: 'unknown'})
bank_y_test = bank_y_test.replace({None: 'unknown'})

# Continue with training and evaluation
results_bank = []

for criterion in criteria:
    for depth in range(1, 17):
        model = DecisionTreeID3(max_depth=depth, criterion=criterion)
        train_accuracy, test_accuracy = evaluate_model(model, bank_X_train, bank_y_train, bank_X_test, bank_y_test)
        results_bank.append({'Depth': depth, 'Criterion': criterion, 'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})

# Display results
results_bank_df = pd.DataFrame(results_bank)
print(results_bank_df)

Train predictions: ['no' 'yes']
Test predictions: ['no']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' 'yes']
Train predictions: ['no' 'yes']
Test predictions: ['unknown' 'no' '

In [None]:
# Print the column names of the training and test data
print("Car Training Data Columns:", car_train_df.columns)
print("Car Test Data Columns:", car_test_df.columns)

Car Training Data Columns: Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Car Test Data Columns: Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')


In [66]:
# Load the datasets
car_train_df = pd.read_csv(car_train_file)
car_test_df = pd.read_csv(car_test_file)

# Print column names to verify
print("Car Training Data Columns:", car_train_df.columns)
print("Car Test Data Columns:", car_test_df.columns)

# Define a function to calculate entropy
def entropy(y):
    proportions = y.value_counts(normalize=True)
    return -np.sum(proportions * np.log2(proportions))

# Define a function to calculate information gain
def information_gain(X, y, feature):
    original_entropy = entropy(y)
    values = X[feature].unique()
    weighted_entropy = 0
    for value in values:
        subset_y = y[X[feature] == value]
        weighted_entropy += (len(subset_y) / len(y)) * entropy(subset_y)
    return original_entropy - weighted_entropy

# Define a function to calculate gini index
def gini_index(y):
    proportions = y.value_counts(normalize=True)
    return 1 - np.sum(proportions ** 2)

# Define a function to calculate gini gain
def gini_gain(X, y, feature):
    original_gini = gini_index(y)
    values = X[feature].unique()
    weighted_gini = 0
    for value in values:
        subset_y = y[X[feature] == value]
        weighted_gini += (len(subset_y) / len(y)) * gini_index(subset_y)
    return original_gini - weighted_gini

# Define the ID3 algorithm with heuristics
class DecisionTreeID3:
    def __init__(self, max_depth=None, criterion='information_gain'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        if len(y.unique()) == 1:
            return y.mode()[0]

        if self.max_depth and depth >= self.max_depth:
            return y.mode()[0]

        if X.empty:
            return y.mode()[0]

        best_feature = self._choose_best_feature(X, y)
        tree = {best_feature: {}}
        for value in X[best_feature].unique():
            subset_X = X[X[best_feature] == value].drop(best_feature, axis=1)
            subset_y = y[X[best_feature] == value]
            tree[best_feature][value] = self._build_tree(subset_X, subset_y, depth + 1)
        return tree

Car Training Data Columns: Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Car Test Data Columns: Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')


a. ID3 Algorithm with Categorical Attributes

In [100]:

from sklearn.metrics import accuracy_score
from typing import Any, Dict, Tuple, Union

class DecisionTreeID3:
    def __init__(self, max_depth: int = None, criterion: str = 'information_gain'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None

    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.tree = self._fit(X, y, depth=0)

    def _fit(self, X: pd.DataFrame, y: pd.Series, depth: int) -> Dict[str, Any]:
        if len(set(y)) == 1:
            return {'label': y.iloc[0]}

        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': y.mode()[0]}

        best_attribute = self._select_best_attribute(X, y)
        tree = {best_attribute: {}}
        for value in X[best_attribute].unique():
            sub_X = X[X[best_attribute] == value].drop(columns=[best_attribute])
            sub_y = y[X[best_attribute] == value]
            tree[best_attribute][value] = self._fit(sub_X, sub_y, depth + 1)

        return tree

    def _select_best_attribute(self, X: pd.DataFrame, y: pd.Series) -> str:
        if self.criterion == 'information_gain':
            return self._best_information_gain(X, y)
        elif self.criterion == 'gini_index':
            return self._best_gini_index(X, y)
        elif self.criterion == 'majority_error':
            return self._best_majority_error(X, y)
        else:
            raise ValueError("Unknown criterion")

    def _best_information_gain(self, X: pd.DataFrame, y: pd.Series) -> str:
        base_entropy = self._entropy(y)
        best_gain = 0
        best_attribute = None
        for attribute in X.columns:
            new_entropy = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                new_entropy += (len(sub_y) / len(y)) * self._entropy(sub_y)
            info_gain = base_entropy - new_entropy
            if info_gain > best_gain:
                best_gain = info_gain
                best_attribute = attribute
        return best_attribute

    def _best_gini_index(self, X: pd.DataFrame, y: pd.Series) -> str:
        best_gini = float('inf')
        best_attribute = None
        for attribute in X.columns:
            gini = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                prob = len(sub_y) / len(y)
                gini += prob * (1 - sum([(sub_y.value_counts() / len(sub_y))**2]))
            if gini < best_gini:
                best_gini = gini
                best_attribute = attribute
        return best_attribute

    def _best_majority_error(self, X: pd.DataFrame, y: pd.Series) -> str:
        best_error = float('inf')
        best_attribute = None
        for attribute in X.columns:
            error = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                majority_class = sub_y.mode()[0]
                error += (len(sub_y) / len(y)) * (1 - (sub_y == majority_class).mean())
            if error < best_error:
                best_error = error
                best_attribute = attribute
        return best_attribute

    def _entropy(self, y: pd.Series) -> float:
        probs = y.value_counts(normalize=True)
        return -sum(probs * np.log2(probs + 1e-9))

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return X.apply(self._predict_row, axis=1)

    def _predict_row(self, row: pd.Series) -> Any:
        tree = self.tree
        while isinstance(tree, dict):
            attribute = list(tree.keys())[0]
            value = row[attribute]
            tree = tree[attribute].get(value, {'label': 'unknown'})
        return tree['label']


b. Evaluate Decision Trees for Car Datase

In [112]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from typing import Any, Dict, Tuple

class DecisionTreeID3:
    def __init__(self, max_depth: int = None, criterion: str = 'information_gain'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None
        self.numerical_attributes = []
        self.categorical_attributes = []

    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.numerical_attributes = [col for col in X.columns if X[col].dtype in [np.float64, np.int64]]
        self.categorical_attributes = [col for col in X.columns if X[col].dtype == object]
        self.tree = self._fit(X, y, depth=0)
        print("Decision Tree Structure:", self.tree)  # Debug print

    def _fit(self, X: pd.DataFrame, y: pd.Series, depth: int) -> Dict[str, Any]:
        if len(set(y)) == 1:
            return {'label': y.iloc[0]}

        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': y.mode()[0]}

        best_attribute = self._select_best_attribute(X, y)
        tree = {best_attribute: {}}

        if best_attribute in self.numerical_attributes:
            median = X[best_attribute].median()
            for condition in [lambda x: x <= median, lambda x: x > median]:
                sub_X = X[X[best_attribute].apply(condition)].drop(columns=[best_attribute])
                sub_y = y[X[best_attribute].apply(condition)]
                tree[best_attribute][condition.__name__] = self._fit(sub_X, sub_y, depth + 1)
        else:
            for value in X[best_attribute].unique():
                sub_X = X[X[best_attribute] == value].drop(columns=[best_attribute])
                sub_y = y[X[best_attribute] == value]
                tree[best_attribute][value] = self._fit(sub_X, sub_y, depth + 1)

        return tree

    def _select_best_attribute(self, X: pd.DataFrame, y: pd.Series) -> str:
        if self.criterion == 'information_gain':
            return self._best_information_gain(X, y)
        elif self.criterion == 'gini_index':
            return self._best_gini_index(X, y)
        elif self.criterion == 'majority_error':
            return self._best_majority_error(X, y)
        else:
            raise ValueError("Unknown criterion")

    def _best_information_gain(self, X: pd.DataFrame, y: pd.Series) -> str:
        base_entropy = self._entropy(y)
        best_gain = 0
        best_attribute = None
        for attribute in X.columns:
            new_entropy = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                new_entropy += (len(sub_y) / len(y)) * self._entropy(sub_y)
            info_gain = base_entropy - new_entropy
            if info_gain > best_gain:
                best_gain = info_gain
                best_attribute = attribute
        return best_attribute

    def _best_gini_index(self, X: pd.DataFrame, y: pd.Series) -> str:
        best_gini = float('inf')
        best_attribute = None
        for attribute in X.columns:
            gini = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                prob = len(sub_y) / len(y)
                gini += prob * (1 - sum([(sub_y.value_counts() / len(sub_y))**2]))
            if gini < best_gini:
                best_gini = gini
                best_attribute = attribute
        return best_attribute

    def _best_majority_error(self, X: pd.DataFrame, y: pd.Series) -> str:
        best_error = float('inf')
        best_attribute = None
        for attribute in X.columns:
            error = 0
            for value in X[attribute].unique():
                sub_y = y[X[attribute] == value]
                majority_class = sub_y.mode()[0]
                error += (len(sub_y) / len(y)) * (1 - (sub_y == majority_class).mean())
            if error < best_error:
                best_error = error
                best_attribute = attribute
        return best_attribute

    def _entropy(self, y: pd.Series) -> float:
        probs = y.value_counts(normalize=True)
        return -sum(probs * np.log2(probs + 1e-9))

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return X.apply(self._predict_row, axis=1)

    def _predict_row(self, row: pd.Series) -> Any:
        tree = self.tree
        while isinstance(tree, dict):
            attribute = list(tree.keys())[0]
            value = row[attribute]
            if attribute in self.numerical_attributes:
                condition = lambda x: x <= row[attribute] if 'le' in list(tree[attribute].keys())[0] else lambda x: x > row[attribute]
                next_node = tree[attribute].get(condition.__name__)
            else:
                next_node = tree[attribute].get(value)

            if isinstance(next_node, dict):
                tree = next_node
            else:
                return next_node.get('label', 'unknown')
        return tree.get('label', 'unknown')

def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    return train_accuracy, test_accuracy

# Load car dataset with the correct paths
car_train_df = pd.read_csv('/content/drive/My Drive/car/train.csv')
car_test_df = pd.read_csv('/content/drive/My Drive/car/test.csv')

# Print column names to check if 'acc' is present
print("Training Data Columns:", car_train_df.columns)
print("Test Data Columns:", car_test_df.columns)

# Ensure the 'acc' column exists in the training dataset and map to the test dataset
if 'acc' not in car_train_df.columns:
    raise KeyError("'acc' column not found in the training dataset")

# Map the training columns to the test dataset columns
car_X_train = car_train_df.drop('acc', axis=1)
car_y_train = car_train_df['acc']
car_X_test = car_test_df.copy()
car_y_test = car_test_df['unacc']  # Assuming 'unacc' is the label column in the test set

# Align the columns of the test dataset with the training dataset
car_X_test = car_X_test.reindex(columns=car_X_train.columns, fill_value='missing')

depths = range(1, 7)
criteria = ['information_gain', 'gini_index', 'majority_error']

results = []

for criterion in criteria:
    for depth in depths:
        model = DecisionTreeID3(max_depth=depth, criterion=criterion)
        train_accuracy, test_accuracy = evaluate_model(model, car_X_train, car_y_train, car_X_test, car_y_test)
        results.append({'Depth': depth, 'Criterion': criterion, 'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})

# Display results
results_df = pd.DataFrame(results)
print(results_df)

Training Data Columns: Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Test Data Columns: Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')
Decision Tree Structure: {'med': {'high': {'label': 'unacc'}, 'low': {'label': 'unacc'}, 'med': {'label': 'unacc'}}}


KeyError: 'label'

#3. Modify for Numerical Attributes and Missing Values.

a. Numerical Attributes

In [None]:
class DecisionTreeID3:
    def __init__(self, max_depth: int = None, criterion: str = 'information_gain'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None
        self.numerical_attributes = []

    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.numerical_attributes = [col for col in X.columns if X[col].dtype in [np.float64, np.int64]]
        self.tree = self._fit(X, y, depth=0)

    def _fit(self, X: pd.DataFrame, y: pd.Series, depth: int) -> Dict[str, Any]:
        if len(set(y)) == 1:
            return {'label': y.iloc[0]}

        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': y.mode()[0]}

        best_attribute = self._select_best_attribute(X, y)
        tree = {best_attribute: {}}
        if best_attribute in self.numerical_attributes:
            median = X[best_attribute].median()
            for condition in [lambda x: x <= median, lambda x: x > median]:
                sub_X = X[X[best_attribute].apply(condition)].drop(columns=[best_attribute])
                sub_y = y[X[best_attribute].apply(condition)]
                tree[best_attribute][condition.__name__] = self._fit(sub_X, sub_y, depth + 1)
        else:
            for value in X[best_attribute].unique():
                sub_X = X[X[best_attribute] == value].drop(columns=[best_attribute])
                sub_y = y[X[best_attribute] == value]
                tree[best_attribute][value] = self._fit(sub_X, sub_y, depth + 1)

        return tree

    def _predict_row(self, row: pd.Series) -> Any:
        tree = self.tree
        while isinstance(tree, dict):
            attribute = list(tree.keys())[0]
            value = row[attribute]
            if attribute in self.numerical_attributes:
                condition = lambda x: x <= row[attribute] if 'le' in list(tree[attribute].keys())[0] else lambda x: x > row[attribute]
                tree = tree[attribute].get(condition.__name__, {'label': 'unknown'})
            else:
                tree = tree[attribute].get(value, {'label': 'unknown'})
        return tree['label']

b. Missing Values and Evaluation for Bank Dataset

In [103]:
# Load bank dataset
bank_train_df = pd.read_csv('path_to_bank_train.csv')
bank_test_df = pd.read_csv('path_to_bank_test.csv')

# Handle missing values
def handle_missing_values(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column].replace('unknown', np.nan, inplace=True)
            df[column].fillna(df[column].mode()[0], inplace=True)
    return df

bank_train_df = handle_missing_values(bank_train_df)
bank_test_df = handle_missing_values(bank_test_df)

# Prepare data
bank_X_train = bank_train_df.drop('y', axis=1)
bank_y_train = bank_train_df['y']
bank_X_test = bank_test_df.drop('y', axis=1)
bank_y_test = bank_test_df['y']

results_bank = []

for criterion in criteria:
    for depth in range(1, 17):
        model = DecisionTreeID3(max_depth=depth, criterion=criterion)
        train_accuracy, test_accuracy = evaluate_model(model, bank_X_train, bank_y_train, bank_X_test, bank_y_test)
        results_bank.append({'Depth': depth, 'Criterion': criterion, 'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})

# Display results
results_bank_df = pd.DataFrame(results_bank)
print(results_bank_df)

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_bank_train.csv'

In [104]:
print("Training Data Columns:", car_train_df.columns)
print("Test Data Columns:", car_test_df.columns)


Training Data Columns: Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Test Data Columns: Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')


In [105]:
car_train_df.columns = car_train_df.columns.str.strip()
car_test_df.columns = car_test_df.columns.str.strip()

In [106]:
# Load car dataset with the correct paths
car_train_df = pd.read_csv('/content/drive/My Drive/car/train.csv')
car_test_df = pd.read_csv('/content/drive/My Drive/car/test.csv')

# Debug column names
print("Training Data Columns:", car_train_df.columns)
print("Test Data Columns:", car_test_df.columns)

# Strip any extra spaces from column names
car_train_df.columns = car_train_df.columns.str.strip()
car_test_df.columns = car_test_df.columns.str.strip()

# Align the columns of the test dataset with the training dataset
car_test_df = car_test_df.reindex(columns=car_train_df.columns, fill_value='missing')

# Prepare data
car_X_train = car_train_df.drop('acc', axis=1)
car_y_train = car_train_df['acc']
car_X_test = car_test_df.drop('acc', axis=1)
car_y_test = car_test_df['acc']


Training Data Columns: Index(['low', 'vhigh', '4', '4.1', 'big', 'med', 'acc'], dtype='object')
Test Data Columns: Index(['vhigh', 'high', '5more', '2', 'small', 'low', 'unacc'], dtype='object')


In [107]:
# Identify columns in training set but not in test set
missing_cols = [col for col in car_X_train.columns if col not in car_X_test.columns]
print("Missing Columns in Test Set:", missing_cols)

Missing Columns in Test Set: []
