<a href="https://colab.research.google.com/github/AsraniSanjana/All_Codes/blob/main/All_Semester_Codes/ML_sem7/models/final_id3_c4_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pandas as pd
import math



# Define the Node class
class Node:
    """A single vertex of the decision tree.

    Attributes are set in ``__init__``:
      * ``value``    - payload given by the caller (``None`` when omitted).
      * ``branches`` - mapping of child nodes, created empty for every
        instance so nodes never share the same dict.
    """

    def __init__(self, value=None):
        # Fresh dict per instance; children are attached by the tree builder.
        self.branches = dict()
        self.value = value

# Calculate the split info for a dataset
def calculate_split_info(data_df, attribute):
    """Return the split information of ``attribute`` over ``data_df``.

    Split information is ``-sum(r * log2(r))`` where ``r`` is the fraction of
    all rows of ``data_df`` that carry a given value of ``attribute``.

    Args:
        data_df: pandas DataFrame holding the dataset.
        attribute: Name of the column whose value distribution is measured.

    Returns:
        The split information; ``0`` when the frame has no rows.

    Note:
        Rows whose ``attribute`` is missing (NaN) are not matched by any value
        and therefore contribute no term; fractions are still taken over all
        rows of ``data_df``.
    """
    attribute_values = data_df[attribute].unique()
    total_instances = len(data_df)
    split_info = 0
    for value in attribute_values:
        subset_instances = len(data_df[data_df[attribute] == value])
        if subset_instances == 0:
            # unique() reports NaN, but NaN never compares equal, so the
            # filter selects nothing; log2(0) would raise a math domain error.
            continue
        ratio = subset_instances / total_instances
        split_info -= ratio * math.log2(ratio)
    return split_info

# Function to get gain ratio
def get_gain(attribute_name):
    """Print the gain-ratio worksheet for one attribute of the global dataset.

    Reads the module-level ``data_df`` (the dataset) and ``target_column``
    (name of the class column). For every distinct value of
    ``attribute_name`` one table row is printed with the per-class row
    counts, the subset size, the subset entropy and the subset weight. The
    information gain, split info and gain ratio follow the table.

    Args:
        attribute_name: Column of ``data_df`` to evaluate.

    Returns:
        None. All results are written to standard output.
    """
    total_instances = len(data_df)
    total_entropy = calculate_entropy(data_df, target_column)
    class_labels = list(data_df[target_column].unique())

    # Single pass over the attribute values: collect the table rows and
    # accumulate the weighted entropy and the split info at the same time.
    rows = []
    total_weighted_entropy = 0
    split_info = 0
    for value in data_df[attribute_name].unique():
        subset = data_df[data_df[attribute_name] == value]
        subset_size = len(subset)
        if subset_size == 0:
            # unique() reports NaN, but NaN never compares equal, so the
            # filter selects nothing; log2(0) would raise a math domain error.
            continue
        subset_entropy = calculate_entropy(subset, target_column)
        weight = subset_size / total_instances
        total_weighted_entropy += subset_entropy * weight
        split_info -= weight * math.log2(weight)
        class_counts = [subset[subset[target_column] == label].shape[0] for label in class_labels]
        rows.append((value, *class_counts, subset_size, subset_entropy, weight))

    info_gain = total_entropy - total_weighted_entropy
    # An attribute with a single value has zero split info (and zero gain);
    # report a ratio of 0 instead of dividing by zero.
    gain_ratio = info_gain / split_info if split_info else 0.0

    header = [attribute_name] + class_labels + ['Total', 'Entropy', 'Weight']
    # str() so that non-string class labels (e.g. 0/1) can be measured too.
    column_width = max(len(str(name)) for name in header) + 2
    table_format = '|'.join('{{:<{}}}'.format(column_width) for _ in range(len(header)))

    print("Values Counts, Entropies, and Weights for '{}' attribute:".format(attribute_name))
    print(table_format.format(*header))
    print('|' + '=' * (column_width * len(header) + len(header) - 1) + '|')

    for row in rows:
        print(table_format.format(*row))

    # Gain = entropy of the whole set minus the weighted entropy of the split.
    print(f"\nHence, Gain({attribute_name}) becomes {total_entropy:.4f} - {total_weighted_entropy:.4f} = {info_gain:.4f}")
    print(f"Split Info({attribute_name}) = {split_info:.4f}")
    print(f"Gain Ratio({attribute_name}) = {info_gain:.4f} / {split_info:.4f} = {gain_ratio:.4f}")
    print("\n" + "=" * 119)


# Calculate the entropy of a dataset
def calculate_entropy(data_df, target_column):
    """Return the entropy (base 2) of the ``target_column`` labels.

    Args:
        data_df: pandas DataFrame holding the rows to measure.
        target_column: Name of the class column.

    Returns:
        ``-sum(p * log2(p))`` over the label frequencies, where ``p`` is a
        label's count divided by the number of rows; ``0`` for an empty frame.
    """
    n_rows = len(data_df)
    label_frequencies = data_df[target_column].value_counts()

    result = 0
    for frequency in label_frequencies:
        p = frequency / n_rows
        result = result - p * math.log2(p)
    return result

# Calculate the information gain for an attribute
def calculate_information_gain(data_df, attribute, target_column):
    """Return the information gain obtained by splitting on ``attribute``.

    Args:
        data_df: pandas DataFrame holding the rows to split.
        attribute: Name of the column to split on.
        target_column: Name of the class column.

    Returns:
        Entropy of ``data_df`` minus the size-weighted sum of the entropies
        of the subsets produced by each distinct value of ``attribute``.
    """
    parent_entropy = calculate_entropy(data_df, target_column)
    n_rows = len(data_df)

    def weighted_child_entropy(value):
        # Contribution of one branch: (share of rows) * (entropy of those rows).
        child = data_df[data_df[attribute] == value]
        return (len(child) / n_rows) * calculate_entropy(child, target_column)

    children_entropy = sum(
        weighted_child_entropy(value) for value in data_df[attribute].unique()
    )
    return parent_entropy - children_entropy

# Function to print intermediate results
def print_intermediate_results(attribute, information_gain, selected_tuples):
    """Print one progress record of the tree construction.

    Args:
        attribute: Name of the attribute chosen for the split.
        information_gain: Gain of that attribute, shown with four decimals.
        selected_tuples: Rows that follow the branch; printed as-is.
    """
    report_lines = (
        "Chosen Attribute: {}".format(attribute),
        "Information Gain: {:.4f}".format(information_gain),
        "Selected Tuples:",
        selected_tuples,
        "=" * 40,
    )
    for line in report_lines:
        print(line)


# Build the decision tree recursively
def build_decision_tree(data_df, target_column, attributes):
    """Recursively build a decision tree by maximising information gain.

    Args:
        data_df: pandas DataFrame with the rows reaching this node.
        target_column: Name of the class column.
        attributes: Sequence of column names still available for splitting.

    Returns:
        A ``Node``. Leaves hold a class label in ``value``; inner nodes hold
        the chosen attribute name and map each attribute value to a child
        node in ``branches``.
    """
    labels = data_df[target_column]

    # Leaf: every row carries the same class.
    if labels.nunique() == 1:
        return Node(labels.iloc[0])

    # Leaf: nothing left to split on, fall back to the most frequent class.
    if len(attributes) == 0:
        return Node(labels.mode()[0])

    # Pick the attribute with the highest gain (first one wins on ties).
    gains = [
        calculate_information_gain(data_df, candidate, target_column)
        for candidate in attributes
    ]
    best_position = gains.index(max(gains))
    best_attribute = attributes[best_position]
    best_gain = gains[best_position]

    leftover = [
        name for position, name in enumerate(attributes)
        if position != best_position
    ]

    root = Node(best_attribute)
    for value in data_df[best_attribute].unique():
        partition = data_df[data_df[best_attribute] == value]
        if len(partition) == 0:
            # No rows follow this branch: label it with the majority class.
            root.branches[value] = Node(labels.mode()[0])
            continue
        print_intermediate_results(best_attribute, best_gain, partition)
        root.branches[value] = build_decision_tree(partition, target_column, leftover)

    return root


# Dataset file to load and the name of its class (label) column.
your_dataset_name = 'weather_forecast.csv'
last_class_colmn = 'Play'

# Alternative datasets: swap in one of these pairs to use another file.
# your_dataset_name='happy_sad.csv'
# last_class_colmn='emotions'

# your_dataset_name='Buy_Computer.csv'
# last_class_colmn='Buy_Computer'


data_df = pd.read_csv(your_dataset_name)

# Candidate split attributes: every column except the class column and 'id'.
feature_names = [
    column for column in data_df.columns
    if column != last_class_colmn and column != 'id'
]
attributes = feature_names
# get_gain reads data_df and target_column as module-level names.
target_column = last_class_colmn

decision_tree = build_decision_tree(data_df, target_column, attributes)

# Function to print the decision tree
def print_decision_tree(node, depth=0):
    """Print the tree rooted at ``node`` as an indented outline.

    Args:
        node: Object exposing ``value`` and a ``branches`` mapping of
            branch label -> child node.
        depth: Nesting level of ``node``; each level adds two spaces.

    A node's value is printed on its own line (skipped when it is ``None``).
    Each branch label is printed one level deeper, followed by ``": "`` and
    then the child's own output on the same line.
    """
    own_indent = "  " * depth
    child_indent = "  " * (depth + 1)

    if node.value is not None:
        print(own_indent + " " + str(node.value))

    for edge_label, subtree in node.branches.items():
        print(child_indent + " " + str(edge_label), end=": ")
        print_decision_tree(subtree, depth + 1)


# Print the gain / split-info / gain-ratio report by calling get_gain
# once for every feature column.
for attribute in feature_names:
    get_gain(attribute)



# Print the tree stored in decision_tree as an indented outline.
print_decision_tree(decision_tree)






Chosen Attribute: Outlook
Information Gain: 0.2467
Selected Tuples:
   Outlook Temperature Humidity   Windy Play
0    Sunny         Hot     High    Weak   No
1    Sunny         Hot     High  Strong   No
7    Sunny        Mild     High    Weak   No
8    Sunny        Cool   Normal    Weak  Yes
10   Sunny        Mild   Normal  Strong  Yes
Chosen Attribute: Humidity
Information Gain: 0.9710
Selected Tuples:
  Outlook Temperature Humidity   Windy Play
0   Sunny         Hot     High    Weak   No
1   Sunny         Hot     High  Strong   No
7   Sunny        Mild     High    Weak   No
Chosen Attribute: Humidity
Information Gain: 0.9710
Selected Tuples:
   Outlook Temperature Humidity   Windy Play
8    Sunny        Cool   Normal    Weak  Yes
10   Sunny        Mild   Normal  Strong  Yes
Chosen Attribute: Outlook
Information Gain: 0.2467
Selected Tuples:
     Outlook Temperature Humidity   Windy Play
2   Overcast         Hot     High    Weak  Yes
6   Overcast        Cool   Normal  Strong  Yes
11  