<a href="https://colab.research.google.com/github/AdityaTheEmpire/MLLAB/blob/main/MLEXP3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.feature_selection import mutual_info_classif
from collections import Counter


def id3(df, target_attribute, attribute_names, default_class=None):
    """Build an ID3-style decision tree as nested dictionaries.

    At each node the attribute with the highest mutual information with the
    target (as estimated by ``mutual_info_classif`` with
    ``discrete_features=True``) is chosen as the split, and the function
    recurses on every observed value of that attribute.

    Args:
        df: DataFrame holding the attribute columns and the target column.
            The attribute columns are passed straight to
            ``mutual_info_classif``, so they are expected to be numerically
            encoded already.
        target_attribute: Name of the column holding the class label.
        attribute_names: List of column names still available for splitting.
        default_class: Value returned when ``df`` has no rows.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{attribute: {attribute_value: subtree, ...}}``.
    """
    cnt = Counter(df[target_attribute].tolist())

    # No rows at all: nothing to learn from, fall back to the caller's default.
    if not cnt:
        return default_class

    # Pure node: every row carries the same label.
    if len(cnt) == 1:
        return next(iter(cnt))

    # Most frequent label at this node; ties resolve to the label seen first.
    majority_class = cnt.most_common(1)[0][0]

    # Attributes exhausted but labels still mixed: predict the majority class
    # rather than returning an uninformative default (None unless supplied).
    if not attribute_names:
        return majority_class

    gainz = mutual_info_classif(df[attribute_names], df[target_attribute],
                                discrete_features=True)
    # argmax returns the first maximal index, matching list.index(max(...)).
    best_attr = attribute_names[int(gainz.argmax())]

    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    tree = {best_attr: {}}
    for attr_val, data_subset in df.groupby(best_attr):
        # Children inherit this node's majority class as their fallback.
        tree[best_attr][attr_val] = id3(data_subset, target_attribute,
                                        remaining_attribute_names,
                                        majority_class)

    return tree


def classify(tree, sample):
    """Walk a nested-dict decision tree and return the label for ``sample``.

    Args:
        tree: Either a leaf value or a dict ``{attribute: {value: subtree}}``.
        sample: Mapping from attribute name to the sample's value for it.

    Returns:
        The leaf reached by following the sample's attribute values, or
        ``None`` when the sample has a value with no branch in the tree.
    """
    node = tree
    # Descend until we hit something that is not an internal (dict) node.
    while isinstance(node, dict):
        split_attr = next(iter(node))
        branches = node[split_attr]
        observed = sample[split_attr]
        if observed not in branches:
            return None
        node = branches[observed]
    return node


# Toy "play tennis" dataset: four categorical attributes plus the class label.
# NOTE(review): "Overcast" appears as a Temperature value (row index 10) —
# possibly a data-entry slip; confirm before relying on the learned tree.
data = {
    "Outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast", "Sunny",
                "Sunny", "Rain", "Sunny", "Overcast", "Overcast"],
    "Temperature": ["Hot", "Hot", "Hot", "Mild", "Mild", "Mild", "Mild", "Hot", "Mild",
                    "Mild", "Overcast", "Hot", "Hot"],
    "Humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal", "Normal",
                 "High", "High", "High", "Normal", "High"],
    "Windy": ["FALSE", "TRUE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE",
              "FALSE", "FALSE", "TRUE", "TRUE", "FALSE", "TRUE"],
    "PlayTennis": ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "No", "Yes",
                   "Yes", "Yes"]
}
df = pd.DataFrame(data)


# Every column except the target is a candidate split attribute.
attribute_names = df.columns.tolist()
attribute_names.remove("PlayTennis")


# Replace each categorical column with integer codes and remember the
# code -> label mapping so samples can be encoded and predictions decoded.
factor_mappings = {}
for colname in df.select_dtypes("object"):
    df[colname], mapping = df[colname].factorize()
    factor_mappings[colname] = mapping


print("Factorized dataset:")
print(df)


tree = id3(df, "PlayTennis", attribute_names)


print("The tree structure:")
pprint(tree)


new_sample = {
    "Outlook": "Sunny",
    "Temperature": "Hot",
    "Humidity": "High",
    "Windy": "FALSE"
}


# Encode the sample with the same integer codes used for the training data.
new_sample = {
    colname: factor_mappings[colname].get_loc(value)
    for colname, value in new_sample.items()
}


classification = classify(tree, new_sample)
# Decode through the stored mapping instead of assuming which integer code
# factorize() happened to assign to "Yes" and "No".
if classification is None:
    predicted_label = "Unknown"
else:
    predicted_label = factor_mappings["PlayTennis"][classification]
print(f"The classification for the new sample is: {predicted_label}")

Factorized dataset:
    Outlook  Temperature  Humidity  Windy  PlayTennis
0         0            0         0      0           0
1         0            0         0      1           0
2         1            0         0      0           1
3         2            1         0      0           1
4         2            1         1      0           1
5         2            1         1      1           0
6         1            1         1      1           1
7         0            0         1      0           0
8         0            1         0      0           1
9         2            1         0      1           0
10        0            2         0      1           1
11        1            0         1      0           1
12        1            0         0      1           1
The tree structure:
{'Outlook': {0: {'Temperature': {0: 0, 1: 1, 2: 1}},
             1: 1,
             2: {'Windy': {0: 1, 1: 0}}}}
The classification for the new sample is: No
