In [10]:
import numpy as np
from scipy.spatial import distance

# Define the data objects
p1 = np.array([7, 3])
p2 = np.array([4, 9])
p3 = np.array([1, 5])
p4 = np.array([3, 2])

# (1) Calculate the Manhattan distance between P1 and P2
md_p1_p2 = np.sum(np.abs(p1 - p2))
print(f"Manhattan distance between P1 and P2: {md_p1_p2}")

# (2) Calculate the Euclidean distance between P2 and P3
ed_p2_p3 = np.linalg.norm(p2 - p3)
print(f"Euclidean distance between P2 and P3: {ed_p2_p3}")

# (3) Calculate the Cosine similarity between P1 and P3
cs_p1_p3 = 1 - distance.cosine(p1, p3)
print(f"Cosine similarity between P1 and P3: {cs_p1_p3}")

# (4) Normalize the data objects
d = np.array([p1, p2, p3, p4])

# Normalize the first attribute by Z-score technique
m_fa = np.mean(d[:, 0])
s_fa = np.std(d[:, 0])
d[:, 0] = (d[:, 0] - m_fa) / s_fa

# Normalize the second attribute by Min-max technique to interval [0, 1]
mi_sa = np.min(d[:, 1])
ma_sa = np.max(d[:, 1])
d[:, 1] = (d[:, 1] - mi_sa) / (ma_sa - mi_sa)

print("Normalized data objects:")
print(d)

Manhattan distance between P1 and P2: 9
Euclidean distance between P2 and P3: 5.0
Cosine similarity between P1 and P3: 0.5665288228870652
Normalized data objects:
[[ 1  0]
 [ 0  1]
 [-1  0]
 [ 0  0]]


In [9]:
import pandas as pd
import numpy as np

# Build the dataset
d = {
    'X': [0, 0, 2, 2, 0, 1, 1, 1, 1],
    'Y': [1, 1, 0, 0, 0, 0, 0, 1, 1],
    'Z': [1, 0, 1, 2, 0, 1, 0, 1, 0],
    'dog': [10, 15, 30, 25, 5, 5, 0, 25, 20],
    'cat': [20, 15, 5, 0, 15, 20, 15, 10, 5]
}

df = pd.DataFrame(d)
df['total'] = df['dog'] + df['cat']

# Calculate the Gini impurity of the root node
td = df['dog'].sum()
tc = df['cat'].sum()
t = td + tc
pd_ = td / t
pc_ = tc / t
gr = 1 - (pd_**2 + pc_**2)
print(f"Root node Gini impurity: {gr:.4f}")

# Function to calculate Gini impurity
def gi(dog, cat):
    t = dog + cat
    if t == 0:
        return 0
    pd_ = dog / t
    pc_ = cat / t
    return 1 - (pd_**2 + pc_**2)

# (2) Split by attribute X and calculate collective Gini impurity of child nodes
gx = df.groupby('X').sum()
gx_g = 0
for i, r in gx.iterrows():
    g = gi(r['dog'], r['cat'])
    gx_g += (r['total'] / t) * g
    print(f"X={i} Gini impurity: {g:.4f}")
print(f"Weighted Gini impurity for attribute X: {gx_g:.4f}")

# (3) Split by attribute Y and calculate collective Gini impurity of child nodes
gy = df.groupby('Y').sum()
gy_g = 0
for i, r in gy.iterrows():
    g = gi(r['dog'], r['cat'])
    gy_g += (r['total'] / t) * g
    print(f"Y={i} Gini impurity: {g:.4f}")
print(f"Weighted Gini impurity for attribute Y: {gy_g:.4f}")

# (4) Calculate Information Gain and choose the best split attribute
igx = gr - gx_g
igy = gr - gy_g
print(f"Information Gain IG(X): {igx:.4f}")
print(f"Information Gain IG(Y): {igy:.4f}")
bs = 'X' if igx > igy else 'Y'
print(f"Best split attribute: {bs}")

# (5) Build a two-level decision tree and mark class labels in each leaf node
print("\nDecision Tree Structure:")
print("Root node: X")
print("├── X=0: Y")
print("│   ├── Y=0: cat")
print("│   └── Y=1: cat")
print("├── X=1: Y")
print("│   ├── Y=0: cat")
print("│   └── Y=1: dog")
print("└── X=2: dog")

# (6) Calculate the confusion matrix and evaluation metrics for the "dog" class
# Sample prediction based on the decision tree
def p(r):
    if r['X'] == 0:
        return 'cat'
    elif r['X'] == 1:
        if r['Y'] == 1:
            return 'dog'
        else:
            return 'cat'
    elif r['X'] == 2:
        return 'dog'

df['pred'] = df.apply(p, axis=1)

# Calculate confusion matrix components
tp = df[(df['pred'] == 'dog') & (df['dog'] > 0)]['dog'].sum()
fp = df[(df['pred'] == 'dog') & (df['cat'] > 0)]['cat'].sum()
fn = df[(df['pred'] == 'cat') & (df['dog'] > 0)]['dog'].sum()
tn = df[(df['pred'] == 'cat') & (df['cat'] > 0)]['cat'].sum()

print("\nConfusion Matrix:")
print(f"TP (True Positives) = {tp}")
print(f"FP (False Positives) = {fp}")
print(f"FN (False Negatives) = {fn}")
print(f"TN (True Negatives) = {tn}")

# Calculate evaluation metrics
acc = (tp + tn) / t
prec = tp / (tp + fp) if (tp + fp) != 0 else 0
rec = tp / (tp + fn) if (tp + fn) != 0 else 0
f1 = 2 * prec * rec / (prec + rec) if (prec + rec) != 0 else 0

print(f"\nAccuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-Score: {f1:.4f}")

Root node Gini impurity: 0.4922
X=0 Gini impurity: 0.4688
X=1 Gini impurity: 0.5000
X=2 Gini impurity: 0.1528
Weighted Gini impurity for attribute X: 0.4028
Y=0 Gini impurity: 0.4965
Y=1 Gini impurity: 0.4861
Weighted Gini impurity for attribute Y: 0.4913
Information Gain IG(X): 0.0894
Information Gain IG(Y): 0.0009
Best split attribute: X

Decision Tree Structure:
Root node: X
├── X=0: Y
│   ├── Y=0: cat
│   └── Y=1: cat
├── X=1: Y
│   ├── Y=0: cat
│   └── Y=1: dog
└── X=2: dog

Confusion Matrix:
TP (True Positives) = 100
FP (False Positives) = 20
FN (False Negatives) = 35
TN (True Negatives) = 85

Accuracy: 0.7708
Precision: 0.8333
Recall: 0.7407
F1-Score: 0.7843
