## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

## Helper Functions - For plotting 

In [2]:
# to visualize the decision tree of a scikit-learn model
from util import visualize_tree_boundries

In [3]:
def plot2D(
    X,
    y=None,
    feature_names=['Length(cm)', 'Weight(kg)'],
    target_names=['diamondfin', 'bubblefish', 'crosstail'],
    title=None,
    vertical_boundry=0,
    horizontal_boundry=0
):
    """
    Scatter-plot the first two feature columns, optionally coloured by class,
    and overlay one vertical and one horizontal dashed boundary line.

    Parameters
    ----------
    X : array-like or pd.DataFrame
        Feature matrix with at least two columns. Column 0 goes on the
        x-axis and column 1 on the y-axis.
    y : array-like, pd.Series, pd.DataFrame, or None
        Integer class labels. Points with label ``i`` are drawn in the style
        and legend entry of ``target_names[i]``. If None, every point is
        drawn in a single unlabelled series and no legend is shown.
    feature_names : list of str
        Axis labels; ``feature_names[0]`` for x, ``feature_names[1]`` for y.
    target_names : list of str
        Display name for each class, indexed by integer label.
    title : str or None
        Plot title.
    vertical_boundry : float
        x-position of the vertical dashed line (drawn over the full height).
    horizontal_boundry : float
        y-position of the horizontal dashed line (drawn only from the
        vertical line to the right edge of the plot).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # np.asarray accepts DataFrames, Series, lists and arrays alike, so the
    # boolean masks below behave the same regardless of the container type.
    X = np.asarray(X)
    if y is not None:
        y = np.ravel(np.asarray(y))

    colors = ['red', 'blue', 'orange']
    markers = ['d', 'o', 'x']

    fig, ax = plt.subplots(figsize=(6, 4))

    if y is None:
        ax.scatter(X[:, 0], X[:, 1])
    else:
        for i, target_name in enumerate(target_names):
            mask = y == i
            ax.scatter(
                X[mask, 0],
                X[mask, 1],
                label=f"{target_name} - Label {i}",
                # Cycle through the palettes if there are more classes than styles.
                color=colors[i % len(colors)],
                marker=markers[i % len(markers)]
            )

    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])
    ax.set_title(title)

    # Only build a legend when something is labelled; otherwise matplotlib
    # emits a "no artists with labels" warning (e.g. when y is None).
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()

    # Draw vertical boundary (full height)
    ax.axvline(x=vertical_boundry, color='red', linestyle='--')

    # Draw horizontal boundary only to the right of the vertical line
    x_right = ax.get_xlim()[1]
    ax.hlines(y=horizontal_boundry, xmin=vertical_boundry, xmax=x_right, color='blue', linestyle='--')

    # Pin the right limit: adding the hlines segment re-triggers autoscaling,
    # which would otherwise push the edge past the end of the line.
    ax.set_xlim(left=0, right=x_right)
    ax.set_ylim(bottom=0)
    fig.tight_layout()
    plt.show()


## 1. Load Data

#### Load the `data.csv` file with the pandas `read_csv` function

In [None]:
# Read the dataset into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

#### Assign the features (`Length(cm)` and `Weight(kg)`) to variable `X` and the target to variable `y`

In [None]:
# Feature matrix: the two measurement columns. Target vector: the label column.
X = df.loc[:, ['Length(cm)', 'Weight(kg)']]
y = df.loc[:, 'Label']

In [8]:
# Axis labels for the two feature columns, in column order.
feature_names = ['Length(cm)', 'Weight(kg)']
# Class display names; plot2D uses the list index as the integer label.
target_names = ['diamondfin', 'bubblefish', 'crosstail']

## Another illustration of Machine Learning Process



<div style="text-align: center;">
    <img src="https://raw.githubusercontent.com/PyDataGBC/PyData2025/refs/heads/main/Lab_Week12_DecisionTree/assets/ml_flow.webp" width="400" height="450" />
</div>

### Plot X and y scatter plot 

#### Note: you can use `plot2D` helper function

In [None]:
# Scatter the whole dataset, coloured by class.
plot2D(
    X,
    y=y,
    feature_names=feature_names,
    target_names=target_names,
    title='Fish Dataset',
)

## 2. Split Data into Train and Test Sets

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 3. Design your baseline model 

### What is the accuracy of your random guessing model?

In [None]:
# Guessing uniformly at random among k classes is correct with probability 1/k,
# so derive the value from the class list instead of hard-coding 3.
1 / len(target_names)

### Design a better base model with KNN (`n_neighbors=3`)

In [None]:
# Baseline: a 3-nearest-neighbours classifier fitted on the training split.
model_base = KNeighborsClassifier(n_neighbors=3)
model_base.fit(X=X_train, y=y_train)

# Score the baseline on the held-out test split.
y_pred_base = model_base.predict(X=X_test)
accuracy_base = accuracy_score(y_true=y_test, y_pred=y_pred_base)
print(f"Baseline KNN Model Accuracy: {accuracy_base}")

**This is the model you want to beat with your Decision Tree implementation (maybe).**

# 4. Decision Tree From Scratch

<img src="https://raw.githubusercontent.com/PyDataGBC/PyData2025/refs/heads/main/Lab_Week12_DecisionTree/assets/decisiontree_visual.jpg" width="600" hight="200" /> 

### High level instructions

- ✅ Split data into train and test sets with `test_size=0.3`.
- ✅ Design a KNN classifier to serve as your baseline model.
- Plot the **training data** and identify decision boundaries.

    - Use the `plot2D` helper function and decide on `length_threshold` and `weight_threshold` values by looking at the data.

- Display the training data and decision boundaries on the same plot.

    - Use the `plot2D` helper function again, and pass `length_threshold` as `vertical_boundry` and `weight_threshold` as `horizontal_boundry`.

- Write the logic of your decision tree with `if-else` statements.

In [None]:
# Scatter only the training split; thresholds are picked from this view.
plot2D(
    X_train,
    y=y_train,
    feature_names=feature_names,
    target_names=target_names,
    title='Training Data',
)

In [None]:
# Split points for the hand-written tree: length (x-axis) first, then weight (y-axis).
length_threshold, weight_threshold = 85, 3.5

In [None]:
# Same training scatter, now with the two chosen thresholds drawn as dashed lines.
plot2D(
    X_train,
    y=y_train,
    feature_names=feature_names,
    target_names=target_names,
    title='Training Data with Decision Boundaries',
    vertical_boundry=length_threshold,
    horizontal_boundry=weight_threshold,
)

In [None]:
def my_decision_tree(X_new):
    """
    Classify samples with a hand-written two-level decision tree.

    The split points are read from the notebook-level variables
    ``length_threshold`` and ``weight_threshold``, which must be defined
    before this function is called.

    Rules
    -----
    - length <  length_threshold                              -> label 0
    - length >= length_threshold and weight <  weight_threshold -> label 2
    - length >= length_threshold and weight >= weight_threshold -> label 1

    Parameters
    ----------
    X_new : pd.DataFrame or array-like of shape (n_samples, 2+)
        Samples to classify. Columns are selected by position:
        column 0 is the length, column 1 is the weight.

    Returns
    -------
    np.ndarray of int, shape (n_samples,)
        Predicted label for every row of ``X_new``.
    """
    if isinstance(X_new, pd.DataFrame):
        # Positional selection so the result does not depend on column names.
        length = X_new.iloc[:, 0].to_numpy()
        weight = X_new.iloc[:, 1].to_numpy()
    else:
        samples = np.asarray(X_new)
        if samples.size == 0:
            # An empty list becomes a 1-D array that cannot be column-indexed.
            return np.array([], dtype=int)
        length, weight = samples[:, 0], samples[:, 1]

    # Evaluate the inner (weight) split for every row, then let the outer
    # (length) split override it where the sample falls on the short side.
    long_side_label = np.where(weight < weight_threshold, 2, 1)
    return np.where(length < length_threshold, 0, long_side_label)

## 5. Evaluate your model

#### - Make predictions for the test data using the `my_decision_tree` function
#### - Use `plot2D` to plot `X_test` once with the `y_test` labels and once with the `y_pred` labels

#### Can you see the difference, especially in the overlapping areas?

#### - Calculate the accuracy of your model using `accuracy_score` function from sklearn.metrics

In [None]:
# Predict with the hand-written tree, then draw the test points twice:
# first with their true labels, then with the predicted ones.
y_pred = my_decision_tree(X_new=X_test)
plot2D(
    X_test,
    y=y_test,
    feature_names=feature_names,
    target_names=target_names,
    title='Test Data - Actual Labels',
)
plot2D(
    X_test,
    y=y_pred,
    feature_names=feature_names,
    target_names=target_names,
    title='Test Data - Predicted Labels',
)

In [None]:
# Fraction of test samples the hand-written tree labels correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"My Decision Tree Model Accuracy: {accuracy}")

# 6. Decision Tree with Sklearn

#### Training step

In [None]:
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree can differ between runs when several splits score equally.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Draw the fitted tree as a node diagram; the large canvas keeps node text legible.
plt.figure(figsize=(16, 10))
plot_tree(
    decision_tree=model,
    feature_names=feature_names,
    class_names=target_names,
    filled=True,
    fontsize=10,
)
plt.show()

In [None]:
# Visualize the decision boundaries of the fitted tree over the training data;
# the drawing itself is delegated to the helper imported from util.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
visualize_tree_boundries(model, X_train, y_train, ax=ax)

#### Testing the model

In [None]:
# Score the scikit-learn tree on the held-out test split.
y_pred = model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Decision Tree Model Accuracy: {accuracy}")

#### Confusion Matrix: identify which labels are predicted correctly and which are not


In [None]:
# Rows are the true labels, columns the predicted labels.
# (confusion_matrix is already imported in the notebook's import cell.)
confusion_matrix(y_test, y_pred)

#### Repeat the process above with `DecisionTreeClassifier(max_depth=2)`

In [None]:
# Depth-limited tree. Seeded because scikit-learn permutes the features at
# every split, so an unseeded fit is not guaranteed to be repeatable.
model2 = DecisionTreeClassifier(max_depth=2, random_state=42)
model2.fit(X_train, y_train)

In [None]:
# Node diagram of the depth-2 tree.
plt.figure(figsize=(16, 10))
plot_tree(
    decision_tree=model2,
    feature_names=feature_names,
    class_names=target_names,
    filled=True,
    fontsize=10,
)
plt.show()

In [None]:
# Decision boundaries of the depth-2 tree over the training data,
# drawn by the helper imported from util.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
visualize_tree_boundries(model2, X_train, y_train, ax=ax)

In [None]:
# Score the depth-2 tree on the test split, then show its confusion matrix
# as the cell output.
y_pred2 = model2.predict(X=X_test)
accuracy2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print(f"Decision Tree (max_depth=2) Accuracy: {accuracy2}")
confusion_matrix(y_true=y_test, y_pred=y_pred2)

#### Find the optimal `max_depth` value for your model using a loop from 1 to 5

In [None]:
# Fit one tree per candidate depth, record its test accuracy, and report the
# best depth explicitly instead of leaving it to be read off the printout.
# NOTE: as the exercise asks, depths are compared on the test split; outside
# this lab, choose hyperparameters with a validation split or cross-validation
# so the test score stays an unbiased estimate.
depth_scores = {}
for depth in range(1, 6):
    # Seeded so that repeated runs compare the same trees.
    model_temp = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model_temp.fit(X_train, y_train)
    y_pred_temp = model_temp.predict(X_test)
    accuracy_temp = accuracy_score(y_test, y_pred_temp)
    depth_scores[depth] = accuracy_temp
    print(f"max_depth={depth}, Accuracy: {accuracy_temp}")

# max() returns the first maximal key, i.e. the shallowest tree among ties.
best_depth = max(depth_scores, key=depth_scores.get)
print(f"Best max_depth={best_depth}, Accuracy: {depth_scores[best_depth]}")

### Practice Questions

 1.  What are the advantages and disadvantages of Decision Trees?
 2. What is the `confusion matrix` and how is it used?
 3. How do you `evaluate` the performance of a machine learning model?
 4. What is the purpose of `hyperparameter tuning`?