In [None]:
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt 
from mglearn.datasets import make_wave
import pandas as pd
import matplotlib as mpl
from sklearn.tree import DecisionTreeRegressor
import mglearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
from sklearn.datasets import fetch_openml

In [14]:
'''
Q1) Analyze the performance of decision tree with respect to accuracy, recall, precision, FPR, and ROC
   metrics for iris and wine datasets. Hint: the datasets can be loaded using sklearn.datasets.load
   function.
'''

def compute_metrics(loader, random_state):
    data = loader()
    X, y = data.data, data.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # training
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    
    return accuracy, precision, recall, fpr, roc_auc


# Compute metrics for Iris and Wine datasets
iris_metrics = compute_metrics(load_iris, random_state=42)
wine_metrics = compute_metrics(load_wine, random_state=0)

# Aggregate results into DataFrame
df = pd.DataFrame({
    'accuracy': [iris_metrics[0], wine_metrics[0]],
    'precision': [iris_metrics[1], wine_metrics[1]],
    'recall': [iris_metrics[2], wine_metrics[2]],
    'fpr': [iris_metrics[3], wine_metrics[3]],
    'roc_auc': [iris_metrics[4], wine_metrics[4]],
}, index=['iris', 'wine'])

# Round numeric metrics to 3 decimals for readability
df[['accuracy', 'precision', 'recall', 'roc_auc']] = df[['accuracy', 'precision', 'recall', 'roc_auc']].round(3)
# Convert FPR arrays to string representation without truncation
df['fpr'] = df['fpr'].apply(lambda arr: np.array2string(np.array(arr), separator=','))

# Display as a proper markdown table
print("Decision Tree Performance Metrics:\n")
print(df.to_markdown())


Decision Tree Performance Metrics:

|      |   accuracy |   precision |   recall | fpr                                           |   roc_auc |
|:-----|-----------:|------------:|---------:|:----------------------------------------------|----------:|
| iris |      1     |       1     |    1     | [0.        ,0.52380952,0.52380952,1.        ] |     0.476 |
| wine |      0.972 |       0.978 |    0.979 | [0. ,0.3,0.3,1. ]                             |     0.678 |


In [16]:
''' 
Q2) Analyze the impact on accuracy of training and testing for iris and wine datasets due to the unpruned
tree and tree depth of 3, 4, and 5.
'''

def compute_train_test_accuracy(loader, random_state, max_depth=None):
    ''' Function to compute training and testing accuracy for a given depth '''
    data = loader()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    
    # Train Decision Tree Classifier
    model = DecisionTreeClassifier(random_state=0, max_depth=max_depth)
    model.fit(X_train, y_train)
  
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    
    return train_acc, test_acc

# Define depths
depths = [None, 3, 4, 5]
labels = ['unpruned'] + [str(d) for d in depths if d is not None]

# Prepare DataFrame to collect results
results = []
for name, loader, rs in [('iris', load_iris, 42), ('wine', load_wine, 0)]:
    for depth in depths:
        train_acc, test_acc = compute_train_test_accuracy(loader, rs, max_depth=depth)
        results.append({
            'dataset': name,
            'max_depth': 'unpruned' if depth is None else depth,
            'train_accuracy': round(train_acc, 3),
            'test_accuracy': round(test_acc, 3)
        })


df = pd.DataFrame(results)
# Pivot for readability
pivot_df = df.pivot(index='dataset', columns='max_depth', values=['train_accuracy', 'test_accuracy'])

# Display overall results
print("Decision Tree Train vs. Test Accuracies for Different Depths:\n")
print(pivot_df.to_markdown())


Decision Tree Train vs. Test Accuracies for Different Depths:

| dataset   |   ('train_accuracy', 3) |   ('train_accuracy', 4) |   ('train_accuracy', 5) |   ('train_accuracy', 'unpruned') |   ('test_accuracy', 3) |   ('test_accuracy', 4) |   ('test_accuracy', 5) |   ('test_accuracy', 'unpruned') |
|:----------|------------------------:|------------------------:|------------------------:|---------------------------------:|-----------------------:|-----------------------:|-----------------------:|--------------------------------:|
| iris      |                   0.958 |                   0.975 |                   0.992 |                                1 |                  1     |                  1     |                  1     |                           1     |
| wine      |                   0.993 |                   1     |                   1     |                                1 |                  0.972 |                  0.972 |                  0.972 |                           0

In [None]:

def mean_relative_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))

def load_wave():
    # make_wave from mglearn.datasets
    X, y = make_wave()
    return X.reshape(-1, 1), y


def load_ram_prices():
    # Load RAM prices CSV from mglearn.datasets.DATA_PATH
    ram_prices = pd.read_csv(
        os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
    )
    # Convert date to ordinal number for regression
    ram_prices['date'] = pd.to_datetime(ram_prices['date'])
    X = ram_prices['date'].map(pd.Timestamp.toordinal).values.reshape(-1, 1)
    y = ram_prices['price'].values
    return X, y


def load_boston():
    # fetch_openml from sklearn.datasets
    boston = fetch_openml(name='boston', version=1, as_frame=True)
    X = boston.data.values
    y = boston.target.astype(float).values
    return X, y


datasets = {
    'wave': load_wave,
    'ram_prices': load_ram_prices,
    'boston_housing': load_boston
}

# Store results
results = []

for name, loader in datasets.items():
    
    X, y = loader()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )
    
    # Linear Regression
    lin = LinearRegression()
    lin.fit(X_train, y_train)
    y_lin = lin.predict(X_test)
    r2_lin = r2_score(y_test, y_lin)
    mre_lin = mean_relative_error(y_test, y_lin)

    # Decision Tree Regression
    tree = DecisionTreeRegressor(random_state=0)
    tree.fit(X_train, y_train)
    y_tree = tree.predict(X_test)
    r2_tree = r2_score(y_test, y_tree)
    mre_tree = mean_relative_error(y_test, y_tree)

    # Append to results
    results.append((name, 'LinearRegression', round(r2_lin, 3), round(mre_lin, 3)))
    results.append((name, 'DecisionTree',    round(r2_tree, 3), round(mre_tree, 3)))


df = pd.DataFrame(results, columns=['Dataset', 'Model', 'R2', 'MeanRelativeError'])
pivot_df = df.pivot(index='Dataset', columns='Model', values=['R2', 'MeanRelativeError'])
print("Comparison of Linear vs. Tree Regression (R2 & Mean Relative Error):\n")
print(pivot_df.to_markdown())


Comparison of Linear vs. Tree Regression (R2 & Mean Relative Error):

| Dataset        |   ('R2', 'DecisionTree') |   ('R2', 'LinearRegression') |   ('MeanRelativeError', 'DecisionTree') |   ('MeanRelativeError', 'LinearRegression') |
|:---------------|-------------------------:|-----------------------------:|----------------------------------------:|--------------------------------------------:|
| boston_housing |                    0.832 |                        0.688 |                             0.131       |                                 0.177       |
| ram_prices     |                   -0.074 |                       -0.074 |                             4.62121e+07 |                                 4.62121e+07 |
| wave           |                    0.613 |                        0.623 |                             1.097       |                                 0.809       |
