# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, fetch_openml
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold
import warnings

# Silence every warning category so the printed result tables stay uncluttered.
# NOTE: this is process-wide and also hides deprecation/data-conversion warnings.
warnings.simplefilter("ignore")

# ---------------------------------------------------------------
# 1. Iris: decision-tree accuracy as a function of minimum leaf size
# ---------------------------------------------------------------

# Shuffled 10-fold splitter; the fixed seed keeps the folds identical across runs.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Bundled Iris data: feature matrix and class labels.
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Leaf-size thresholds to sweep, given as percentages of the training samples.
iris_nmin_values = [5, 10, 15, 20]

# One summary row (a dict) is collected per threshold.
iris_results = []

for nmin in iris_nmin_values:
    # A float min_samples_leaf is interpreted by scikit-learn as a fraction:
    # each leaf must hold at least ceil(fraction * n_samples) samples, where
    # n_samples is the data the tree is fitted on (here, each training fold).
    min_samples_leaf = nmin / 100

    # The leaf-size constraint acts as the early-stopping rule for tree growth.
    clf = DecisionTreeClassifier(random_state=42, min_samples_leaf=min_samples_leaf)

    # One accuracy value per fold.
    scores = cross_val_score(clf, X_iris, y_iris, scoring='accuracy', cv=cv)

    # Summarise the folds by their mean and (population) standard deviation.
    iris_results.append({
        'nmin (%)': nmin,
        'Mean Accuracy': scores.mean(),
        'Std Accuracy': scores.std(),
    })

# Tabulate the sweep and print it under a heading.
iris_results_df = pd.DataFrame(iris_results)
print("Iris Dataset Results:", iris_results_df, sep="\n")

# ---------------------------------------------------------------
# 2. Spambase: decision-tree accuracy as a function of minimum leaf size
# ---------------------------------------------------------------

# Spambase is downloaded from OpenML (version 1), so this step needs network
# access unless scikit-learn already has the dataset cached locally.
spambase = fetch_openml('spambase', version=1, as_frame=True)
X_spambase = spambase.data

# The target may arrive as string/categorical labels; cast it to integer labels.
y_spambase = spambase.target.astype(int)

# Leaf-size thresholds to sweep, given as percentages of the training samples.
spambase_nmin_values = [5, 10, 15, 20, 25]

# One summary row (a dict) is collected per threshold.
spambase_results = []

for nmin in spambase_nmin_values:
    # Percentage -> fraction; scikit-learn treats a float min_samples_leaf as
    # a fraction of the samples the tree is fitted on.
    min_samples_leaf = nmin / 100

    clf = DecisionTreeClassifier(random_state=42, min_samples_leaf=min_samples_leaf)

    # Reuses the `cv` splitter defined earlier in the script.
    scores = cross_val_score(clf, X_spambase, y_spambase, scoring='accuracy', cv=cv)

    # Summarise the folds by their mean and (population) standard deviation.
    spambase_results.append({
        'nmin (%)': nmin,
        'Mean Accuracy': scores.mean(),
        'Std Accuracy': scores.std(),
    })

# Tabulate the sweep and print it under a heading (preceded by a blank line).
spambase_results_df = pd.DataFrame(spambase_results)
print("\nSpambase Dataset Results:", spambase_results_df, sep="\n")
