# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Iris Species Classification using Decision Tree Classifier
# Author: ML Practitioner
# Dataset: Iris Species Dataset from scikit-learn

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size and font size applied to the plots in this script
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

def load_and_explore_data():
    """
    Load the Iris dataset and print an initial exploration summary.

    Prints the DataFrame shape, the feature names, the target class names,
    the first five rows, the ``DataFrame.info()`` report and the number of
    rows per species.

    Returns:
        tuple: ``(df, iris)`` where ``df`` is a DataFrame with the dataset's
        feature columns plus ``species`` (the integer target) and
        ``species_name`` (the readable label), and ``iris`` is the object
        returned by ``load_iris()``.
    """
    print("=" * 50)
    print("STEP 1: LOADING AND EXPLORING THE DATASET")
    print("=" * 50)

    # Load the Iris dataset
    iris = load_iris()

    # Create a DataFrame for easier manipulation
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['species'] = iris.target

    # Map target numbers to species names; the names come from the dataset
    # itself so the mapping cannot drift from the loaded data.
    species_names = {code: str(name) for code, name in enumerate(iris.target_names)}
    df['species_name'] = df['species'].map(species_names)

    print(f"Dataset shape: {df.shape}")
    print(f"Features: {iris.feature_names}")
    print(f"Target classes: {iris.target_names}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset info:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()

    print("\nClass distribution:")
    print(df['species_name'].value_counts())

    return df, iris

def preprocess_data(df):
    """
    Preprocess the data: handle missing values and select features/target.

    Missing values in numeric columns are replaced by that column's median.
    The replacement is written back into ``df`` itself, so the caller's
    DataFrame is modified when it contains missing numeric values.

    Args:
        df: DataFrame containing the four Iris measurement columns
            ('sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
            'petal width (cm)') and an integer-encoded 'species' column.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the DataFrame of the four feature
        columns and ``y`` is the 'species' Series.

    Raises:
        KeyError: if any of the required columns is absent from ``df``.
    """
    print("\n" + "=" * 50)
    print("STEP 2: DATA PREPROCESSING")
    print("=" * 50)

    # Check for missing values
    print("Missing values per column:")
    missing_values = df.isnull().sum()
    print(missing_values)

    if missing_values.sum() == 0:
        print("✓ No missing values found in the dataset!")
    else:
        print("⚠ Missing values detected. Handling missing values...")
        # For numerical features, fill with median
        numerical_cols = df.select_dtypes(include=[np.number]).columns
        for col in numerical_cols:
            if missing_values[col] > 0:
                # Assign the filled column back explicitly. The chained form
                # df[col].fillna(..., inplace=True) operates on a temporary
                # object and does not update df under pandas Copy-on-Write.
                df[col] = df[col].fillna(df[col].median())
                print(f"Filled missing values in {col} with median")

    # Prepare features and target
    feature_columns = ['sepal length (cm)', 'sepal width (cm)',
                      'petal length (cm)', 'petal width (cm)']
    X = df[feature_columns]
    y = df['species']  # Already encoded as 0, 1, 2

    print(f"\nFeatures shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Target classes: {np.unique(y)}")

    # Display basic statistics
    print("\nFeature statistics:")
    print(X.describe())

    return X, y

def train_decision_tree(X_train, y_train):
    """
    Fit a Decision Tree Classifier on the training split and report on it.

    Prints the hyperparameters in use and, after fitting, each feature's
    importance in descending order.

    Args:
        X_train: training features; must expose ``.columns`` (the column
            names are used to label the importances).
        y_train: training targets.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("STEP 3: TRAINING DECISION TREE CLASSIFIER")
    print(banner)

    # Fixed seed for reproducibility; the depth limit and minimum
    # split/leaf sizes are there to limit overfitting.
    hyperparams = {
        'random_state': 42,
        'max_depth': 4,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
    }
    dt_classifier = DecisionTreeClassifier(**hyperparams)

    print("Decision Tree parameters:")
    print(f"- Max depth: {dt_classifier.max_depth}")
    print(f"- Min samples split: {dt_classifier.min_samples_split}")
    print(f"- Min samples leaf: {dt_classifier.min_samples_leaf}")
    print(f"- Random state: {dt_classifier.random_state}")

    # Fit on the training data
    print("\nTraining the model...")
    dt_classifier.fit(X_train, y_train)
    print("✓ Model training completed!")

    # Rank the features from most to least important
    ranking = pd.DataFrame({
        'feature': X_train.columns,
        'importance': dt_classifier.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance:")
    for name, score in zip(ranking['feature'], ranking['importance']):
        print(f"{name}: {score:.4f}")

    return dt_classifier

def _weighted_scores(y_true, y_pred):
    """Return (accuracy, weighted precision, weighted recall) for one split."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
    )


def evaluate_model(model, X_test, y_test, X_train, y_train):
    """
    Evaluate the model using accuracy, precision, and recall.

    Prints the metrics for the training and test splits, a per-class
    classification report and the confusion matrix for the test split.

    Args:
        model: fitted classifier providing ``predict``; its ``classes_``
            attribute, when present, fixes the label order of the report
            and of the confusion matrix.
        X_test, y_test: held-out features and targets.
        X_train, y_train: training features and targets.

    Returns:
        dict: the keys 'train_accuracy', 'test_accuracy', 'train_precision',
        'test_precision', 'train_recall', 'test_recall', 'confusion_matrix'
        and 'y_pred_test'.
    """
    print("\n" + "=" * 50)
    print("STEP 4: MODEL EVALUATION")
    print("=" * 50)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Precision and recall are averaged over classes weighted by support
    train_accuracy, train_precision, train_recall = _weighted_scores(y_train, y_pred_train)
    test_accuracy, test_precision, test_recall = _weighted_scores(y_test, y_pred_test)

    print("TRAINING SET PERFORMANCE:")
    print(f"Accuracy:  {train_accuracy:.4f}")
    print(f"Precision: {train_precision:.4f}")
    print(f"Recall:    {train_recall:.4f}")

    print("\nTEST SET PERFORMANCE:")
    print(f"Accuracy:  {test_accuracy:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Recall:    {test_recall:.4f}")

    target_names = ['setosa', 'versicolor', 'virginica']
    # Pin the label set to the classes the model was trained on. Without
    # this, a class that is absent from the test split shrinks the report
    # and the matrix, and attaching the three names below fails. With
    # labels=None (model without classes_) the library default is used.
    labels = getattr(model, 'classes_', None)

    # Detailed classification report
    print("\nDETAILED CLASSIFICATION REPORT (Test Set):")
    print(classification_report(y_test, y_pred_test,
                                labels=labels, target_names=target_names))

    # Confusion Matrix
    print("CONFUSION MATRIX (Test Set):")
    cm = confusion_matrix(y_test, y_pred_test, labels=labels)
    print(cm)

    # Create a more readable confusion matrix
    cm_df = pd.DataFrame(cm,
                        index=['Actual ' + name for name in target_names],
                        columns=['Predicted ' + name for name in target_names])
    print("\nConfusion Matrix (Readable Format):")
    print(cm_df)

    return {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_precision': train_precision,
        'test_precision': test_precision,
        'train_recall': train_recall,
        'test_recall': test_recall,
        'confusion_matrix': cm,
        'y_pred_test': y_pred_test
    }

def visualize_results(model, X, results):
    """
    Plot the feature importances and the test-set confusion matrix.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X: feature table whose ``.columns`` label the importance bars.
        results: mapping that holds the matrix under 'confusion_matrix'.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("STEP 5: VISUALIZATIONS")
    print(divider)

    # --- Horizontal bar chart of feature importances (sorted ascending) ---
    ranked = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=True)

    plt.figure(figsize=(10, 6))
    plt.barh(ranked['feature'], ranked['importance'])
    plt.title('Feature Importance in Decision Tree')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

    # --- Annotated heatmap of the confusion matrix ---
    species_labels = ['setosa', 'versicolor', 'virginica']
    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': species_labels,
        'yticklabels': species_labels,
    }
    plt.figure(figsize=(8, 6))
    sns.heatmap(results['confusion_matrix'], **heatmap_options)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()

    print("✓ Visualizations created!")

def visualize_decision_tree(model, feature_names, class_names):
    """
    Visualize the decision tree structure.

    Draws the tree (best effort), prints its text representation and basic
    structure statistics, and prints the prediction for one sample flower.

    Args:
        model: fitted decision tree classifier.
        feature_names: sequence of the feature names (any iterable, e.g. a
            list or a pandas Index); the sample prediction below assumes
            four features.
        class_names: sequence of class names, indexed by the predicted
            class value.

    Returns:
        str: the text representation produced by ``export_text``.
    """
    print("\n" + "=" * 50)
    print("STEP 6: DECISION TREE VISUALIZATION")
    print("=" * 50)

    # Normalise once to plain lists and use them for every call below.
    # NOTE(review): some scikit-learn releases appear to reject non-list
    # sequences (such as a pandas Index) for these parameters - confirm
    # against the installed version.
    feature_names = list(feature_names)
    class_names = list(class_names)

    fig = None
    try:
        # Method 1: Graphical tree visualization
        fig = plt.figure(figsize=(20, 12))
        plot_tree(model,
                  feature_names=feature_names,
                  class_names=class_names,
                  filled=True,
                  rounded=True,
                  fontsize=10)
        plt.title('Decision Tree Visualization', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()
        print("✓ Graphical tree visualization created!")

    except Exception as e:
        # Plotting is best effort; the text output below does not depend on
        # it. Close the half-built figure so it does not stay open.
        if fig is not None:
            plt.close(fig)
        print(f"⚠ Could not create graphical visualization: {e}")
        print("This might happen in some environments. Text representation will still work.")

    # Method 2: Text-based tree representation
    print("\nTEXT REPRESENTATION OF DECISION TREE:")
    print("-" * 50)
    tree_rules = export_text(model,
                            feature_names=feature_names,
                            class_names=class_names)
    print(tree_rules)

    # Method 3: Tree structure analysis
    print("\nTREE STRUCTURE ANALYSIS:")
    print("-" * 50)
    print(f"Tree depth: {model.tree_.max_depth}")
    print(f"Number of nodes: {model.tree_.node_count}")
    print(f"Number of leaves: {model.tree_.n_leaves}")

    # Show decision path for a sample
    print("\nSAMPLE DECISION PATH:")
    print("-" * 50)
    print("Example: For a flower with features [5.1, 3.5, 1.4, 0.2]")
    # Create sample with proper feature names to avoid warning
    sample_data = [[5.1, 3.5, 1.4, 0.2]]
    sample = pd.DataFrame(sample_data, columns=feature_names)
    prediction = model.predict(sample)
    prediction_proba = model.predict_proba(sample)

    print(f"Predicted class: {class_names[prediction[0]]}")
    print("Class probabilities:")
    for i, prob in enumerate(prediction_proba[0]):
        print(f"  {class_names[i]}: {prob:.4f}")

    return tree_rules

def main():
    """
    Main function to execute the complete pipeline.

    Runs, in order: data loading, preprocessing, a stratified 80/20
    train/test split, training, evaluation, the performance plots and the
    tree visualization, then prints a summary of the test metrics.

    Any exception raised along the way is reported (message plus
    traceback) and is not propagated to the caller.
    """
    import traceback

    print("IRIS SPECIES CLASSIFICATION USING DECISION TREE")
    print("=" * 60)

    try:
        # Step 1: Load and explore data
        df, iris_dataset = load_and_explore_data()

        # Step 2: Preprocess data
        X, y = preprocess_data(df)

        # Split the data into training and testing sets
        print("\nSplitting data into train/test sets (80/20 split)...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Step 3: Train the model
        model = train_decision_tree(X_train, y_train)

        # Step 4: Evaluate the model
        results = evaluate_model(model, X_test, y_test, X_train, y_train)

        # Step 5: Create performance visualizations
        visualize_results(model, X, results)

        # Step 6: Visualize the decision tree structure, using the class
        # names of the loaded dataset as plain Python strings
        target_names = [str(name) for name in iris_dataset.target_names]
        visualize_decision_tree(model, X.columns, target_names)

        # Final summary
        print("\n" + "=" * 60)
        print("FINAL SUMMARY")
        print("=" * 60)
        print("✓ Successfully trained Decision Tree Classifier")
        print(f"✓ Test Accuracy: {results['test_accuracy']:.4f}")
        print(f"✓ Test Precision: {results['test_precision']:.4f}")
        print(f"✓ Test Recall: {results['test_recall']:.4f}")

        if results['test_accuracy'] > 0.9:
            print("🎉 Excellent performance achieved!")
        elif results['test_accuracy'] > 0.8:
            print("👍 Good performance achieved!")
        else:
            print("⚠ Consider tuning hyperparameters for better performance.")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing, but
        # keep the traceback so the failing step can be located.
        print(f"❌ An error occurred: {str(e)}")
        print("Please check your environment and try again.")
        traceback.print_exc()

# Execute the main function only when this file is run as a script;
# importing it as a module does not start the pipeline.
if __name__ == "__main__":
    main()