# Project 1: ML Pipeline - Iris Classifier

> **AI Solutions Architect Portfolio** | Day 1, Session 2
>
> A complete ML pipeline demonstrating: data loading, exploration, preparation, training, evaluation, and explainability.

In [None]:
# Cell 1: Import Libraries
# Imports used by the cells below, gathered in one place so the notebook's
# dependencies are visible up front.
import pandas as pd                  # Data manipulation (think: spreadsheets in code)
import numpy as np                   # Math operations (think: calculator on steroids)
import matplotlib.pyplot as plt      # Visualization (think: chart maker)
import seaborn as sns                # Pretty visualizations (think: chart maker, but prettier)

from sklearn.datasets import load_iris          # Our dataset
from sklearn.model_selection import train_test_split  # Split data for training/testing
from sklearn.preprocessing import StandardScaler      # Standardize features (zero mean, unit variance)
from sklearn.ensemble import RandomForestClassifier   # Our model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # Evaluation helpers

# Simple confirmation that every import above succeeded.
print("Libraries loaded. Ready to build.")

In [None]:
# Cell 2: Load and Explore the Data
# Load the Iris dataset - 150 flowers, 4 measurements each, 3 species
iris = load_iris()

# Convert to a DataFrame (like a spreadsheet): one column per measurement
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target  # Add the integer label (0, 1, or 2 = three species)

# Build the readable name column from the dataset's own metadata instead of a
# hand-typed dict: position i in iris.target_names is the name for label i,
# so the names cannot drift out of sync with the labels printed below.
df['species_name'] = df['species'].map(
    {label: str(name) for label, name in enumerate(iris.target_names)}
)

# What does our data look like?
print(f"Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"\nFeatures (inputs): {iris.feature_names}")
print(f"Labels (outputs): {iris.target_names}")
print("\nFirst 5 rows:")
df.head()  # Last expression in the cell -> rendered as a rich table

In [None]:
# Cell 3: Visualize the Data (Architect's Eye)
# Look at the data BEFORE modeling: do the species separate on a given
# measurement, and is each measurement therefore worth feeding to a model?

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Iris Dataset - Feature Distributions by Species', fontsize=14)

# axes.flat walks the 2x2 grid row by row, so the four features fill the
# panels left-to-right, top-to-bottom.
for ax, feature in zip(axes.flat, iris.feature_names):
    # One semi-transparent histogram per species, overlaid on the same panel.
    for species_id, species_name in enumerate(iris.target_names):
        measurements = df.loc[df['species'] == species_id, feature]
        ax.hist(measurements, bins=15, alpha=0.6, label=species_name)
    ax.set(xlabel=feature, ylabel='Count')
    ax.legend()

plt.tight_layout()
plt.show()

# WHAT TO NOTICE: a feature whose colors barely overlap separates the species
# well and is useful; a feature whose colors sit on top of each other adds little.

In [None]:
# Cell 4: Prepare the Data (The Pipeline Begins)
# STEP 1: Pull the inputs (X) and the target (y) out of the DataFrame
X = df.loc[:, iris.feature_names]  # The 4 measurements (inputs)
y = df.loc[:, 'species']           # The species to predict (output)

# STEP 2: Hold out 20% of the rows for testing; the model trains on the other 80%
# and never sees the held-out rows. A fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Testing set:  {len(X_test)} samples")

# STEP 3: Normalize features
# The scaler learns its statistics from the TRAINING rows only...
scaler = StandardScaler().fit(X_train)
# ...and those same statistics are then applied to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# WHY: the test rows are never used to fit the scaler. Fitting on them would
# leak information about unseen data into the pipeline - "data leakage",
# a common mistake the SA should catch.
print("\nFeatures normalized. Pipeline step 1 complete.")

In [None]:
# Cell 5: Train the Model
# Random Forest = a collection of decision trees that vote on the answer
# Think of it as a panel of experts. Each tree sees a random subset of data,
# makes its own decision, and the majority vote wins.

model = RandomForestClassifier(
    n_estimators=100,    # 100 decision trees in our forest
    random_state=42      # Reproducible results
)

# Train the model - this is where it learns patterns
model.fit(X_train_scaled, y_train)

# Report the actual split sizes rather than hard-coded numbers, so the
# messages stay correct if the dataset or test_size ever changes.
print(f"Model trained on {len(X_train_scaled)} samples.")
print(f"Now let's see how it performs on the {len(X_test_scaled)} samples it's never seen...")

In [None]:
# Cell 6: Evaluate Performance
# Make predictions on the test set (data the model has never seen)
y_pred = model.predict(X_test_scaled)

# Integer label i corresponds to iris.target_names[i]. Passing the full label
# list explicitly keeps the report rows and the matrix rows/columns aligned
# with those names even if some class is absent from the test split or from
# the predictions (otherwise scikit-learn infers the labels from the data).
class_labels = list(range(len(iris.target_names)))

# How accurate?
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.1%}")
print("\nDetailed Report:")
print(classification_report(y_test, y_pred,
                            labels=class_labels,
                            target_names=iris.target_names))

# Confusion Matrix - shows exactly where the model gets confused
# (rows = actual species, columns = predicted species)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Where Does the Model Get It Wrong?')
plt.show()

# ARCHITECT'S QUESTION: Is this accuracy good enough for the business problem?
# 100% on Iris is easy. Real-world problems are messier.

In [None]:
# Cell 7: Feature Importance (The Explainability Layer)
# A fitted Random Forest exposes one importance score per input feature.

importance = model.feature_importances_

# Pair each score with its feature name and order the rows from least to most
# important, so the largest bar ends up at the top of the horizontal chart.
feature_importance = (
    pd.DataFrame({'Feature': iris.feature_names, 'Importance': importance})
    .sort_values(by='Importance')
)

plt.figure(figsize=(8, 5))
plt.barh(
    y=feature_importance['Feature'],
    width=feature_importance['Importance'],
    color='steelblue',
)
plt.xlabel('Importance Score')
plt.title('Feature Importance - What Drives the Prediction?')
plt.show()

# ARCHITECT INSIGHT: One dominant feature suggests a simple model.
# Evenly spread importance suggests more complex relationships.
# Either way, this feeds your architecture decisions (latency, compute needs).

In [None]:
# Cell 8: The Full Pipeline Diagram
# Let's visualize what we just built - this is the architecture artifact.
# NOTE: the diagram is a static, hand-written string; it is not generated from
# the objects created in earlier cells, so update it by hand if the pipeline
# (split ratio, number of trees, steps) changes.

pipeline_text = """
======================================================================
                   ML PIPELINE ARCHITECTURE                      
                   Project 1: Iris Classifier                    
======================================================================
                                                                  
  +------------+   +------------+   +------------+              
  | 1. DATA    |-->| 2. PREPARE |-->| 3. SPLIT   |              
  | Load Iris  |   | Clean &    |   | 80% Train  |              
  | 150 rows   |   | Explore    |   | 20% Test   |              
  | 4 features |   | Visualize  |   |            |              
  +------------+   +------------+   +-----+------+              
                                          |                      
                              +-----------+-----------+          
                              v                       v          
                    +------------+           +------------+      
                    | 4. SCALE   |           | Hold for   |      
                    | Normalize  |           | testing    |      
                    | features   |           | (unseen)   |      
                    +-----+------+           +-----+------+      
                          v                        |              
                    +------------+                 |              
                    | 5. TRAIN   |                 |              
                    | Random     |                 |              
                    | Forest     |                 |              
                    | (100 trees)|                 |              
                    +-----+------+                 |              
                          v                        v              
                    +------------+           +------------+      
                    | 6. PREDICT |---------->| 7. EVALUATE|      
                    | on test    |           | Accuracy   |      
                    | data       |           | Confusion  |      
                    |            |           | Matrix     |      
                    +------------+           +-----+------+      
                                                   v              
                                            +------------+       
                                            | 8. EXPLAIN |       
                                            | Feature    |       
                                            | Importance |       
                                            +------------+       
                                                                  
======================================================================
"""
# Emit the diagram as plain text so it renders in any notebook viewer.
print(pipeline_text)