# Iris Classification Pipeline
## End-to-End ML Pipeline with Decision Tree and Logistic Regression

This notebook demonstrates:
1. Data loading and exploration
2. Data cleaning and preprocessing
3. Model training (Decision Tree & Logistic Regression)
4. Model evaluation
5. Model export using joblib

## 1. Install Required Libraries

In [None]:
!pip install scikit-learn pandas numpy matplotlib seaborn joblib

## 2. Import Libraries

In [None]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    confusion_matrix,
    classification_report
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 3. Load and Explore Data

In [None]:
# Build one DataFrame from the bundled Iris data: a column per measurement,
# plus the numeric class code (`target`) and its readable name (`species`)
iris = load_iris()
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df = (
    pd.DataFrame(data=iris.data, columns=iris.feature_names)
    .assign(target=iris.target)
    .assign(species=lambda frame: frame['target'].map(species_by_code))
)

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

In [None]:
# Structure of the frame first (printed by .info()), then the describe()
# summary table as the cell's displayed output
print("Dataset Info:")
df.info()

print("\nStatistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

# Number of rows belonging to each species
species_counts = df['species'].value_counts()
print("\nClass distribution:")
print(species_counts)

## 4. Data Visualization

In [None]:
# Pairwise scatter plots of the measurements, coloured by species.
# `vars` limits the grid to the real features: `df` also holds the numeric
# `target` column, which is only the class code and would otherwise be drawn
# as if it were a fifth measurement.
sns.pairplot(
    df,
    vars=iris.feature_names,
    hue='species',
    markers=['o', 's', 'D'],
)
plt.suptitle('Iris Dataset - Feature Relationships', y=1.02)
plt.show()

In [None]:
# Pairwise correlations between the feature columns, drawn as an annotated heatmap
feature_corr = df[iris.feature_names].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

## 5. Data Cleaning and Preprocessing

In [None]:
# The cleaning steps below are shown for completeness; each one is applied
# to a copy so the original `df` stays untouched.

# Step 1 - missing values: nothing is imputed or dropped here, we only
# start from a fresh copy of the loaded frame
df_cleaned = df.copy()

# Step 2 - exact duplicate rows: report the count, drop them, report again
n_duplicates_before = df_cleaned.duplicated().sum()
print(f"Duplicates before cleaning: {n_duplicates_before}")

df_cleaned = df_cleaned.drop_duplicates()

n_duplicates_after = df_cleaned.duplicated().sum()
print(f"Duplicates after cleaning: {n_duplicates_after}")

# 3. Handle outliers using IQR method (optional for Iris dataset)
def remove_outliers_iqr(df, columns):
    """Return a copy of ``df`` without rows that are IQR outliers in any of ``columns``.

    A value is kept when it lies inside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``
    for its column. The bounds of every column are computed from the input
    frame itself and all rows are filtered in a single pass, so the result
    does not depend on the order in which the columns are listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of column labels
        Numeric columns to check. An empty iterable returns an unfiltered copy.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose values are within bounds in every listed
        column. Rows with a missing value in a listed column are dropped,
        because a missing value never compares as inside the bounds.
    """
    columns = list(columns)
    if not columns:
        # Nothing to check: hand back an independent copy
        return df.copy()

    checked = df[columns]
    q1 = checked.quantile(0.25)
    q3 = checked.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Compare each column against its own bounds (aligned on column labels),
    # then keep only rows that pass in every column
    within_bounds = (
        checked.ge(lower_bound, axis='columns')
        & checked.le(upper_bound, axis='columns')
    ).all(axis=1)
    return df.loc[within_bounds].copy()

# Outlier removal is left switched off, so both counts below are the same
# unless the call underneath is enabled
print(f"\nRows before outlier removal: {df_cleaned.shape[0]}")
# Uncomment to remove outliers:
# df_cleaned = remove_outliers_iqr(df_cleaned, iris.feature_names)
print(f"Rows after cleaning: {df_cleaned.shape[0]}")

## 6. Prepare Data for Training

In [None]:
# Feature matrix and label vector as NumPy arrays
X = df_cleaned[iris.feature_names].to_numpy()
y = df_cleaned['target'].to_numpy()

# 80/20 train/test split, stratified on the labels and seeded for repeatability
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_class_counts = pd.Series(y_train).value_counts().sort_index()

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("\nTraining set class distribution:")
print(train_class_counts)

In [None]:
# Standardise the features for Logistic Regression. The scaler is fitted on
# the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")
print(f"\nOriginal feature means: {np.mean(X_train, axis=0)}")
print(f"Scaled feature means: {np.mean(X_train_scaled, axis=0)}")

## 7. Train Decision Tree Classifier

In [None]:
# A Decision Tree does not need feature scaling, so it is trained on the
# original (unscaled) training data
dt_params = {
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
}
dt_model = DecisionTreeClassifier(**dt_params)
dt_model.fit(X_train, y_train)

tree_depth = dt_model.get_depth()
leaf_count = dt_model.get_n_leaves()
print("Decision Tree trained successfully!")
print(f"Tree depth: {tree_depth}")
print(f"Number of leaves: {leaf_count}")

## 8. Train Logistic Regression Classifier

In [None]:
# Train one-vs-rest Logistic Regression on the scaled training data.
# LogisticRegression's own `multi_class='ovr'` argument is deprecated in
# recent scikit-learn releases; wrapping the estimator in OneVsRestClassifier
# keeps the same scheme (one binary classifier per class) without it.
lr_model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, random_state=42)
)

lr_model.fit(X_train_scaled, y_train)
print("Logistic Regression trained successfully!")

# The wrapper has no n_iter_ of its own, so gather it from each per-class model
lr_n_iter = np.concatenate([clf.n_iter_ for clf in lr_model.estimators_])
print(f"Number of iterations: {lr_n_iter}")

## 9. Model Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name,
                   target_names=None, cv=5):
    """Report metrics for a fitted classifier and plot its confusion matrix.

    Prints train/test accuracy, weighted precision/recall/F1 on the test set,
    cross-validation scores on the training data, and a classification
    report, then shows the test-set confusion matrix as a heatmap.

    Parameters
    ----------
    model : estimator
        Already-fitted classifier exposing ``predict``.
    X_train, X_test : array-like
        Feature matrices, passed to the model exactly as given. Cross-validation
        also runs on ``X_train`` as given, so any preprocessing applied to it
        beforehand is not re-fitted inside the folds.
    y_train, y_test : array-like
        True labels for the two splits.
    model_name : str
        Label used in the printed header and the plot title.
    target_names : sequence of str, optional
        Class names for the report and the heatmap axes. Defaults to the
        notebook-level ``iris.target_names``.
    cv : int, optional
        Value forwarded to ``cross_val_score`` as ``cv`` (default 5).

    Returns
    -------
    dict
        Built-in floats under the keys ``train_accuracy``, ``test_accuracy``,
        ``precision``, ``recall``, ``f1_score``, ``cv_mean`` and ``cv_std``.
    """
    class_labels = iris.target_names if target_names is None else target_names

    # Predictions on both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics, converted to plain floats so the returned dict holds only built-in types
    train_acc = float(accuracy_score(y_train, y_train_pred))
    test_acc = float(accuracy_score(y_test, y_test_pred))
    precision = float(precision_score(y_test, y_test_pred, average='weighted'))
    recall = float(recall_score(y_test, y_test_pred, average='weighted'))
    f1 = float(f1_score(y_test, y_test_pred, average='weighted'))

    print(f"\n{'='*50}")
    print(f"{model_name} Evaluation")
    print(f"{'='*50}")
    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Cross-validation on the training data only
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
    cv_mean = float(cv_scores.mean())
    cv_std = float(cv_scores.std())
    print(f"\nCross-Validation Scores: {cv_scores}")
    print(f"Mean CV Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

    # Classification report
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_test_pred, target_names=class_labels))

    # Confusion matrix for the test split, drawn on its own figure
    cm = confusion_matrix(y_test, y_test_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(f'{model_name} - Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

    return {
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'cv_mean': cv_mean,
        'cv_std': cv_std
    }

In [None]:
# Decision Tree: evaluated on the unscaled features it was trained on
dt_metrics = evaluate_model(
    model=dt_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Decision Tree",
)

In [None]:
# Logistic Regression: evaluated on the scaled features it was trained on
lr_metrics = evaluate_model(
    model=lr_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)

## 10. Model Comparison

In [None]:
# Side-by-side table: one row per model, one column per metric
comparison_df = pd.DataFrame({
    'Decision Tree': dt_metrics,
    'Logistic Regression': lr_metrics
}).T

print("\nModel Comparison:")
print(comparison_df)

# Bar chart of the test-set metrics
plotted_scores = comparison_df[['test_accuracy', 'precision', 'recall', 'f1_score']]
ax = plotted_scores.plot(kind='bar', figsize=(12, 6), rot=0)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
ax.set_xlabel('Model')
ax.legend(loc='lower right')

# Zoom in on the 0.9-1.0 band when every score is above 0.9; otherwise lower
# the axis floor so that no bar is clipped or hidden below the visible range
lowest_score = float(plotted_scores.to_numpy().min())
y_floor = 0.9 if lowest_score > 0.9 else max(0.0, lowest_score - 0.05)
ax.set_ylim(y_floor, 1.0)

ax.grid(axis='y', alpha=0.3)
plt.show()

## 11. Export Models and Metadata

In [None]:
# Create directory for models
os.makedirs('models', exist_ok=True)

# (label, fitted object, destination) for everything that is exported.
# The scaler is saved too because the Logistic Regression model was trained
# on scaled features.
artifacts = [
    ("Decision Tree model", dt_model, 'models/decision_tree_model.pkl'),
    ("Logistic Regression model", lr_model, 'models/logistic_regression_model.pkl'),
    ("Scaler", scaler, 'models/scaler.pkl'),
]

for label, fitted_object, path in artifacts:
    joblib.dump(fitted_object, path)
    print(f"✓ {label} saved to '{path}'")

In [None]:
# Describe the exported artifacts: expected features, class names, and for
# each model its file, whether inputs must be scaled first, and its metrics
dt_entry = {
    'file': 'decision_tree_model.pkl',
    'requires_scaling': False,
    'metrics': dt_metrics,
}
lr_entry = {
    'file': 'logistic_regression_model.pkl',
    'requires_scaling': True,
    'scaler_file': 'scaler.pkl',
    'metrics': lr_metrics,
}

metadata = {
    'feature_names': iris.feature_names,
    'target_names': iris.target_names.tolist(),
    'n_features': len(iris.feature_names),
    'n_classes': len(iris.target_names),
    'models': {
        'decision_tree': dt_entry,
        'logistic_regression': lr_entry,
    },
}

# Written next to the model files as indented JSON
with open('models/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("✓ Metadata saved to 'models/metadata.json'")

## 12. Test Loaded Models

In [None]:
# Round-trip check: reload every artifact from disk and predict one held-out sample
loaded_dt = joblib.load('models/decision_tree_model.pkl')
loaded_lr = joblib.load('models/logistic_regression_model.pkl')
loaded_scaler = joblib.load('models/scaler.pkl')

# First test row, kept two-dimensional (1 x n_features) for predict()
sample_input = X_test[:1]
true_label = y_test[0]

print("Sample input:", sample_input[0])
print(f"True label: {iris.target_names[true_label]}")
print()

# Decision Tree works on the raw feature values
dt_pred = loaded_dt.predict(sample_input)[0]
dt_proba = loaded_dt.predict_proba(sample_input)[0]
dt_proba_by_class = {name: prob for name, prob in zip(iris.target_names, dt_proba)}
print(f"Decision Tree Prediction: {iris.target_names[dt_pred]}")
print(f"Probabilities: {dt_proba_by_class}")
print()

# Logistic Regression needs the sample passed through the saved scaler first
sample_scaled = loaded_scaler.transform(sample_input)
lr_pred = loaded_lr.predict(sample_scaled)[0]
lr_proba = loaded_lr.predict_proba(sample_scaled)[0]
lr_proba_by_class = {name: prob for name, prob in zip(iris.target_names, lr_proba)}
print(f"Logistic Regression Prediction: {iris.target_names[lr_pred]}")
print(f"Probabilities: {lr_proba_by_class}")

print("\n✓ Models loaded and tested successfully!")

## Summary

This notebook demonstrated a complete end-to-end machine learning pipeline:

1. ✓ Data loading and exploration
2. ✓ Data cleaning and preprocessing
3. ✓ Feature scaling
4. ✓ Model training (Decision Tree & Logistic Regression)
5. ✓ Comprehensive evaluation with multiple metrics
6. ✓ Model comparison and visualization
7. ✓ Model export using joblib
8. ✓ Model verification

The exported models are ready to be integrated into a web application!