In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

#  Data Loading and Exploration
def load_and_explore_data(file_path=None):
    """
    Load the iris dataset and print an exploratory summary.

    Parameters:
        file_path: Path of a CSV file to read with pandas. When falsy
            (None or an empty string) the iris dataset bundled with
            seaborn is loaded instead.

    Returns:
        A DataFrame holding the loaded data. If loading raises for any
        reason, a tiny hard-coded nine-row sample (three rows per species)
        is returned so the rest of the workflow can still be exercised.

    The data is expected to contain a 'species' column; the class
    distribution step raises KeyError otherwise.
    """
    try:
        if file_path:
            # Read the file the caller actually asked for.
            df = pd.read_csv(file_path)
            print(f"Dataset loaded successfully from {file_path}")
        else:
            # Use the built-in iris dataset from seaborn
            print("No file path provided. Using the built-in iris dataset...")
            df = sns.load_dataset('iris')

    except Exception as e:
        # Deliberate best-effort: any loading problem (missing file, parse
        # error, no network for the seaborn download) falls back to a sample.
        print(f"Error loading data: {e}")
        print("Creating a sample iris dataset for testing...")

        # Create a sample iris dataset with the correct column names
        data = {
            'sepal_length': [5.1, 4.9, 4.7, 7.0, 6.4, 6.9, 6.3, 5.8, 7.1],
            'sepal_width': [3.5, 3.0, 3.2, 3.2, 3.2, 3.1, 2.5, 2.7, 3.0],
            'petal_length': [1.4, 1.4, 1.3, 4.7, 4.5, 4.9, 6.0, 5.1, 5.9],
            'petal_width': [0.2, 0.2, 0.2, 1.4, 1.5, 1.5, 2.5, 1.9, 2.1],
            'species': ['setosa', 'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica', 'virginica']
        }
        df = pd.DataFrame(data)

    print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

    print("\nFirst 5 records:")
    print(df.head())

    print("\nData Information:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    print("\nStatistical Summary:")
    print(df.describe())

    print("\nClass Distribution:")
    print(df['species'].value_counts())

    print("\nChecking for missing values:")
    print(df.isnull().sum())

    return df

#  Data Visualization
def visualize_data(df):
    """
    Create and save exploratory plots of the dataset.

    Three PNG files are written to the current working directory:
    'iris_pairplot.png', 'iris_boxplots.png' and 'iris_correlation.png'.
    Every figure is closed after saving so no figure is left open.

    Parameters:
        df: DataFrame with a 'species' column plus the columns
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'.

    Returns:
        None.
    """
    # 2.1 Create a pairplot to visualize relationships between features by species.
    # sns.pairplot builds its own figure, so no figure is created beforehand
    # (a pre-created one would stay empty and never be closed).
    print("Creating pairplot of features...")
    pairplot = sns.pairplot(df, hue='species', height=2.5)
    pairplot.fig.suptitle('Pairwise Relationships between Features', y=1.02)
    # The title sits above the canvas (y > 1); a tight bounding box keeps it
    # inside the saved image instead of cropping it away.
    pairplot.fig.savefig('iris_pairplot.png', bbox_inches='tight')
    plt.close(pairplot.fig)

    # 2.2 Create boxplots for each feature by species
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

    box_fig, box_axes = plt.subplots(2, 2, figsize=(12, 8))
    box_fig.suptitle('Feature Distributions by Species', y=0.95, fontsize=16)

    for ax, feature in zip(box_axes.flat, features):
        sns.boxplot(x='species', y=feature, data=df, ax=ax)
        ax.set_title(f'{feature.replace("_", " ").title()} by Species')

    box_fig.tight_layout()
    box_fig.savefig('iris_boxplots.png')
    plt.close(box_fig)

    # 2.3 Create a correlation heatmap (numeric columns only; 'species' is text)
    numeric_df = df.select_dtypes(include=[np.number])
    correlation = numeric_df.corr()

    heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', linewidths=0.5, ax=heat_ax)
    heat_ax.set_title('Correlation Heatmap of Features', fontsize=16)
    heat_fig.tight_layout()
    heat_fig.savefig('iris_correlation.png')
    plt.close(heat_fig)

    print("Visualizations saved as PNG files.")

#  Data Preprocessing
def preprocess_data(df):
    """
    Separate features from the target and make a stratified train/test split.

    Parameters:
        df: DataFrame containing a 'species' column (the target); every
            other column is used as a feature.

    Returns:
        A 6-tuple (X_train, X_test, y_train, y_test, X, y) where X and y are
        the full feature table and target column before splitting.
    """
    # 3.1 Target column and the remaining feature columns
    target = df['species']
    features = df.drop(columns='species')

    # 3.2 Hold out 20% for testing; stratifying keeps class proportions equal
    # in both parts and the fixed seed makes the split reproducible.
    split_parts = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )
    X_train, X_test, y_train, y_test = split_parts

    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Testing set size: {X_test.shape[0]} samples")

    return X_train, X_test, y_train, y_test, features, target

#  Model Building and Evaluation
def build_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train five scaled classifiers and report how each one performs.

    Each candidate is wrapped in a Pipeline with a StandardScaler, fitted on
    the training data, scored on the test data, and cross-validated (5-fold)
    on the training data. A confusion-matrix PNG named
    'confusion_matrix_<model name>.png' is written for every candidate.

    Parameters:
        X_train, X_test: feature tables for training and testing.
        y_train, y_test: matching target labels.

    Returns:
        (results, best_model_name, best_model) where results maps each model
        name to a dict with keys 'model' (fitted pipeline), 'accuracy' (test
        accuracy), 'cv_scores' and 'predictions'; best_model is the fitted
        pipeline with the highest test accuracy (first one wins on ties).
    """
    # 4.1 Define the models to evaluate
    models = {
        'Logistic Regression': LogisticRegression(max_iter=200, random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM': SVC(random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }

    # Class labels come from the data passed in, not from a module-level
    # global, so the function also works when called on its own.
    class_labels = sorted(set(y_train) | set(y_test))

    # 4.2 Containers for the per-model results and the running best model
    results = {}
    best_accuracy = 0
    best_model_name = None
    best_model = None

    # 4.3 Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Create a pipeline with scaling
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Run cross-validation (refits clones; the fitted pipeline is untouched)
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

        # Print results
        print(f"{name} Results:")
        print(f"  Test Accuracy: {accuracy:.4f}")
        print(f"  Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        # Create confusion matrix. Passing labels= pins the row/column order
        # and size so the matrix always lines up with the tick labels, even
        # if some class is absent from y_test and y_pred.
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_labels,
                    yticklabels=class_labels)
        plt.title(f'Confusion Matrix - {name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{name.replace(" ", "_").lower()}.png')
        plt.close()

        # Store the results
        results[name] = {
            'model': pipeline,
            'accuracy': accuracy,
            'cv_scores': cv_scores,
            'predictions': y_pred
        }

        # Keep track of the best model.
        # NOTE(review): selection uses test-set accuracy, so the reported
        # accuracy of the winner is optimistic; consider cv_scores.mean().
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = name
            best_model = pipeline

    print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")
    return results, best_model_name, best_model

#  Hyperparameter Tuning
def tune_best_model(best_model_name, X_train, y_train, X_test, y_test):
    """
    Grid-search hyperparameters for the model type that performed best.

    A fresh StandardScaler + estimator pipeline is tuned with 5-fold
    cross-validated GridSearchCV (accuracy scoring, all CPU cores), the best
    estimator is scored on the test data, and a confusion-matrix PNG named
    'confusion_matrix_tuned_<model name>.png' is written.

    Parameters:
        best_model_name: one of 'Logistic Regression', 'Decision Tree',
            'Random Forest', 'SVM', 'K-Nearest Neighbors'.
        X_train, y_train: data used for the grid search.
        X_test, y_test: data used to score the tuned model.

    Returns:
        (tuned_model, tuned_accuracy), or (None, None) when no parameter
        grid is defined for best_model_name.
    """
    # Define parameter grids based on the best model
    param_grids = {
        'Logistic Regression': {
            'model__C': [0.01, 0.1, 1, 10, 100],
            'model__solver': ['liblinear', 'lbfgs']
        },
        'Decision Tree': {
            'model__max_depth': [None, 5, 10, 15],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        },
        'Random Forest': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        },
        'SVM': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__gamma': ['scale', 'auto', 0.1, 1]
        },
        'K-Nearest Neighbors': {
            'model__n_neighbors': [3, 5, 7, 9],
            'model__weights': ['uniform', 'distance'],
            'model__metric': ['euclidean', 'manhattan']
        }
    }

    # Factories for an untrained estimator of each supported type. Logistic
    # regression uses the same max_iter as the baseline model so tuned and
    # untuned results are comparable.
    estimator_factories = {
        'Logistic Regression': lambda: LogisticRegression(max_iter=200, random_state=42),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Random Forest': lambda: RandomForestClassifier(random_state=42),
        'SVM': lambda: SVC(random_state=42),
        'K-Nearest Neighbors': lambda: KNeighborsClassifier(),
    }

    # Guard clause: nothing to tune for an unknown model name
    if best_model_name not in param_grids:
        print(f"No parameter grid defined for {best_model_name}. Skipping hyperparameter tuning.")
        return None, None

    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")

    # Create a new pipeline for grid search
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator_factories[best_model_name]())
    ])

    # Set up grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grids[best_model_name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )

    # Fit grid search
    grid_search.fit(X_train, y_train)

    # Get best parameters and score
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

    # Evaluate the tuned model
    tuned_model = grid_search.best_estimator_
    tuned_y_pred = tuned_model.predict(X_test)
    tuned_accuracy = accuracy_score(y_test, tuned_y_pred)

    print(f"Tuned model test accuracy: {tuned_accuracy:.4f}")
    print("\nTuned Model Classification Report:")
    print(classification_report(y_test, tuned_y_pred))

    # Create confusion matrix for tuned model. Labels are derived from the
    # arguments (not a module-level global) and passed to confusion_matrix so
    # the matrix shape always matches the tick labels.
    class_labels = sorted(set(y_train) | set(y_test))
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, tuned_y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.title(f'Confusion Matrix - Tuned {best_model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_tuned_{best_model_name.replace(" ", "_").lower()}.png')
    plt.close()

    return tuned_model, tuned_accuracy

#  Feature Importance Analysis
def _rank_importance(feature_names, importance):
    """Build a Feature/Importance table sorted from most to least important."""
    return pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values('Importance', ascending=False)


def analyze_feature_importance(df, best_model, best_model_name):
    """
    Compute, print and plot the feature importance of a fitted pipeline.

    The method depends on best_model_name:
      * 'Logistic Regression': mean absolute coefficient across classes.
      * 'Decision Tree' / 'Random Forest': the estimator's
        feature_importances_.
      * anything else: permutation importance computed on all rows of df.

    A bar chart is saved to 'feature_importance.png'.

    Parameters:
        df: DataFrame with a 'species' column; the other columns are the
            features, in the order the model was trained on.
        best_model: fitted pipeline whose final step is named 'model'.
        best_model_name: display name selecting the method above.

    Returns:
        A DataFrame with columns 'Feature' and 'Importance' sorted in
        descending order, or None if the importance could not be computed
        or plotted (the reason is printed).
    """
    feature_names = df.drop('species', axis=1).columns

    # Different models have different ways to access feature importance
    if best_model_name == 'Logistic Regression':
        # For multiclass, coefficients are n_classes * n_features
        # We take the absolute mean across classes
        try:
            coefficients = best_model.named_steps['model'].coef_
            importance = np.abs(coefficients).mean(axis=0)
            feature_importance = _rank_importance(feature_names, importance)

            print("\nFeature Importance (Logistic Regression):")
            print(feature_importance)
        except Exception as e:
            # Best-effort: report why instead of swallowing silently.
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
            print("Could not extract feature importance from Logistic Regression model.")
            print(f"  Reason: {e}")
            return None

    elif best_model_name in ['Decision Tree', 'Random Forest']:
        try:
            importance = best_model.named_steps['model'].feature_importances_
            feature_importance = _rank_importance(feature_names, importance)

            print(f"\nFeature Importance ({best_model_name}):")
            print(feature_importance)
        except Exception as e:
            print(f"Could not extract feature importance from {best_model_name} model.")
            print(f"  Reason: {e}")
            return None
    else:
        # For models without direct feature importance
        print(f"\n{best_model_name} doesn't provide direct feature importance.")
        print("Using permutation importance instead...")

        try:
            from sklearn.inspection import permutation_importance

            X = df.drop('species', axis=1)
            y = df['species']

            # Calculate permutation importance.
            # NOTE(review): this scores on every row of df, which includes
            # the rows the model was trained on.
            result = permutation_importance(best_model, X, y, n_repeats=10, random_state=42)
            importance = result.importances_mean
            feature_importance = _rank_importance(feature_names, importance)

            print("Permutation Feature Importance:")
            print(feature_importance)
        except Exception as e:
            print(f"Error calculating permutation importance: {e}")
            return None

    # Plot feature importance
    try:
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=feature_importance)
        plt.title(f'Feature Importance - {best_model_name}')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()

        return feature_importance
    except Exception as e:
        print(f"Error plotting feature importance: {e}")
        return None

# Save the Model
def save_model(model, filename='iris_classification_model.pkl'):
    """
    Pickle a trained model to disk.

    Parameters:
        model: the object to serialize.
        filename: destination path; an existing file is overwritten.

    Returns:
        True when the model was written, False when any error occurred
        (the error text is printed).
    """
    import pickle

    saved = False
    try:
        with open(filename, 'wb') as sink:
            pickle.dump(model, sink)
        print(f"Model saved successfully to {filename}")
        saved = True
    except Exception as err:
        print(f"Error saving model: {str(err)}")
    return saved

# 8. Create a prediction function
def predict_species(model, sepal_length, sepal_width, petal_length, petal_width):
    """
    Predict the species of a single flower from its four measurements.

    Parameters:
        model: fitted estimator exposing predict(); if it also exposes
            predict_proba() and classes_, class probabilities are returned.
        sepal_length, sepal_width, petal_length, petal_width: the
            measurements, passed to the model as a one-row DataFrame with
            columns of those names.

    Returns:
        (species, prob_dict) where species is the predicted label and
        prob_dict maps each class label to its probability, or is None when
        the model cannot provide probabilities.
    """
    # Create a DataFrame with the input features
    input_data = pd.DataFrame({
        'sepal_length': [sepal_length],
        'sepal_width': [sepal_width],
        'petal_length': [petal_length],
        'petal_width': [petal_width]
    })

    # Make prediction
    species = model.predict(input_data)[0]

    # Models without probability support only yield the label
    if not hasattr(model, 'predict_proba'):
        return species, None

    try:
        probabilities = model.predict_proba(input_data)[0]
        # Pair class labels with their probabilities
        prob_dict = dict(zip(model.classes_, probabilities))
    except Exception:
        # Probabilities are optional extras: an estimator may expose
        # predict_proba yet fail when it is called, so fall back to None.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        prob_dict = None

    return species, prob_dict

# Main function
def main():
    """
    Drive the whole iris workflow from data loading to sample predictions.

    Prompts for an optional CSV path, then loads, visualizes, splits,
    trains, tunes, analyzes, saves and finally smoke-tests the chosen model
    on one hand-picked sample per species.
    """
    # The loaded frame is published as a module-level name because other
    # functions in this file read `df` directly.
    global df

    banner = "=" * 60
    print(banner)
    print("Iris Flower Species Classification")
    print(banner)

    # Ask for file path; a blank answer means "use the default data"
    answer = input("Enter the path to your iris dataset CSV file (or press Enter to use sample data): ")
    file_path = answer if answer.strip() else None

    # 1. Load and explore data
    df = load_and_explore_data(file_path)

    # 2. Visualize data
    visualize_data(df)

    # 3. Preprocess data
    X_train, X_test, y_train, y_test, X, y = preprocess_data(df)

    # 4. Build and evaluate models
    results, best_model_name, best_model = build_and_evaluate_models(X_train, X_test, y_train, y_test)

    # 5. Tune the best model
    tuned_model, tuned_accuracy = tune_best_model(best_model_name, X_train, y_train, X_test, y_test)

    # Keep the tuned model only when it exists and strictly beats the
    # untuned test accuracy; the decision is made once and reused.
    prefer_tuned = bool(tuned_model and tuned_accuracy > results[best_model_name]['accuracy'])
    if prefer_tuned:
        final_model = tuned_model
        final_model_name = f"Tuned {best_model_name}"
    else:
        final_model = best_model
        final_model_name = best_model_name

    # 6. Analyze feature importance
    feature_importance = analyze_feature_importance(df, final_model, best_model_name)

    # 7. Save the model
    save_model(final_model)

    # 8. Test the prediction function
    print("\nTesting prediction function with sample data:")
    samples = (
        [5.1, 3.5, 1.4, 0.2],  # Sample for Setosa
        [6.0, 2.9, 4.5, 1.5],  # Sample for Versicolor
        [6.7, 3.1, 5.6, 2.4],  # Sample for Virginica
    )

    for sample in samples:
        species, probs = predict_species(final_model, *sample)
        print(f"\nFor measurements: sepal_length={sample[0]}, sepal_width={sample[1]}, petal_length={sample[2]}, petal_width={sample[3]}")
        print(f"Predicted species: {species}")
        if probs:
            print("Prediction probabilities:")
            for species_name, prob in probs.items():
                print(f"  - {species_name}: {prob:.4f}")

    print("\nIris classification workflow completed successfully!")


if __name__ == "__main__":
    main()

Iris Flower Species Classification


Enter the path to your iris dataset CSV file (or press Enter to use sample data):  


No file path provided. Using the built-in iris dataset...
Dataset loaded with 150 rows and 5 columns

First 5 records:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Statistical Sum

<Figure size 1200x1000 with 0 Axes>