In [None]:
# Run this as the FIRST cell - pins native thread pools to one thread
# (workaround for threadpoolctl issues)
import os

# These must be exported BEFORE numpy/sklearn are imported, hence this cell
# runs ahead of every other import.
os.environ.update(
    {
        # Thread-count limits for the numeric backends
        "OMP_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "VECLIB_MAXIMUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        # NOTE(review): the three names below are set defensively; it is not
        # clear that any library actually reads them - confirm or drop.
        "BLAS_NUM_THREADS": "1",
        "LAPACK_NUM_THREADS": "1",
        "SKLEARN_DISABLE_THREADPOOL_CHECK": "1",
    }
)

from importlib import metadata
import warnings

# Hide only warnings whose message mentions threadpool or OpenBLAS;
# every other warning is still shown.
for _noisy_pattern in (".*threadpool.*", ".*OpenBLAS.*"):
    warnings.filterwarnings("ignore", message=_noisy_pattern)

# Report the installed versions of the two packages involved in the issue
for _dist_name in ("scikit-learn", "threadpoolctl"):
    print(f"{_dist_name}:", metadata.version(_dist_name))

# Import required libraries
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Import sklearn components; report a failure and then stop.
# Every later section depends on these names, so continuing after a failed
# import would only surface as confusing NameErrors further down.
try:
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.datasets import load_iris, load_digits
except ImportError as e:
    print(f"Import error: {e}")
    raise  # re-raise so the real cause is the first error the user sees
else:
    print("✓ All sklearn imports successful")

# Default figure size (inches) and resolution for every plot in this notebook
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'figure.dpi': 130,
})

# ---- Part 1: Elbow method on Iris ----
print("\n=== Part 1: Elbow Method Analysis ===")

try:
    iris = load_iris()
    X_iris = iris.data

    # Candidate cluster counts, and the inertia recorded for each one
    ks = range(1, 11)
    inertias = []

    # Settings shared by every fit; only n_clusters changes between runs.
    # n_init and algorithm are spelled out instead of relying on defaults.
    kmeans_settings = dict(
        n_init=10,
        random_state=42,
        algorithm='lloyd',
        max_iter=300,
    )

    for k in ks:
        km = KMeans(n_clusters=k, **kmeans_settings)
        km.fit(X_iris)
        inertias.append(km.inertia_)
        print(f"k={k}: inertia={km.inertia_:.2f}")

    # Inertia against k: the "elbow" is read off this curve by eye
    k_values = list(ks)
    plt.figure()
    plt.plot(k_values, inertias, marker='o', linewidth=2, markersize=6)
    plt.xticks(k_values)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
    plt.title('The Elbow Method (Iris Dataset)')
    plt.grid(True, linestyle='--', alpha=0.4)
    plt.tight_layout()
    plt.show()

    print("✓ Elbow method analysis completed successfully")

except Exception as e:
    # Best-effort section: report the problem and let the notebook continue
    print(f"Error in elbow method analysis: {e}")

# ---- Part 2: MNIST/Digits + Logistic Regression with/without PCA ----
# Section banner so the Part 2 output is easy to locate in the cell log.
print("\n=== Part 2: Logistic Regression Comparison ===")

def load_mnist_or_digits():
    """Return ``(X, y, source_label)`` for a digit-classification dataset.

    MNIST is requested from OpenML first. If that fails for any reason
    (the import, the download, or the label conversion), the failure is
    printed and the dataset returned by ``load_digits()`` is used instead.
    ``source_label`` is a human-readable string naming which dataset was
    actually loaded.
    """
    try:
        from sklearn.datasets import fetch_openml
        print("Attempting to load MNIST from OpenML...")
        mnist = fetch_openml('mnist_784', version=1, as_frame=False)
        # Labels are converted to integers before being handed back
        return mnist.data, mnist.target.astype(int), "OpenML MNIST (784 features)"
    except Exception as e:
        print(f"OpenML unavailable ({e}), using fallback dataset...")

    # Only reached when the OpenML path raised
    digits = load_digits()
    return digits.data, digits.target, "Fallback: sklearn Digits (64 features)"

def run_logreg_pipeline(X, y, use_pca: bool):
    """Train and score logistic regression, optionally after a PCA step.

    Steps: stratified 80/20 train/test split, standardization fitted on the
    training part only, optional PCA, an lbfgs logistic-regression fit, and
    accuracy on the held-out part.

    Args:
        X: Feature matrix; must expose ``.shape`` (its column count is
            reported as the feature count).
        y: Class labels, also used to stratify the split.
        use_pca: When true, project the scaled features with ``PCA(0.95)``
            before fitting the classifier.

    Returns:
        A dict with keys ``"accuracy"``, ``"fit_time"`` (seconds spent in
        ``clf.fit`` only), ``"pca_time"`` (seconds spent fitting/applying
        PCA, ``0.0`` without PCA) and ``"n_components"`` (number of columns
        fed to the classifier). Returns ``None`` if any step raises; the
        error is printed rather than propagated.
    """
    try:
        # Split the data
        Xtr, Xte, ytr, yte = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale the features; the test part reuses the training statistics
        scaler = StandardScaler()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)

        # Apply PCA if requested
        if use_pca:
            pca = PCA(0.95, random_state=42)
            # perf_counter is monotonic, so the interval cannot be skewed
            # by wall-clock adjustments the way time.time() can
            t0 = time.perf_counter()
            Xtr_t = pca.fit_transform(Xtr)
            Xte_t = pca.transform(Xte)
            pca_time = time.perf_counter() - t0
            comp = Xtr_t.shape[1]
            print(f"PCA reduced {X.shape[1]} features to {comp} components")
        else:
            Xtr_t, Xte_t = Xtr, Xte
            pca_time = 0.0
            comp = X.shape[1]

        # Train logistic regression (with single-threaded settings).
        # multi_class is deliberately not passed: 'auto' is the library
        # default and the keyword is deprecated in recent scikit-learn, so
        # passing it would warn now and fail once the keyword is removed.
        clf = LogisticRegression(
            solver='lbfgs',
            max_iter=200,
            n_jobs=1,  # Use single thread to avoid threading issues
            random_state=42
        )

        t1 = time.perf_counter()
        clf.fit(Xtr_t, ytr)
        fit_time = time.perf_counter() - t1

        # Make predictions and calculate accuracy
        y_pred = clf.predict(Xte_t)
        acc = accuracy_score(yte, y_pred)

        return {
            "accuracy": acc,
            "fit_time": fit_time,
            "pca_time": pca_time,
            "n_components": comp
        }

    except Exception as e:
        # Best-effort contract: callers check for None instead of catching
        print(f"Error in pipeline: {e}")
        return None

# Part 2 driver: load the data, run both pipeline variants, then show a
# comparison table, a fit-time bar chart and a short text summary.
try:
    # Load dataset
    X, y, source = load_mnist_or_digits()
    print(f"\nLoaded: {source}")
    print(f"Shape: {X.shape} | Classes: {len(np.unique(y))}")

    # Run both pipelines
    print("\nRunning without PCA...")
    res_no = run_logreg_pipeline(X, y, use_pca=False)

    print("Running with PCA...")
    res_pca = run_logreg_pipeline(X, y, use_pca=True)

    # run_logreg_pipeline signals failure with None, so test for exactly that
    if res_no is not None and res_pca is not None:
        # Create comparison table
        summary = pd.DataFrame([
            {
                "Variant": "Without PCA",
                "Accuracy": res_no["accuracy"],
                "Fit Time (s)": res_no["fit_time"],
                "PCA Time (s)": res_no["pca_time"],
                "Features/Components": res_no["n_components"]
            },
            {
                "Variant": "With PCA (0.95)",
                "Accuracy": res_pca["accuracy"],
                "Fit Time (s)": res_pca["fit_time"],
                "PCA Time (s)": res_pca["pca_time"],
                "Features/Components": res_pca["n_components"]
            }
        ])

        # Display formatted table
        styled_summary = summary.style.format({
            "Accuracy": "{:.4f}",
            "Fit Time (s)": "{:.3f}",
            "PCA Time (s)": "{:.3f}"
        })
        display(styled_summary)

        # Create comparison plot
        plt.figure()
        variants = ["No PCA", "With PCA"]
        times = [res_no["fit_time"], res_pca["fit_time"]]

        bars = plt.bar(variants, times, color=['skyblue', 'lightcoral'])
        plt.title("Logistic Regression Fit Time Comparison")
        plt.ylabel("Seconds")
        plt.grid(axis='y', alpha=0.3)

        # Add value labels just above each bar
        for bar, time_val in zip(bars, times):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                    f'{time_val:.3f}s', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        # Print summary insights
        print("\n=== Summary ===")
        print(f"Dataset: {source}")
        # Both deltas use the same convention: (with PCA) - (without PCA).
        # A negative time change therefore means PCA made the fit faster.
        accuracy_diff = res_pca["accuracy"] - res_no["accuracy"]
        time_diff = res_pca["fit_time"] - res_no["fit_time"]

        print(f"Accuracy change with PCA: {accuracy_diff:+.4f}")
        print(f"Fit time change with PCA: {time_diff:+.3f}s")
        print(f"Feature reduction: {X.shape[1]} → {res_pca['n_components']} components")

    else:
        print("Pipeline execution failed")

except Exception as e:
    # Best-effort section: report the problem instead of aborting the cell
    print(f"Error in main execution: {e}")

print("\n✓ Analysis completed!")