In [1]:
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris

# Fetch the bundled iris dataset and expose its data and target arrays
iris = load_iris()
X, y = iris.data, iris.target

def evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-split results.

    Prints the accuracy and a per-class classification report whose rows are
    named after the module-level ``iris.target_names``.

    Args:
        model_name: Label used in the printed header.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        X_train: Training features passed to ``model.fit``.
        X_test: Features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels the predictions are scored against.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")

    # Pin the label set to one index per target name. Without ``labels``,
    # classification_report infers the classes from y_test/y_pred and raises
    # ValueError when a class is missing from both, because the inferred
    # class count no longer matches ``target_names``.
    # NOTE(review): assumes labels are encoded 0..n_classes-1, which is how
    # load_iris() encodes ``target`` -- confirm before reusing with other data.
    class_labels = list(range(len(iris.target_names)))
    print(
        classification_report(
            y_test, y_pred, labels=class_labels, target_names=iris.target_names
        )
    )
    return accuracy

def _class_distribution(labels):
    """Return ``{class_label: count}`` for ``labels`` using plain Python scalars."""
    unique, counts = np.unique(labels, return_counts=True)
    # tolist() converts NumPy scalars to Python ones so the dict prints as
    # {0: 32, ...} instead of {np.int64(0): np.int64(32), ...} on NumPy >= 2.
    return dict(zip(unique.tolist(), counts.tolist()))


def perform_kfold(model_name, model, X_train, y_train, is_stratified=False,
                  n_splits=5, random_state=42):
    """Run shuffled K-fold cross-validation and print per-fold diagnostics.

    For every fold a fresh copy of ``model`` is fitted on the fold's training
    part and scored on its validation part. The fold accuracy and the class
    distribution of both parts are printed.

    Args:
        model_name: Label used as a prefix in the per-fold output.
        model: Template estimator. It is copied for each fold and is never
            fitted itself, so no state leaks between folds or to the caller.
        X_train: Samples to split; must support indexing with an integer
            index array (e.g. a NumPy array).
        y_train: Labels aligned with ``X_train``; same indexing requirement.
        is_stratified: Use ``StratifiedKFold`` when true, ``KFold`` otherwise.
        n_splits: Number of folds (defaults to the previous fixed value, 5).
        random_state: Seed for the shuffled split (defaults to the previous
            fixed value, 42).

    Returns:
        List with one accuracy score per fold, in fold order.
    """
    # Local import keeps this block runnable without touching the file header.
    from sklearn.base import clone

    splitter_cls = StratifiedKFold if is_stratified else KFold
    kf = splitter_cls(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = []
    # Only the stratified splitter is given the labels, as before.
    split_labels = y_train if is_stratified else None
    for fold, (train_index, val_index) in enumerate(kf.split(X_train, split_labels), 1):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # safe=False falls back to a deep copy for objects that are not
        # scikit-learn estimators, so duck-typed models keep working.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train_fold, y_train_fold)
        y_pred = fold_model.predict(X_val_fold)
        accuracy = accuracy_score(y_val_fold, y_pred)
        scores.append(accuracy)

        print(f"{model_name} - Fold {fold} Accuracy: {accuracy:.4f}")
        # Print class distribution
        print(f"Training set class distribution: {_class_distribution(y_train_fold)}")
        print(f"Validation set class distribution: {_class_distribution(y_val_fold)}\n")

    return scores

print("=============== Simple Train-Test Split ===============")
# Hold out 20% of the samples once, with a fixed seed for reproducibility
simple_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_simple, X_test_simple, y_train_simple, y_test_simple = simple_split

# Build both classifiers up front; each is then fitted and scored on the
# same hold-out split, logistic regression first.
lr_model = LogisticRegression(max_iter=10000)
dt_model = DecisionTreeClassifier(random_state=42)

lr_accuracy = evaluate_model(
    "Logistic Regression",
    lr_model,
    X_train_simple,
    X_test_simple,
    y_train_simple,
    y_test_simple,
)

dt_accuracy = evaluate_model(
    "Decision Tree",
    dt_model,
    X_train_simple,
    X_test_simple,
    y_train_simple,
    y_test_simple,
)

print("\n=============== Regular K-Fold Cross Validation ===============")
# Set aside 20% as a test set; cross-validation runs on the remaining part
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Regular (unstratified) K-fold for logistic regression
print("\nLogistic Regression Results:")
lr_kfold_scores = perform_kfold(
    "Logistic Regression",
    LogisticRegression(max_iter=10000),
    X_train,
    y_train,
    is_stratified=False,
)
lr_kfold_mean, lr_kfold_std = np.mean(lr_kfold_scores), np.std(lr_kfold_scores)
print(f"Average Accuracy: {lr_kfold_mean:.4f}")
print(f"Standard Deviation: {lr_kfold_std:.4f}")

# Regular (unstratified) K-fold for the decision tree
print("\nDecision Tree Results:")
dt_kfold_scores = perform_kfold(
    "Decision Tree",
    DecisionTreeClassifier(random_state=42),
    X_train,
    y_train,
    is_stratified=False,
)
dt_kfold_mean, dt_kfold_std = np.mean(dt_kfold_scores), np.std(dt_kfold_scores)
print(f"Average Accuracy: {dt_kfold_mean:.4f}")
print(f"Standard Deviation: {dt_kfold_std:.4f}")

print("\n=============== Stratified K-Fold Cross Validation ===============")
# Stratified K-fold for logistic regression, on the same training split
print("\nLogistic Regression Results:")
lr_stratified_scores = perform_kfold(
    "Logistic Regression",
    LogisticRegression(max_iter=10000),
    X_train,
    y_train,
    is_stratified=True,
)
lr_strat_mean, lr_strat_std = np.mean(lr_stratified_scores), np.std(lr_stratified_scores)
print(f"Average Accuracy: {lr_strat_mean:.4f}")
print(f"Standard Deviation: {lr_strat_std:.4f}")

# Stratified K-fold for the decision tree
print("\nDecision Tree Results:")
dt_stratified_scores = perform_kfold(
    "Decision Tree",
    DecisionTreeClassifier(random_state=42),
    X_train,
    y_train,
    is_stratified=True,
)
dt_strat_mean, dt_strat_std = np.mean(dt_stratified_scores), np.std(dt_stratified_scores)
print(f"Average Accuracy: {dt_strat_mean:.4f}")
print(f"Standard Deviation: {dt_strat_std:.4f}")

print("\n=============== Final Test Set Evaluation ===============")
# Final evaluation on test set for both models: each is refitted on the whole
# training split and scored on the held-out test split.
lr_final = LogisticRegression(max_iter=10000)
lr_final.fit(X_train, y_train)
# Predict once and reuse the result for both the accuracy and the report
# (previously predict() was called twice on the same data).
lr_test_pred = lr_final.predict(X_test)
lr_test_accuracy = accuracy_score(y_test, lr_test_pred)
print("\nLogistic Regression Final Test Set Accuracy:", lr_test_accuracy)
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_test_pred, target_names=iris.target_names))

dt_final = DecisionTreeClassifier(random_state=42)
dt_final.fit(X_train, y_train)
# Same single-prediction reuse for the decision tree.
dt_test_pred = dt_final.predict(X_test)
dt_test_accuracy = accuracy_score(y_test, dt_test_pred)
print("\nDecision Tree Final Test Set Accuracy:", dt_test_accuracy)
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, dt_test_pred, target_names=iris.target_names))

# Recap of the scores collected by each evaluation strategy above
print("\n=============== Summary Statistics ===============")

# Single hold-out split: one accuracy per model
print("\nSimple Train-Test Split:")
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Regular K-fold: mean and spread of the per-fold accuracies
lr_regular_mean, lr_regular_std = np.mean(lr_kfold_scores), np.std(lr_kfold_scores)
dt_regular_mean, dt_regular_std = np.mean(dt_kfold_scores), np.std(dt_kfold_scores)
print("\nRegular K-Fold Cross Validation:")
print(f"Logistic Regression - Mean: {lr_regular_mean:.4f}, Std: {lr_regular_std:.4f}")
print(f"Decision Tree - Mean: {dt_regular_mean:.4f}, Std: {dt_regular_std:.4f}")

# Stratified K-fold: mean and spread of the per-fold accuracies
lr_stratified_mean = np.mean(lr_stratified_scores)
lr_stratified_std = np.std(lr_stratified_scores)
dt_stratified_mean = np.mean(dt_stratified_scores)
dt_stratified_std = np.std(dt_stratified_scores)
print("\nStratified K-Fold Cross Validation:")
print(f"Logistic Regression - Mean: {lr_stratified_mean:.4f}, Std: {lr_stratified_std:.4f}")
print(f"Decision Tree - Mean: {dt_stratified_mean:.4f}, Std: {dt_stratified_std:.4f}")


Logistic Regression Results:
Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Decision Tree Results:
Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Logistic Regression Results:
Logistic Regression - Fold 1 Accuracy: 0.9583
Training set class distri