In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# --- 1. Load Data ---
# Read the drug dataset from a CSV file (path is relative to the working directory)
filename = 'drug_200.csv'
df = pd.read_csv(filename)
# Split the frame into the target column (y) and the remaining feature columns (X)
y = df['Drug']
X = df.drop(columns='Drug')

# Map the drug names to integer class labels; `le` is kept so the
# labels can be mapped back to names later if needed
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- 2. Data Preprocessing Pipeline ---
# Columns are routed to a transformer according to their type
numerical_features = ['Age', 'Na_to_K']
categorical_features = ['Sex', 'BP', 'Cholesterol']

# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps a category that was absent from a training
# fold from raising an error when a validation fold is transformed.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

# Any column not listed above is passed through untouched
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# --- 3. 5-Fold Cross-Validation Setup ---
# Stratified folds keep the class proportions of the target roughly equal
# across folds; shuffling with a fixed seed makes the split reproducible
N_SPLITS = 5
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Function to run cross-validation and print results
def run_cross_validation(model_pipeline, model_name):
    """Cross-validate ``model_pipeline`` and print an accuracy report.

    The estimator is scored with ``cross_val_score`` on the module-level
    ``X`` / ``y_encoded`` data, using the module-level ``cv`` splitter and
    accuracy as the metric.

    Args:
        model_pipeline: Estimator (here a ``Pipeline``) passed to
            ``cross_val_score``.
        model_name: Label printed in the report header.

    Returns:
        A ``(mean_accuracy, std_accuracy)`` tuple computed over the
        per-fold accuracy scores.
    """
    scores = cross_val_score(
        estimator=model_pipeline,
        X=X,
        y=y_encoded,
        scoring='accuracy',
        cv=cv,
    )
    mean_accuracy, std_accuracy = scores.mean(), scores.std()

    # The "+/-" figure in the report is two standard deviations of the fold scores
    report_lines = [
        f"\n--- {model_name} ---",
        f"Individual Fold Accuracies: {scores}",
        f"Mean Cross-Validation Accuracy ({N_SPLITS}-Fold): {mean_accuracy:.4f} (+/- {std_accuracy*2:.4f})",
    ]
    print("\n".join(report_lines))
    return mean_accuracy, std_accuracy

# ==============================================================================
# 1. Logistic Regression Classifiers
# ==============================================================================

print("--- Part 1: Logistic Regression Models with Regularization Comparison ---")

# C is the inverse of the regularization strength, so a small C means strong
# regularization
C_VAL = 0.1 # Example regularization strength


def _make_lr_pipeline(**classifier_kwargs):
    """Build a preprocessing + LogisticRegression pipeline.

    All models in this comparison share the same preprocessor, random seed and
    iteration budget; only the penalty-related settings differ.

    The deprecated ``multi_class`` argument is deliberately not passed: with
    the 'lbfgs' and 'saga' solvers used here, scikit-learn already fits a
    multinomial model on a multiclass target by default.
    NOTE(review): this assumes the 'Drug' target has more than two classes.

    Args:
        **classifier_kwargs: Keyword arguments forwarded to
            ``LogisticRegression`` (e.g. ``penalty``, ``C``, ``solver``,
            ``l1_ratio``).

    Returns:
        An unfitted ``Pipeline`` with 'preprocessor' and 'classifier' steps.
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(
            random_state=42,
            max_iter=1000,
            **classifier_kwargs
        ))
    ])


# --- Model 1.1: No Regularization ---
# penalty=None disables regularization entirely (C is not used)
# 'lbfgs' is a good solver for multiclass problems
lr_none_pipeline = _make_lr_pipeline(penalty=None, solver='lbfgs')
run_cross_validation(lr_none_pipeline, "Logistic Regression (No Regularization)")

# --- Model 1.2: L2 Regularization (Ridge) ---
# penalty='l2' is the default and corresponds to Ridge; 'lbfgs' supports L2
lr_ridge_pipeline = _make_lr_pipeline(penalty='l2', C=C_VAL, solver='lbfgs')
run_cross_validation(lr_ridge_pipeline, f"Logistic Regression (Ridge / L2, C={C_VAL})")

# --- Model 1.3: L1 Regularization (Lasso) ---
# Only the 'liblinear' and 'saga' solvers support the L1 penalty; 'saga' is
# used because it also handles the multinomial loss
lr_lasso_pipeline = _make_lr_pipeline(penalty='l1', C=C_VAL, solver='saga')
run_cross_validation(lr_lasso_pipeline, f"Logistic Regression (Lasso / L1, C={C_VAL})")

# --- Model 1.4: Elastic Net Regularization ---
# 'saga' solver is required for Elastic Net
# l1_ratio is the mixing parameter: 0 for L2, 1 for L1, 0 < ratio < 1 for Elastic Net
L1_RATIO = 0.5 # Equal mix of L1 and L2
lr_elastic_net_pipeline = _make_lr_pipeline(
    penalty='elasticnet',
    C=C_VAL,
    l1_ratio=L1_RATIO,
    solver='saga',
)
run_cross_validation(lr_elastic_net_pipeline, f"Logistic Regression (Elastic Net, C={C_VAL}, l1_ratio={L1_RATIO})")

print("\n" + "="*80)

# ==============================================================================
# 2. K-Nearest Neighbor Classifiers
# ==============================================================================

print("--- Part 2: K-Nearest Neighbor (KNN) Comparison ---")

K_VALUES = [1, 3, 5]
knn_results = {}  # maps K -> mean cross-validation accuracy

for k in K_VALUES:
    # KNN is distance-based, so the numeric features are standardized by the
    # shared preprocessor before they reach the classifier
    knn_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=k)),
    ])

    # Only the mean accuracy is kept for the summary; the std is discarded
    knn_results[k], _ = run_cross_validation(knn_pipeline, f"K-Nearest Neighbor (K={k})")

print("\n--- KNN Performance Summary ---")
for k, acc in knn_results.items():
    print(f"KNN (K={k}) Mean CV Accuracy: {acc:.4f}")

# Pick the K with the highest mean accuracy (the first such K wins on ties)
best_k, best_acc = max(knn_results.items(), key=lambda item: item[1])
print(f"\nBest performing KNN model is K={best_k} with accuracy: {best_acc:.4f}")
print("="*80)

--- Part 1: Logistic Regression Models with Regularization Comparison ---





--- Logistic Regression (No Regularization) ---
Individual Fold Accuracies: [0.975 1.    0.925 0.925 0.9  ]
Mean Cross-Validation Accuracy (5-Fold): 0.9450 (+/- 0.0735)

--- Logistic Regression (Ridge / L2, C=0.1) ---
Individual Fold Accuracies: [0.875 0.85  0.825 0.825 0.775]
Mean Cross-Validation Accuracy (5-Fold): 0.8300 (+/- 0.0663)





--- Logistic Regression (Lasso / L1, C=0.1) ---
Individual Fold Accuracies: [0.75  0.7   0.725 0.7   0.725]
Mean Cross-Validation Accuracy (5-Fold): 0.7200 (+/- 0.0374)

--- Logistic Regression (Elastic Net, C=0.1, l1_ratio=0.5) ---
Individual Fold Accuracies: [0.85  0.8   0.775 0.775 0.75 ]
Mean Cross-Validation Accuracy (5-Fold): 0.7900 (+/- 0.0678)

--- Part 2: K-Nearest Neighbor (KNN) Comparison ---





--- K-Nearest Neighbor (K=1) ---
Individual Fold Accuracies: [0.95  0.85  0.875 0.925 0.85 ]
Mean Cross-Validation Accuracy (5-Fold): 0.8900 (+/- 0.0812)

--- K-Nearest Neighbor (K=3) ---
Individual Fold Accuracies: [0.95  0.8   0.8   0.875 0.85 ]
Mean Cross-Validation Accuracy (5-Fold): 0.8550 (+/- 0.1114)

--- K-Nearest Neighbor (K=5) ---
Individual Fold Accuracies: [0.925 0.8   0.775 0.8   0.775]
Mean Cross-Validation Accuracy (5-Fold): 0.8150 (+/- 0.1122)

--- KNN Performance Summary ---
KNN (K=1) Mean CV Accuracy: 0.8900
KNN (K=3) Mean CV Accuracy: 0.8550
KNN (K=5) Mean CV Accuracy: 0.8150

Best performing KNN model is K=1 with accuracy: 0.8900
