Title: Cross-Validation


Task 1: K-Fold Cross-Validation for House Prices<br>
Apply K-Fold Cross-Validation (K=5) to check variability in performance.

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Fetch the California housing dataset and pull out the feature matrix and
# the regression target from the returned object.
data = fetch_california_housing()
X = data.data
y = data.target

# Five shuffled folds; the fixed seed keeps the fold assignment repeatable
# between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scaling and regression are bundled into a single estimator so that the
# whole pipeline is what gets cross-validated.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])

# One R^2 value per fold.
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='r2', cv=kf)

# Report the per-fold scores, their average, and their spread across folds.
print("Cross-Validation R^2 Scores:", scores)
print(f"Mean R^2 Score: {scores.mean():.4f}")
print(f"Standard Deviation: {scores.std():.4f}")


Cross-Validation R^2 Scores: [0.57578771 0.61374822 0.60856043 0.62126494 0.5875292 ]
Mean R^2 Score: 0.6014
Standard Deviation: 0.0170


Task 2: Stratified K-Fold for Imbalanced Churn Dataset<br>
Use Stratified K-Fold so that every fold preserves the class proportions of the imbalanced dataset.

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Simulate an imbalanced churn dataset.
# NOTE: the features are uniform noise drawn independently of the labels, so
# there is no signal for the model to learn. This cell demonstrates the
# cross-validation mechanics, not real predictive power.
np.random.seed(42)
X = np.random.rand(1000, 10)
y = np.random.choice([0, 1], size=1000, p=[0.85, 0.15])  # 85% no-churn, 15% churn

# Define pipeline: standardization + logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='liblinear'))  # liblinear works well on small datasets
])

# Define stratified K-Fold (5 splits); stratification keeps both classes
# present in every test fold, which ROC AUC needs in order to be defined.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy alone is misleading on an imbalanced target: a model that always
# predicts "no churn" already scores about 0.85. Collect imbalance-aware
# metrics in the same cross-validation run so the accuracy can be put in context.
cv_results = cross_validate(
    pipeline, X, y, cv=skf,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc'],
)

# Per-fold accuracies, kept under the original name `scores`.
scores = cv_results['test_accuracy']

# Accuracy obtained by always predicting the most frequent class.
majority_baseline = np.bincount(y).max() / len(y)

# Output results
print("Stratified K-Fold Accuracy Scores:", scores)
print(f"Mean Accuracy: {scores.mean():.4f}")
print(f"Standard Deviation: {scores.std():.4f}")
print(f"Majority-Class Baseline Accuracy: {majority_baseline:.4f}")
print(f"Mean Balanced Accuracy: {cv_results['test_balanced_accuracy'].mean():.4f}")
print(f"Mean ROC AUC: {cv_results['test_roc_auc'].mean():.4f}")


Stratified K-Fold Accuracy Scores: [0.84  0.84  0.84  0.84  0.835]
Mean Accuracy: 0.8390
Standard Deviation: 0.0020


Task 3: Leave-One-Out Cross-Validation for Iris<br>
Use Leave-One-Out Cross-Validation (LOOCV) to assess the model's prediction accuracy on the Iris dataset.

In [3]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Pull the feature matrix and the class labels out of the Iris dataset object.
iris = load_iris()
X = iris.data
y = iris.target

# Leave-One-Out splitter: each sample is held out once as the test set.
loo = LeaveOneOut()

# Standardization followed by logistic regression, evaluated as one estimator.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=200)),
])

# One accuracy value per held-out sample.
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='accuracy', cv=loo)

# Show a short preview of the per-sample results, then the overall summary.
print("LOOCV Accuracy Scores (first 10 shown):", scores[:10])
print(f"Total samples: {len(scores)}")
print(f"Mean Accuracy: {scores.mean():.4f}")


LOOCV Accuracy Scores (first 10 shown): [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Total samples: 150
Mean Accuracy: 0.9533
