<a href="https://colab.research.google.com/github/AnaghaSreenath/ML-lab/blob/main/2_way%2C3way_split_%2Ck_flod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)


In [None]:
# Dataset parameters
n_samples = 500
n_features = 10
n_bins = 3

# Seeded generator so that Restart & Run All reproduces the same synthetic
# data (and therefore the same metrics) on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Generate input features: uniform samples in [0, 1), shape (n_samples, n_features)
X = rng.random((n_samples, n_features))

# True coefficients of the underlying linear relationship (standard normal draws)
true_coefficients = rng.standard_normal(n_features)

# Continuous target (regression): linear signal plus Gaussian noise (std 0.1)
y_continuous = X @ true_coefficients + rng.normal(0, 0.1, size=n_samples)

# Convert continuous target to classes: n_bins equal-width bins, encoded 0..n_bins-1.
# The fitted discretizer is reused later to map predictions onto the same bins.
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
y_classes = discretizer.fit_transform(y_continuous.reshape(-1, 1)).astype(int).ravel()

# Add polynomial features (degree 2, no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Sanity checks: one target / class label / expanded feature row per sample
assert y_continuous.shape == (n_samples,)
assert y_classes.shape == (n_samples,)
assert X_poly.shape[0] == n_samples


In [None]:
# 2-way split: 80% Train, 20% Test.
# Features and both target encodings are split together so rows stay aligned.
(
    X_train, X_test,
    y_train_cont, y_test_cont,
    y_train_cls, y_test_cls,
) = train_test_split(
    X_poly, y_continuous, y_classes, test_size=0.2, random_state=42
)

# Fit the regressor on the continuous target only
model = LinearRegression()
model.fit(X_train, y_train_cont)

# Predictions: continuous values first, then mapped onto the bins that
# `discretizer` learned, giving class predictions from the same model
y_pred_cont = model.predict(X_test)
pred_column = y_pred_cont.reshape(-1, 1)
y_pred_cls = discretizer.transform(pred_column).astype(int).ravel()

# Regression metrics (MSE computed once and reused for RMSE)
mse = mean_squared_error(y_test_cont, y_pred_cont)
print("=== TWO-WAY SPLIT: REGRESSION ===")
for label, value in (
    ("MAE:", mean_absolute_error(y_test_cont, y_pred_cont)),
    ("MSE:", mse),
    ("RMSE:", np.sqrt(mse)),
    ("R2:", r2_score(y_test_cont, y_pred_cont)),
):
    print(label, value)

# Classification metrics on the binned targets vs. binned predictions
print("\n=== TWO-WAY SPLIT: CLASSIFICATION ===")
print("Confusion Matrix:\n", confusion_matrix(y_test_cls, y_pred_cls))
print("Accuracy:", accuracy_score(y_test_cls, y_pred_cls))
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    # Weighted average: per-class scores weighted by class support
    print(label, scorer(y_test_cls, y_pred_cls, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test_cls, y_pred_cls))


=== TWO-WAY SPLIT: REGRESSION ===
MAE: 0.078344925685216
MSE: 0.009327696433926837
RMSE: 0.09658000017564111
R2: 0.9888528927468484

=== TWO-WAY SPLIT: CLASSIFICATION ===
Confusion Matrix:
 [[13  0  0]
 [ 1 57  1]
 [ 0  1 27]]
Accuracy: 0.97
Precision: 0.9705418719211822
Recall: 0.97
F1 Score: 0.97005698005698

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       0.98      0.97      0.97        59
           2       0.96      0.96      0.96        28

    accuracy                           0.97       100
   macro avg       0.96      0.98      0.97       100
weighted avg       0.97      0.97      0.97       100



In [None]:
# 3-way split: 60% Train, 20% Validation, 20% Test.
# Step 1: Train (60%) and Temp (40%)
X_train, X_temp, y_train_cont, y_temp_cont, y_train_cls, y_temp_cls = train_test_split(
    X_poly, y_continuous, y_classes, test_size=0.4, random_state=42
)

# Step 2: Validation (20%) and Test (20%) -- the temp portion is halved
X_val, X_test, y_val_cont, y_test_cont, y_val_cls, y_test_cls = train_test_split(
    X_temp, y_temp_cont, y_temp_cls, test_size=0.5, random_state=42
)

print("Train size:", X_train.shape[0])
print("Validation size:", X_val.shape[0])
print("Test size:", X_test.shape[0])

# Fit on the training portion only
model = LinearRegression()
model.fit(X_train, y_train_cont)

# Validation check: score the fitted model on the validation split so the
# held-out validation data is actually used. This is the split to consult
# when comparing or tuning models; the test split below is reported last.
y_val_pred_cont = model.predict(X_val)
y_val_pred_cls = discretizer.transform(y_val_pred_cont.reshape(-1, 1)).astype(int).ravel()

print("\n=== THREE-WAY SPLIT: VALIDATION ===")
print("MAE:", mean_absolute_error(y_val_cont, y_val_pred_cont))
print("R2:", r2_score(y_val_cont, y_val_pred_cont))
print("Accuracy:", accuracy_score(y_val_cls, y_val_pred_cls))
print("F1 Score:", f1_score(y_val_cls, y_val_pred_cls, average='weighted'))

# Predictions on the test split: continuous values, then mapped onto the
# bins learned by `discretizer` to obtain class predictions
y_pred_cont = model.predict(X_test)
y_pred_cls = discretizer.transform(y_pred_cont.reshape(-1, 1)).astype(int).ravel()

# Regression metrics
print("\n=== THREE-WAY SPLIT: REGRESSION ===")
print("MAE:", mean_absolute_error(y_test_cont, y_pred_cont))
print("MSE:", mean_squared_error(y_test_cont, y_pred_cont))
print("RMSE:", np.sqrt(mean_squared_error(y_test_cont, y_pred_cont)))
print("R2:", r2_score(y_test_cont, y_pred_cont))

# Classification metrics
print("\n=== THREE-WAY SPLIT: CLASSIFICATION ===")
print("Confusion Matrix:\n", confusion_matrix(y_test_cls, y_pred_cls))
print("Accuracy:", accuracy_score(y_test_cls, y_pred_cls))
print("Precision:", precision_score(y_test_cls, y_pred_cls, average='weighted'))
print("Recall:", recall_score(y_test_cls, y_pred_cls, average='weighted'))
print("F1 Score:", f1_score(y_test_cls, y_pred_cls, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test_cls, y_pred_cls))


Train size: 300
Validation size: 100
Test size: 100

=== THREE-WAY SPLIT: REGRESSION ===
MAE: 0.0929070354539247
MSE: 0.01360284244435707
RMSE: 0.11663122413983774
R2: 0.9799587810885267

=== THREE-WAY SPLIT: CLASSIFICATION ===
Confusion Matrix:
 [[ 9  3  0]
 [ 2 61  1]
 [ 0  3 21]]
Accuracy: 0.91
Precision: 0.9099592944369063
Recall: 0.91
F1 Score: 0.9090740126120147

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.75      0.78        12
           1       0.91      0.95      0.93        64
           2       0.95      0.88      0.91        24

    accuracy                           0.91       100
   macro avg       0.89      0.86      0.88       100
weighted avg       0.91      0.91      0.91       100



In [None]:
# 5-fold cross-validation with shuffled, seeded fold assignment
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores; each list gains one entry per fold
r2_scores = []
mae_scores = []
accuracy_scores = []
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_poly), start=1):
    # Slice the features and both target encodings with the same row indices
    X_train = X_poly[train_idx]
    y_train_cont = y_continuous[train_idx]
    y_train_cls = y_classes[train_idx]

    X_val = X_poly[val_idx]
    y_val_cont = y_continuous[val_idx]
    y_val_cls = y_classes[val_idx]

    # A fresh model per fold, fitted on that fold's training rows
    model = LinearRegression().fit(X_train, y_train_cont)

    # Continuous predictions, then mapped onto the bins learned by `discretizer`
    y_pred_cont = model.predict(X_val)
    pred_column = y_pred_cont.reshape(-1, 1)
    y_pred_cls = discretizer.transform(pred_column).astype(int).ravel()

    # Regression scores on the continuous target
    r2_scores.append(r2_score(y_val_cont, y_pred_cont))
    mae_scores.append(mean_absolute_error(y_val_cont, y_pred_cont))
    # Classification scores on the binned target
    accuracy_scores.append(accuracy_score(y_val_cls, y_pred_cls))
    f1_scores.append(f1_score(y_val_cls, y_pred_cls, average='weighted'))

    print(f"Fold {fold}: R2={r2_scores[-1]:.4f}, Accuracy={accuracy_scores[-1]:.4f}")

# Mean of each metric across the folds
print("\n=== K-FOLD AVERAGE METRICS ===")
for label, scores in (
    ("Average R2:", r2_scores),
    ("Average MAE:", mae_scores),
    ("Average Accuracy:", accuracy_scores),
    ("Average F1 Score:", f1_scores),
):
    print(label, np.mean(scores))


Fold 1: R2=0.9889, Accuracy=0.9700
Fold 2: R2=0.9781, Accuracy=0.9200
Fold 3: R2=0.9859, Accuracy=0.9400
Fold 4: R2=0.9825, Accuracy=0.9400
Fold 5: R2=0.9887, Accuracy=0.9500

=== K-FOLD AVERAGE METRICS ===
Average R2: 0.984826971554357
Average MAE: 0.0900900063399755
Average Accuracy: 0.944
Average F1 Score: 0.9436793425753477
