<a href="https://colab.research.google.com/github/Dibyajyoti-Pradhan/Capstone-Project-Imperial-College-London/blob/main/Data-Split-Strategy-Analysis/Solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Silence only deprecation-style library noise. A blanket
# warnings.filterwarnings('ignore') suppresses *every* warning, which would
# also hide solver convergence warnings from the models fitted below and make
# the reported accuracies look more trustworthy than they may be.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [46]:
# Load the default wine dataset: X holds the feature matrix, y the class labels.
data = load_wine()
X = data.data
y = data.target

print("Dataset Information:")
print(f"  Features: {X.shape[1]}")
print(f"  Samples: {X.shape[0]}")
# .tolist() converts the NumPy string scalars to plain Python str, so the list
# prints as ['class_0', ...] instead of [np.str_('class_0'), ...].
print(f"  Classes: {data.target_names.tolist()}")

Dataset Information:
  Features: 13
  Samples: 178
  Classes: [np.str_('class_0'), np.str_('class_1'), np.str_('class_2')]


In [47]:
print("=" * 60)
print("EXPERIMENT 1: 70:15:15 Split (Training:Validation:Test)")
print("=" * 60)

# Target shares of the FULL dataset for the two hold-out sets.
test_fraction_1 = 0.15
val_fraction_1 = 0.15

# Step 1: carve the test set off the full dataset (stratified on the labels).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=test_fraction_1, random_state=42, stratify=y
)

# Step 2: the validation set is drawn from the remaining 85%, so its share must
# be rescaled: 0.15 / (1 - 0.15) ~= 0.1765 of the remainder == 15% of the total.
# Deriving the value keeps it in sync with the fractions above instead of
# relying on a hand-rounded constant.
val_share_of_remainder_1 = val_fraction_1 / (1 - test_fraction_1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=val_share_of_remainder_1,
    random_state=42,
    stratify=y_train_val,
)

# The three subsets must partition the dataset exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Display split sizes
print("\nDataset Split Sizes:")
print(f"  Training samples:   {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"  Validation samples: {len(X_val)} ({len(X_val)/len(X)*100:.1f}%)")
print(f"  Test samples:       {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")
print(f"  Total samples:      {len(X)}")

EXPERIMENT 1: 70:15:15 Split (Training:Validation:Test)

Dataset Split Sizes:
  Training samples:   124 (69.7%)
  Validation samples: 27 (15.2%)
  Test samples:       27 (15.2%)
  Total samples:      178


In [48]:
# Train the logistic-regression classifier on the experiment-1 training split
# (fit() returns the estimator itself, so construction and fitting are chained).
model_1 = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict on both hold-out sets first, then score each against its labels.
val_preds_1 = model_1.predict(X_val)
test_preds_1 = model_1.predict(X_test)
val_accuracy_1 = accuracy_score(y_val, val_preds_1)
test_accuracy_1 = accuracy_score(y_test, test_preds_1)

print(f"Validation Accuracy: {val_accuracy_1:.4f}")
print(f"Test Accuracy: {test_accuracy_1:.4f}")

# Per-class precision / recall / F1 breakdown for the test predictions.
print("\nClassification Report on Test Set:")
test_report_1 = classification_report(
    y_test, test_preds_1, target_names=data.target_names
)
print(test_report_1)

Validation Accuracy: 1.0000
Test Accuracy: 0.9630

Classification Report on Test Set:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00         9
     class_1       0.92      1.00      0.96        11
     class_2       1.00      0.86      0.92         7

    accuracy                           0.96        27
   macro avg       0.97      0.95      0.96        27
weighted avg       0.97      0.96      0.96        27



In [49]:
banner_2 = "=" * 60
print(banner_2)
print("EXPERIMENT 2: 60:20:20 Split (Training:Validation:Test)")
print(banner_2)

# First cut: hold out 20% of the full dataset as the test set (stratified).
holdout_parts_2 = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train_val_2, X_test_2, y_train_val_2, y_test_2 = holdout_parts_2

# Second cut: a quarter of the remaining 80% becomes the validation set,
# i.e. 0.25 * 0.80 = 0.20 of the total; what is left is the training set.
train_val_parts_2 = train_test_split(
    X_train_val_2,
    y_train_val_2,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val_2,
)
X_train_2, X_val_2, y_train_2, y_val_2 = train_val_parts_2

# Report how many samples (and what share of the dataset) each subset received.
print("\nDataset Split Sizes:")
print(f"  Training samples:   {len(X_train_2)} ({len(X_train_2)/len(X)*100:.1f}%)")
print(f"  Validation samples: {len(X_val_2)} ({len(X_val_2)/len(X)*100:.1f}%)")
print(f"  Test samples:       {len(X_test_2)} ({len(X_test_2)/len(X)*100:.1f}%)")
print(f"  Total samples:      {len(X)}")

EXPERIMENT 2: 60:20:20 Split (Training:Validation:Test)

Dataset Split Sizes:
  Training samples:   106 (59.6%)
  Validation samples: 36 (20.2%)
  Test samples:       36 (20.2%)
  Total samples:      178


In [50]:
# Train the logistic-regression classifier on the experiment-2 training split
# (fit() returns the estimator itself, so construction and fitting are chained).
model_2 = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_2, y_train_2)

# Predict on both hold-out sets first, then score each against its labels.
val_preds_2 = model_2.predict(X_val_2)
test_preds_2 = model_2.predict(X_test_2)
val_accuracy_2 = accuracy_score(y_val_2, val_preds_2)
test_accuracy_2 = accuracy_score(y_test_2, test_preds_2)

print(f"Validation Accuracy: {val_accuracy_2:.4f}")
print(f"Test Accuracy: {test_accuracy_2:.4f}")

# Per-class precision / recall / F1 breakdown for the test predictions.
print("\nClassification Report on Test Set:")
test_report_2 = classification_report(
    y_test_2, test_preds_2, target_names=data.target_names
)
print(test_report_2)

Validation Accuracy: 0.9444
Test Accuracy: 0.9722

Classification Report on Test Set:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        12
     class_1       0.93      1.00      0.97        14
     class_2       1.00      0.90      0.95        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36



In [51]:
comparison_banner = "=" * 60
print(comparison_banner)
print("RESULTS COMPARISON")
print(comparison_banner)

# One (split label, validation accuracy, test accuracy) entry per experiment,
# with the accuracies pre-formatted to four decimals.
comparison_rows = [
    ("70:15:15", f"{val_accuracy_1:.4f}", f"{test_accuracy_1:.4f}"),
    ("60:20:20", f"{val_accuracy_2:.4f}", f"{test_accuracy_2:.4f}"),
]

# Header, separator, then one left-aligned fixed-width line per experiment.
print("\n{:<25} {:<20} {:<20}".format("Split Ratio", "Validation Acc", "Test Acc"))
print("-" * 65)
for comparison_row in comparison_rows:
    print("{:<25} {:<20} {:<20}".format(*comparison_row))

RESULTS COMPARISON

Split Ratio               Validation Acc       Test Acc            
-----------------------------------------------------------------
70:15:15                  1.0000               0.9630              
60:20:20                  0.9444               0.9722              
