# Train/Test Splits

In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the iris dataset and wrap it in labelled pandas containers:
# X holds the feature columns, y holds the integer class labels.
iris = load_iris()
X = pd.DataFrame(data=iris["data"], columns=iris["feature_names"])
y = pd.Series(data=iris["target"], name='target')

# Quick summary: (rows, columns) of the feature table and samples per class.
print("Dataset Shape:", X.shape)
print("Class Distribution:\n", y.value_counts())


Dataset Shape: (150, 4)
Class Distribution:
 target
0    50
1    50
2    50
Name: count, dtype: int64


## Stratified Splitting

In [3]:
# Hold out 20% of the rows as a test set.
# stratify=y asks for the class proportions of y to be preserved in both
# partitions; the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Partition sizes as (rows, columns).
print("\nTraining set size:", X_train.shape)
print("Test set size:", X_test.shape)

# Relative class frequencies per partition (normalize=True -> proportions).
print("Training class distribution:\n", y_train.value_counts(normalize=True))
print("Test class distribution:\n", y_test.value_counts(normalize=True))


Training set size: (120, 4)
Test set size: (30, 4)
Training class distribution:
 target
0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64
Test class distribution:
 target
0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64


## Model Evaluation Consistency

In [4]:
# Train a seeded random forest on the training partition only.
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows against their true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\nModel Accuracy on Test Set:", round(accuracy, 4))


Model Accuracy on Test Set: 0.9


## Avoiding Data Leakage

In [None]:
# Learn the scaling statistics from the training rows ONLY, then apply that
# same fitted transform to both partitions, so the scaler is never fitted on
# the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same seeded classifier as before, now trained on the scaled features.
model_scaled = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_scaled = model_scaled.predict(X_test_scaled)

print(
    "Accuracy after proper scaling:",
    round(accuracy_score(y_test, y_pred_scaled), 4),
)

Accuracy after proper scaling: 0.9
