[Reference](https://medium.datadriveninvestor.com/why-your-machine-learning-model-fails-in-production-a-deep-dive-into-sampling-design-698874ed3dfd)

# 1. Random Sampling

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the bundled Iris dataset and pull out the feature matrix and labels
iris = load_iris()
X, y = iris.data, iris.target

# Simple random hold-out: 20% of the rows become the test set, the remaining
# 80% the training set. random_state pins the shuffle so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows (and columns) landed in each partition
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (120, 4)
Test shape: (30, 4)


# 2. Stratified Sampling

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

import numpy as np

# Reload Iris so this cell does not depend on state left by earlier cells
iris = load_iris()
X, y = iris.data, iris.target

# Stratified hold-out: passing the labels as `stratify` asks the splitter to
# keep the class proportions of `y` in both the train and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Count the rows per class label in each partition to verify the balance
train_counts = np.bincount(y_train)
test_counts = np.bincount(y_test)
print(f"Train class distribution: {train_counts}")
print(f"Test class distribution: {test_counts}")

Train class distribution: [40 40 40]
Test class distribution: [10 10 10]


# 3. Systematic Sampling

In [3]:
import pandas as pd
import numpy as np

# Sample dataset: 100 rows with a synthetic binary label.
# The labels are drawn from a dedicated, seeded generator so the frame is
# identical on every Restart & Run All; the global NumPy RNG is not touched.
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame({'feature': range(1, 101), 'label': rng.choice([0, 1], size=100)})

# Step 1: Shuffle the data (optional but recommended unless already randomized).
# Shuffling guards against any periodic pattern in the row order lining up
# with the sampling interval below.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Step 2: Define sampling interval
k = 5  # every 5th record goes to test set

# Step 3: Index-based split.
# Both lists hold row *positions* in ascending order. Building the training
# list with a comprehension (rather than a set difference) keeps that order
# guaranteed instead of relying on set iteration order.
test_indices = list(range(0, len(df), k))
train_indices = [pos for pos in range(len(df)) if pos % k != 0]

# Step 4: Create train and test sets.
# .iloc selects by position, which is what the index lists contain.
train_df = df.iloc[train_indices].reset_index(drop=True)
test_df = df.iloc[test_indices].reset_index(drop=True)
print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 80
Test size: 20


# 4. Time-based (Temporal) Sampling

In [4]:
import pandas as pd

# Build a toy daily series: 100 consecutive dates starting 2022-01-01, a
# running counter as the feature, and a target that is 1 for the first 50
# rows and 0 afterwards.
n_days = 100
df = pd.DataFrame({
    'date': pd.date_range(start='2022-01-01', periods=n_days),
    'feature': range(n_days),
    'target': [int(day < 50) for day in range(n_days)],
})

# Put the rows in chronological order before splitting
df = df.sort_values('date')

# Temporal hold-out: everything strictly before 2022-03-01 is training data,
# everything on or after that date is test data. One mask drives both halves.
is_train = df['date'] < '2022-03-01'
train = df[is_train]
test = df[~is_train]

print("Train size:", train.shape)
print("Test size:", test.shape)

Train size: (59, 3)
Test size: (41, 3)


# 5. Cluster and Multi-stage Sampling

In [5]:
import pandas as pd
import numpy as np

# Seed the global NumPy RNG so every random draw in this cell is repeatable
np.random.seed(42)

# Simulated population: each row is assigned to one of four regions, which
# play the role of clusters. The three columns are drawn in this exact order
# so the random stream is consumed the same way on every run.
n_rows = 1000
region_names = ['North', 'South', 'East', 'West']
df = pd.DataFrame({
    'region': np.random.choice(region_names, size=n_rows),
    'feature': np.random.randn(n_rows),
    'target': np.random.randint(0, 2, size=n_rows),
})

# Cluster sampling: pick 2 distinct regions at random, then keep every row
# that belongs to one of the picked regions.
clusters = df['region'].unique()
selected_clusters = np.random.choice(clusters, size=2, replace=False)
in_selected = df['region'].isin(selected_clusters)
sampled_df = df[in_selected]

print(f"Sampled from clusters: {selected_clusters}")
print(f"Sample size: {sampled_df.shape}")

Sampled from clusters: ['West' 'North']
Sample size: (538, 3)
