In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Sample dataset
np.random.seed(42)
data = pd.DataFrame({
    "ID": np.arange(1, 101),  # 100 individuals
    "Age": np.random.randint(18, 60, 100),
    "Income": np.random.randint(20000, 100000, 100),
    "Category": np.random.choice(["A", "B", "C"], 100)  # Categorical stratification
})

### 1. Stratified Sampling
stratified_sample = data.groupby("Category", group_keys=False).apply(lambda x: x.sample(frac=0.3))  # 30% per group

### 2. Simple Random Sampling
simple_random_sample = data.sample(n=30, random_state=42)  # Select 30 random samples

### 3. Systematic Sampling (Every 3rd individual)
systematic_sample = data.iloc[::3]  # Choose every 3rd row

### 4. Cluster Sampling (Assume ID groups as clusters)
num_clusters = 5  # Define number of clusters
data["Cluster"] = np.random.randint(1, num_clusters + 1, len(data))
random_clusters = np.random.choice(data["Cluster"].unique(), size=2, replace=False)  # Select 2 clusters randomly
cluster_sample = data[data["Cluster"].isin(random_clusters)]

### 5. Convenience Sampling (First 10 rows)
convenience_sample = data.head(10)

# Display Samples
print("Stratified Sample:\n", stratified_sample.head())
print("\nSimple Random Sample:\n", simple_random_sample.head())
print("\nSystematic Sample:\n", systematic_sample.head())
print("\nCluster Sample:\n", cluster_sample.head())
print("\nConvenience Sample:\n", convenience_sample.head())


Stratified Sample:
     ID  Age  Income Category
34  35   38   22693        A
60  61   43   93530        A
80  81   18   83734        A
98  99   45   92082        A
9   10   28   44538        A

Simple Random Sample:
     ID  Age  Income Category
83  84   26   32688        C
53  54   21   78053        A
70  71   57   43664        A
45  46   45   64262        B
44  45   37   41976        A

Systematic Sample:
     ID  Age  Income Category
0    1   56   28392        C
3    4   25   72256        A
6    7   36   99575        B
9   10   28   44538        A
12  13   57   99309        C

Cluster Sample:
    ID  Age  Income Category  Cluster
1   2   46   50535        C        5
2   3   32   98603        C        5
3   4   25   72256        A        5
5   6   56   97373        C        4
8   9   28   30965        C        5

Convenience Sample:
    ID  Age  Income Category  Cluster
0   1   56   28392        C        1
1   2   46   50535        C        5
2   3   32   98603        C        5
3  

  stratified_sample = data.groupby("Category", group_keys=False).apply(lambda x: x.sample(frac=0.3))  # 30% per group
