# Random Sampling

The simplest data sampling technique that creates a random sample from the original population is Random Sampling. In this approach, every sampled observation has the same probability of getting selected during the sample generation process. Random Sampling is usually used when we don’t have any kind of prior information about the target population.

In [3]:
import numpy as np

# generating population data 
N = 10000
mu = 10
std = 2
population_df = np.random.normal(mu,std,N)


def random_sampling(df, n):
    random_sample = np.random.choice(df,replace = False, size = n)
    return(random_sample)
randomSample = random_sampling(population_df, N)


In [4]:
randomSample

array([ 8.59126108,  5.15208278, 10.17636753, ...,  8.83574653,
        8.51991019, 10.76697452])

# Systematic Sampling

Systematic sampling is defined as a probability sampling approach where the elements from a target population are selected from a random starting point and after a fixed sampling interval.

In [5]:
import numpy as np
import pandas as pd
# generating population data following Normal Distribution
N = 10000
mu = 10
std = 2
population_df = np.random.normal(mu,std,N)


def systematic_sampling(df, step):
    id = pd.Series(np.arange(1,len(df),1))
    df = pd.Series(df)
    df_pd = pd.concat([id, df], axis = 1)
    df_pd.columns = ["id", "data"]
    
    selected_index = np.arange(1,len(df),step)
    
    systematic_sampling = df_pd.iloc[selected_index]
    return(systematic_sampling)



In [6]:
n = 10
step = int(N/n)
sample = systematic_sampling(population_df, step)
sample

Unnamed: 0,id,data
1,2.0,10.234306
1001,1002.0,9.397715
2001,2002.0,6.201776
3001,3002.0,9.391794
4001,4002.0,8.097226
5001,5002.0,9.537761
6001,6002.0,8.958952
7001,7002.0,14.340817
8001,8002.0,11.602476
9001,9002.0,7.982612


# Cluster Sampling

Cluster sampling is a probability sampling technique where we divide the population into multiple clusters(groups) based on certain clustering criteria. Then we select a random cluster(s) with simple random or systematic sampling techniques. So, in cluster sampling, the entire population is divided into clusters or segments and then cluster(s) are randomly selected.

In [7]:
import numpy as np
import pandas as pd

# Generating Population data 
price_vb = pd.Series(np.random.uniform(1,4,size = N))
id = pd.Series(np.arange(0,len(price_vb),1))
event_type = pd.Series(np.random.choice(["type1","type2","type3"],size = len(price_vb)))
click = pd.Series(np.random.choice([0,1],size = len(price_vb)))
df = pd.concat([id,price_vb,event_type, click],axis = 1)
df.columns = ["id","price","event_type", "click"]
df

Unnamed: 0,id,price,event_type,click
0,0,3.880985,type2,1
1,1,2.462943,type2,1
2,2,1.201094,type1,1
3,3,2.582788,type3,1
4,4,3.405966,type3,1
...,...,...,...,...
9995,9995,2.553690,type3,0
9996,9996,1.484825,type3,0
9997,9997,3.115476,type2,0
9998,9998,2.737626,type2,0


In [8]:
def get_clustered_Sample(df, n_per_cluster, num_select_clusters):
    N = len(df)
    K = int(N/n_per_cluster)
    data = None
    for k in range(K):
        sample_k = df.sample(n_per_cluster)
        sample_k["cluster"] = np.repeat(k,len(sample_k))
        df = df.drop(index = sample_k.index)
        data = pd.concat([data,sample_k],axis = 0)

    random_chosen_clusters = np.random.randint(0,K,size = num_select_clusters)
    samples = data[data.cluster.isin(random_chosen_clusters)]
    return(samples)

sample = get_clustered_Sample(df = df, n_per_cluster = 100, num_select_clusters = 2)
sample

Unnamed: 0,id,price,event_type,click,cluster
541,541,2.446055,type1,0,32
5135,5135,1.873098,type2,1,32
3032,3032,2.024762,type2,0,32
1805,1805,1.908144,type1,1,32
3020,3020,1.160381,type3,1,32
...,...,...,...,...,...
1553,1553,2.991410,type1,1,91
3576,3576,2.879654,type2,1,91
2615,2615,3.386933,type1,1,91
1041,1041,1.570456,type1,1,91


# Stratified Sampling

Stratified Sampling is a data sampling approach, where we divide a population into homogeneous subpopulations called strata based on specific characteristics (e.g., age, race, gender identity, location, event type etc.).

In [9]:
def get_startified_sample(df,n,num_clusters_needed):
    N = len(df)
    num_obs_per_cluster = int(N/n)
    K = int(N/num_obs_per_cluster)

    def get_weighted_sample(df,num_obs_per_cluster):
        def get_sample_per_class(x):
            n_x = int(np.rint(num_obs_per_cluster*len(x[x.click !=0])/len(df[df.click !=0])))
            sample_x = x.sample(n_x)
            return(sample_x)
        weighted_sample = df.groupby("event_type").apply(get_sample_per_class)
        return(weighted_sample)

    stratas = None
    for k in range(K):
        weighted_sample_k = get_weighted_sample(df,num_obs_per_cluster).reset_index(drop = True)
        weighted_sample_k["cluster"] = np.repeat(k,len(weighted_sample_k))
        stratas = pd.concat([stratas, weighted_sample_k],axis = 0)
        df.drop(index = weighted_sample_k.index)
    selected_strata_clusters = np.random.randint(0,K,size = num_clusters_needed)
    stratified_samples = stratas[stratas.cluster.isin(selected_strata_clusters)]
    return(stratified_samples)

sample = get_startified_sample(df = df,n = 100,num_clusters_needed = 2)
sample

Unnamed: 0,id,price,event_type,click,cluster
0,3627,2.848576,type1,0,19
1,8318,2.778787,type1,1,19
2,3732,1.548856,type1,1,19
3,3516,3.021503,type1,0,19
4,7643,3.309755,type1,1,19
...,...,...,...,...,...
94,8108,3.408934,type3,0,95
95,7029,3.594101,type3,0,95
96,3675,2.115939,type3,1,95
97,4754,3.735683,type3,1,95
