In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the full student roster (the "population") from a CSV located via a
# relative path, then preview its first rows.
population_data = pd.read_csv("Fakeschool_1.csv")
population_data.head()

Unnamed: 0,Students,Sex,class,GPA,Honors
0,Alice,F,Fr,3.8,Yes
1,Brad,M,Fr,2.6,Yes
2,Caleb,M,Fr,2.25,No
3,Daisy,F,Fr,2.1,No
4,Faye,F,Fr,2.0,No


In [3]:
# (number of rows, number of columns) of the population frame.
population_data.shape

(28, 5)

In [4]:
# Count rows that are exact duplicates of an earlier row.
population_data.duplicated().sum()

0

In [5]:
# Count missing values in each column.
population_data.isnull().sum()

Students    0
Sex         0
class       0
GPA         0
Honors      0
dtype: int64

In [18]:
# Mean GPA over the whole population: the reference value that the sample
# means computed in the sections below are estimating.
population_data['GPA'].mean()

2.766428571428572

## Simple Random Sampling

In [6]:
# Simple random sample of 10 rows. No random_state is passed, so a different
# set of rows is drawn every time this cell runs.
sample1 = population_data.sample(10)

In [7]:
# Show the sampled rows as plain text.
print(sample1)

    Students Sex class   GPA Honors
9      Chris   M    So  4.00    Yes
4       Faye   F    Fr  2.00     No
2      Caleb   M    Fr  2.25     No
10     Dylan   M    So  3.50    Yes
1       Brad   M    Fr  2.60    Yes
13   Gabriel   M    So  1.98     No
12      Eric   M    So  2.10     No
3      Daisy   F    Fr  2.10     No
15  Brittany   F    Jr  3.90     No
19    Eliott   M    Jr  1.90     No


In [8]:
# Mean GPA of the simple random sample.
sample1['GPA'].mean()

2.633

## Systematic Random Sampling

In [19]:
def sys_sampling(data, step, start=0):
    """Select every ``step``-th row of ``data`` by position (systematic sampling).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from; rows are picked by position via ``.iloc``.
    step : int
        Sampling interval between consecutive selected rows. Must be >= 1.
    start : int, default 0
        Position of the first selected row. The default of 0 keeps the
        historical behaviour. For a systematic *random* sample, pass a random
        start inside the first interval, e.g. ``random.randrange(step)``.

    Returns
    -------
    pandas.DataFrame
        Rows at positions ``start, start + step, start + 2 * step, ...`` that
        are below ``len(data)``. Empty when ``start >= len(data)``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 or ``start`` is negative.
    """
    # Fail early with a clear message instead of np.arange's ZeroDivisionError
    # (step == 0) or a silently empty sample (step < 0).
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    if start < 0:
        raise ValueError(f"start must be non-negative, got {start!r}")

    positions = np.arange(start, len(data), step)
    return data.iloc[positions]
    
# Keep every 2nd row of the population, starting from the first row,
# and record the mean GPA of that systematic sample.
sample_2 = sys_sampling(population_data, 2)
sample_2_mean = sample_2['GPA'].mean()
sample_2

Unnamed: 0,Students,Sex,class,GPA,Honors
0,Alice,F,Fr,3.8,Yes
2,Caleb,M,Fr,2.25,No
4,Faye,F,Fr,2.0,No
6,Georg,M,Fr,1.4,No
8,Betsy,F,So,4.0,Yes
10,Dylan,M,So,3.5,Yes
12,Eric,M,So,2.1,No
14,Adam,M,Jr,3.98,Yes
16,Cassie,F,Jr,3.75,Yes
18,Faith,F,Jr,2.5,Yes


In [20]:
# Mean GPA of the systematic sample.
print(sample_2_mean)

2.7557142857142862


## Cluster Sampling

In [21]:
# Partition the population on the Sex column: 'F' rows form cluster_1 and
# 'M' rows form cluster_2.
cluster_1, cluster_2 = (
    population_data[population_data['Sex'].eq(sex_code)]
    for sex_code in ('F', 'M')
)

In [24]:
# Pick one of the two clusters at random (unseeded), then report every row of
# the chosen cluster together with its mean GPA.
choice = random.choice(['cluster_1','cluster_2'])
chosen_label, chosen_cluster = (
    ("cluster 1", cluster_1) if choice == 'cluster_1' else ("cluster 2", cluster_2)
)
print(f"{chosen_label} : \n {chosen_cluster}")
print(f"Mean of {chosen_label} : \n {chosen_cluster['GPA'].mean()}")

cluster 2 : 
    Students Sex class   GPA Honors
1      Brad   M    Fr  2.60    Yes
2     Caleb   M    Fr  2.25     No
6     Georg   M    Fr  1.40     No
9     Chris   M    So  4.00    Yes
10    Dylan   M    So  3.50    Yes
11   Felipe   M    So  3.00     No
12     Eric   M    So  2.10     No
13  Gabriel   M    So  1.98     No
14     Adam   M    Jr  3.98    Yes
17    Derek   M    Jr  3.10    Yes
19   Eliott   M    Jr  1.90     No
20    Garth   M    Jr  1.10     No
22      Bob   M    Sr  3.80    Yes
23     Carl   M    Sr  3.10     No
25    Frank   M    Sr  2.00     No
26       Ed   M    Sr  1.50     No
Mean of cluster 2 : 
 2.581875


## Stratified Sampling

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

# One shuffled split, stratified on `class`, whose held-out ("test") side has
# 12 rows; that held-out side is used as the stratified sample.
split = StratifiedShuffleSplit(n_splits=1, test_size=12)

# n_splits=1 means the generator yields exactly one (train, test) pair of
# positional index arrays, so take it directly instead of looping.
_, sample_positions = next(split.split(population_data, population_data['class']))
stratified_random_sample = population_data.iloc[sample_positions]

stratified_random_sample

Unnamed: 0,Students,Sex,class,GPA,Honors
2,Caleb,M,Fr,2.25,No
24,Diana,F,Sr,2.9,No
1,Brad,M,Fr,2.6,Yes
18,Faith,F,Jr,2.5,Yes
9,Chris,M,So,4.0,Yes
23,Carl,M,Sr,3.1,No
19,Eliott,M,Jr,1.9,No
14,Adam,M,Jr,3.98,Yes
25,Frank,M,Sr,2.0,No
10,Dylan,M,So,3.5,Yes


In [25]:
# Mean GPA of the stratified sample drawn with StratifiedShuffleSplit.
print(stratified_random_sample['GPA'].mean())

2.8191666666666664


## Stratified Sampling: manual approach (filter each class, then sample within it)

In [14]:
# Number of students in each class year, i.e. the size of each stratum.
population_data['class'].value_counts()

Fr    7
So    7
Jr    7
Sr    7
Name: class, dtype: int64

In [15]:
# Build one stratum per class year, in the order Fr, So, Jr, Sr.
strata1, strata2, strata3, strata4 = (
    population_data[population_data['class'].eq(class_year)]
    for class_year in ('Fr', 'So', 'Jr', 'Sr')
)

In [16]:
# Equal allocation: draw 3 students at random (unseeded) from each stratum.
st1 = strata1.sample(3)
st2 = strata2.sample(3)
st3 = strata3.sample(3)
st4 = strata4.sample(3)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the four draws in the same order and keeps their original
# row labels.
sample_stratification = pd.concat([st1, st2, st3, st4])

In [17]:
# Display the combined stratified sample.
sample_stratification

Unnamed: 0,Students,Sex,class,GPA,Honors
4,Faye,F,Fr,2.0,No
6,Georg,M,Fr,1.4,No
2,Caleb,M,Fr,2.25,No
9,Chris,M,So,4.0,Yes
7,Andrea,F,So,4.0,Yes
12,Eric,M,So,2.1,No
16,Cassie,F,Jr,3.75,Yes
19,Eliott,M,Jr,1.9,No
17,Derek,M,Jr,3.1,Yes
23,Carl,M,Sr,3.1,No


In [26]:
# Mean GPA of the manually stratified sample.
print(sample_stratification['GPA'].mean())

2.658333333333333
