<a href="https://colab.research.google.com/github/EmmanuelGbafore/Pantech-Solution_Data-Analytics-Master-Class/blob/main/population_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Code for google colab
# sampling a data from the whole population
# import library

import pandas as pd
import numpy as np

# Load dataset from local directory
from google.colab import files
uploaded = files.upload()

# Load dataset
dataset = pd.read_csv('Dataset.csv')
print(dataset.shape)                             # Gives rows and column (1460,3)
print(dataset.describe())                        # Gives std, min, max, etc.
print(dataset.head(5))                           # Prints only the top 5 data since the file is large.




Saving Dataset.csv to Dataset.csv
(1460, 3)
                Id        LotArea      SalePrice
count  1460.000000    1460.000000    1460.000000
mean    730.500000   10516.828082  180921.195890
std     421.610009    9981.264932   79442.502883
min       1.000000    1300.000000   34900.000000
25%     365.750000    7553.500000  129975.000000
50%     730.500000    9478.500000  163000.000000
75%    1095.250000   11601.500000  214000.000000
max    1460.000000  215245.000000  755000.000000
   Id  LotArea  SalePrice
0   1     8450     208500
1   2     9600     181500
2   3    11250     223500
3   4     9550     140000
4   5    14260     250000


In [3]:
# Simple Random Sampling
# In simple random sampling the population has an equal chance of being selected. Selection is entirely random, often done through a random number generator or drawing lots.

simpleRandomSample = dataset.sample(n=10).sort_values (by = "Id")                        # n = 10 (means 10 data)
mean_simpleRandomSample = round (simpleRandomSample ['SalePrice'].mean(),3)              # mean of SalePrice to 3 decimal places
print("Mean of SimpleRandomSample: ", mean_simpleRandomSample)
print(simpleRandomSample)

Mean of SimpleRandomSample:  246670.0
        Id  LotArea  SalePrice
135    136    10400     174000
223    224    10500      97000
473    474    14977     440000
481    482    11846     374000
664    665    20896     423000
745    746     8963     299800
986    987     5310     117000
1030  1031     7082     160000
1067  1068     9760     167900
1226  1227    14598     214000


In [5]:
# Systemic sampling
# Individuals are selected at regular intervals from an ordered list. After randomly selecting a starting point, every k-th individual is chosen, where k is a fixed interval.
# Example: If you have a list of 1,000 names and need a sample of 100, you might select every 10th name after a random starting point to get a representative sample.

index = np.arange(0,len(dataset),step=5)                           # Step means we take 5 steps & pick one data
systematicSample = dataset.iloc[index]                             # Based on the index we are going to get the syntex
mean_systematicSample = round(systematicSample ['SalePrice'].mean(),3)
print("Mean of systematicSample: ", mean_simpleRandomSample)
print(systematicSample)


Mean of systematicSample:  246670.0
        Id  LotArea  SalePrice
0        1     8450     208500
5        6    14115     143000
10      11    11200     129500
15      16     6120     132000
20      21    14215     325300
...    ...      ...        ...
1435  1436     8400     174000
1440  1441    11526     191000
1445  1446     8400     129000
1450  1451     9000     136000
1455  1456     7917     175000

[292 rows x 3 columns]


In [6]:
# Cluster Sampling
# Definition: The population is divided into groups, or "clusters," which are then randomly selected. All members within chosen clusters are included in the sample.
#Example: If you’re studying schools in a district, each school might represent a cluster. After randomly selecting 5 schools (clusters), you survey every student in each selected school.

n = 5

# Divide the units of cluster of equal size
dataset['cluster_id'] = np.repeat([range(1,n+1)], len(dataset)/n)
index = []

# For this formula, cluster id must be an even number
for i in range (0,len(dataset)):
  if dataset['cluster_id'].iloc[i]%2==0:
    index.append(i)
    clusterSample = dataset.iloc[index]
    mean_clusterSample = round(clusterSample['SalePrice'].mean(),3)
    print("Mean of Cluster Sample: ", mean_clusterSample)



Mean of Cluster Sample:  131000.0
Mean of Cluster Sample:  183000.0
Mean of Cluster Sample:  177666.667
Mean of Cluster Sample:  168875.0
Mean of Cluster Sample:  165500.0
Mean of Cluster Sample:  177750.0
Mean of Cluster Sample:  177357.143
Mean of Cluster Sample:  175000.0
Mean of Cluster Sample:  173000.0
Mean of Cluster Sample:  182400.0
Mean of Cluster Sample:  184454.545
Mean of Cluster Sample:  181575.0
Mean of Cluster Sample:  190300.0
Mean of Cluster Sample:  198557.143
Mean of Cluster Sample:  200320.0
Mean of Cluster Sample:  193393.75
Mean of Cluster Sample:  186870.588
Mean of Cluster Sample:  196488.889
Mean of Cluster Sample:  194863.158
Mean of Cluster Sample:  191720.0
Mean of Cluster Sample:  188300.0
Mean of Cluster Sample:  196786.364
Mean of Cluster Sample:  195969.565
Mean of Cluster Sample:  195658.333
Mean of Cluster Sample:  198232.0
Mean of Cluster Sample:  200992.308
Mean of Cluster Sample:  203177.778
Mean of Cluster Sample:  202617.857
Mean of Cluster Sampl

In [8]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Load the dataset
dataset = pd.read_csv('Dataset.csv')

# Create a 'strata' column to divide the population into subgroups (strata)
dataset['strata'] = np.repeat([1, 2], len(dataset) // 2)

# Initialize StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=8)

# Perform stratified sampling based on the 'strata' column
for train_index, test_index in split.split(dataset, dataset['strata']):
    stratifiedRandomSample = dataset.iloc[test_index].sort_values(by='SalePrice')

# Display the stratified sample
stratifiedRandomSample


Unnamed: 0,Id,LotArea,SalePrice,strata
734,735,8978,108000,2
140,141,10500,115000,1
5,6,14115,143000,1
1262,1263,11250,161500,2
744,745,5395,180000,2
712,713,4671,189000,1
1228,1229,8769,367294,2
231,232,15138,403000,1
