<a href="https://colab.research.google.com/github/Auxilus08/DHV/blob/main/Practical_2_DHV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
# Seed NumPy's global generator so the synthetic population is the same on every run.
np.random.seed(42)

population_size = 100

# Draw the categorical columns first (Gender, then Location) so the random
# stream is consumed in a fixed, documented order.
gender_values = np.random.choice(['Male', 'Female'], size=population_size)
location_values = np.random.choice(['North', 'South', 'East', 'West'], size=population_size)

# Population frame used by the probability-sampling sections below.
data = pd.DataFrame({
    'ID': range(1, population_size + 1),
    'Gender': gender_values,
    'Location': location_values,
})

In [12]:
# Preview the first five rows of the population under a short label.
print("Original Sample: ", data.head(), sep="\n")

Original Sample: 
   ID  Gender Location
0   1    Male     East
1   2  Female    South
2   3    Male    South
3   4    Male     West
4   5    Male    South


## Simple Random Sampling

In [13]:
# Simple random sampling: pick 10 rows from the whole population with a
# fixed seed so the same rows are selected on every run.
simple_random_sample = data.sample(random_state=42, n=10)
print("\nSimple Random Sample: ", simple_random_sample, sep="\n")


Simple Random Sample: 
    ID  Gender Location
83  84  Female     East
53  54  Female     East
70  71  Female     West
45  46  Female     East
44  45  Female     West
39  40    Male    North
22  23  Female    South
80  81    Male    North
10  11    Male     West
0    1    Male     East


## Stratified Sampling (Based on 'Gender')

In [17]:
# Stratified sampling: split the population by Gender and draw 20% of each
# stratum with the same fixed seed, then stack the per-stratum draws.
#
# Each group is sampled explicitly instead of going through
# `groupby(...).apply(...)`: `apply` operating on the grouping column is
# deprecated in pandas (it emits a DeprecationWarning and is slated to exclude
# the grouping column), which would silently drop 'Gender' from the result.
# Iterating the groups keeps every column and the original row index, and
# visits the strata in the same sorted key order as before.
stratified_sample = pd.concat(
    [stratum.sample(frac=0.2, random_state=42) for _, stratum in data.groupby('Gender')]
)
print("\n stratified Sample: ")
print(stratified_sample)


 stratified Sample: 
    ID  Gender Location
1    2  Female    South
17  18  Female    South
68  69  Female     East
27  28  Female    North
36  37  Female    North
93  94  Female     East
71  72  Female    North
49  50  Female     West
86  87  Female    South
26  27  Female    South
97  98  Female     West
78  79    Male    North
52  53    Male    North
54  55    Male    North
74  75    Male     West
67  68    Male     East
84  85    Male    North
6    7    Male    South
19  20    Male    North
11  12    Male     East


  stratified_sample = data.groupby('Gender', group_keys=False).apply(lambda x: x.sample(frac=0.2, random_state=42))


## Cluster Sampling (Using 'Location' as the cluster)

In [18]:
# Cluster sampling: every distinct Location is one cluster.
clusters = data['Location'].unique()

# Pick two different clusters at random (draws from NumPy's global generator).
selected_cluster = np.random.choice(clusters, replace=False, size=2)

# Keep every record that belongs to one of the chosen clusters.
in_selected_cluster = data['Location'].isin(selected_cluster)
cluster_sample = data.loc[in_selected_cluster]

print("\nCluster Sample: ", cluster_sample, sep="\n")


Cluster Sample: 
     ID  Gender Location
0     1    Male     East
1     2  Female    South
2     3    Male    South
4     5    Male    South
5     6  Female    South
6     7    Male    South
8     9    Male    South
9    10  Female     East
11   12    Male     East
13   14    Male    South
14   15  Female     East
17   18  Female    South
22   23  Female    South
23   24  Female     East
26   27  Female    South
34   35  Female     East
38   39    Male     East
45   46  Female     East
46   47  Female     East
47   48    Male     East
50   51    Male     East
51   52  Female     East
53   54  Female     East
55   56  Female    South
56   57  Female     East
57   58    Male    South
60   61    Male     East
64   65    Male    South
67   68    Male     East
68   69  Female     East
69   70  Female    South
72   73    Male     East
75   76  Female    South
76   77  Female     East
77   78  Female     East
79   80  Female     East
81   82  Female     East
82   83    Male    South
83   84

## Systematic Sampling (every 10th record)

In [20]:
# Systematic sampling: choose a random starting row in [0, k) and then
# take every k-th row from there to the end of the frame.
k = 10
start = np.random.randint(k)
systematic_sample = data.iloc[slice(start, None, k)]
print("\nSystematic Sample: ", systematic_sample, sep="\n")


Systematic Sample: 
    ID  Gender Location
3    4    Male     West
13  14    Male    South
23  24  Female     East
33  34  Female    North
43  44  Female     West
53  54  Female     East
63  64    Male     West
73  74  Female     West
83  84  Female     East
93  94  Female     East


## Implement Non-probability Sampling

In [21]:
# Re-seed NumPy's global generator and rebuild `data` as a larger, richer
# population for the non-probability sampling sections.
np.random.seed(42)

population_size = 200

# Columns are generated in the order Age, Gender, Location, Occupation so the
# random stream is consumed deterministically.
age_values = np.random.randint(18, 60, size=population_size)
gender_values = np.random.choice(['Male', 'Female'], size=population_size)
location_values = np.random.choice(['Urban', 'Rural'], size=population_size)
occupation_values = np.random.choice(['Student', 'Teacher', 'Engineer', 'Doctor'], size=population_size)

data = pd.DataFrame({
    'ID': range(1, population_size + 1),
    'Age': age_values,
    'Gender': gender_values,
    'Location': location_values,
    'Occupation': occupation_values,
})

## Quota Sampling

In [22]:
# Quota sampling: fill a fixed quota of 10 respondents per gender by taking
# the first matching rows in table order (no randomness involved).
is_male = data['Gender'].eq('Male')
is_female = data['Gender'].eq('Female')

quota_sample_male = data.loc[is_male].head(10)
quota_sample_female = data.loc[is_female].head(10)

# Males first, then females, each keeping their original row index.
quota_sample = pd.concat([quota_sample_male, quota_sample_female])
print("\nQuota Sample: ", quota_sample, sep="\n")


Quota Sample: 
    ID  Age  Gender Location Occupation
0    1   56    Male    Rural     Doctor
3    4   25    Male    Urban   Engineer
8    9   28    Male    Urban   Engineer
9   10   28    Male    Rural   Engineer
12  13   57    Male    Urban   Engineer
13  14   41    Male    Rural    Student
15  16   39    Male    Urban     Doctor
18  19   47    Male    Rural     Doctor
19  20   55    Male    Urban    Student
22  23   50    Male    Urban   Engineer
1    2   46  Female    Rural     Doctor
2    3   32  Female    Rural    Teacher
4    5   38  Female    Rural     Doctor
5    6   56  Female    Urban    Student
6    7   36  Female    Rural    Student
7    8   40  Female    Urban    Student
10  11   41  Female    Rural    Student
11  12   53  Female    Rural     Doctor
14  15   20  Female    Urban    Student
16  17   19  Female    Urban    Teacher


## Snowball Sampling

In [24]:
# Snowball sampling (simulated): start from one randomly chosen Engineer and
# grow the sample through four rounds of "referrals".
is_engineer = data['Occupation'] == 'Engineer'

# Seed participant: a single Engineer picked with a fixed random state.
initial = data[is_engineer].sample(n=1, random_state=42)
snowball_ids = set(initial['ID'])

for _wave in range(4):
    # Each round "refers" the next two Engineers, in table order, who are not
    # already part of the sample.
    not_yet_recruited = ~data['ID'].isin(snowball_ids)
    referrals = data[is_engineer & not_yet_recruited].head(2)
    snowball_ids |= set(referrals['ID'])

# Materialise the final sample from the collected IDs (rows come back in table order).
snowball_sample = data[data['ID'].isin(snowball_ids)]
print("\nSnowball Sample: ", snowball_sample, sep="\n")


Snowball Sample: 
    ID  Age  Gender Location Occupation
3    4   25    Male    Urban   Engineer
8    9   28    Male    Urban   Engineer
9   10   28    Male    Rural   Engineer
12  13   57    Male    Urban   Engineer
17  18   41  Female    Rural   Engineer
22  23   50    Male    Urban   Engineer
26  27   44    Male    Urban   Engineer
28  29   45    Male    Urban   Engineer
63  64   53  Female    Urban   Engineer


## Judgement Sampling

In [25]:
# Judgement sampling: the researcher hand-picks a criterion - here, Doctors
# who are older than 40 - and keeps every row that satisfies it.
older_than_40 = data['Age'].gt(40)
is_doctor = data['Occupation'].eq('Doctor')

judgement_sample = data.loc[older_than_40 & is_doctor]
print("\nJudgement Sample: ", judgement_sample, sep="\n")


Judgement Sample: 
      ID  Age  Gender Location Occupation
0      1   56    Male    Rural     Doctor
1      2   46  Female    Rural     Doctor
11    12   53  Female    Rural     Doctor
18    19   47    Male    Rural     Doctor
32    33   54  Female    Urban     Doctor
42    43   43  Female    Rural     Doctor
60    61   43  Female    Rural     Doctor
65    66   48    Male    Rural     Doctor
70    71   57    Male    Rural     Doctor
75    76   43  Female    Rural     Doctor
87    88   41  Female    Urban     Doctor
95    96   59    Male    Rural     Doctor
96    97   56    Male    Rural     Doctor
103  104   51  Female    Rural     Doctor
106  107   41  Female    Urban     Doctor
107  108   54    Male    Rural     Doctor
111  112   44  Female    Urban     Doctor
115  116   54  Female    Urban     Doctor
120  121   43    Male    Rural     Doctor
123  124   44  Female    Rural     Doctor
142  143   49  Female    Rural     Doctor
166  167   44    Male    Urban     Doctor
170  171   59 

## Convenience Sampling

In [26]:
# Convenience sampling: simply take the 15 most readily available records,
# i.e. the first 15 rows of the table.
convenience_sample = data.iloc[:15]
print("\nConvenience Sample: ", convenience_sample, sep="\n")


Convenience Sample: 
    ID  Age  Gender Location Occupation
0    1   56    Male    Rural     Doctor
1    2   46  Female    Rural     Doctor
2    3   32  Female    Rural    Teacher
3    4   25    Male    Urban   Engineer
4    5   38  Female    Rural     Doctor
5    6   56  Female    Urban    Student
6    7   36  Female    Rural    Student
7    8   40  Female    Urban    Student
8    9   28    Male    Urban   Engineer
9   10   28    Male    Rural   Engineer
10  11   41  Female    Rural    Student
11  12   53  Female    Rural     Doctor
12  13   57    Male    Urban   Engineer
13  14   41    Male    Rural    Student
14  15   20  Female    Urban    Student
