## Importing libraries and reading data

In [9]:
# Importing necessary libraries 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

In [11]:
# Reading dataframe 
df = pd.read_csv('Population_Survey_Data.csv')

In [7]:
df.head()

Unnamed: 0,ID,Age,Gender,Income,Education,Region
0,1,62,Other,96138,Masters,East
1,2,65,Other,76543,PhD,East
2,3,71,Male,138887,Bachelors,East
3,4,18,Other,121643,Bachelors,South
4,5,21,Male,143192,Masters,East


## Introduction to statistics

In [44]:
# Statistics is the branch of mathematics that deals with collecting, organizing, analyzing, 
# interpreting, and presenting data to make decisions or draw conclusions.

In [46]:
# Need for Statistics

# To summarize large data meaningfully
# To make decisions under uncertainty
# To identify trends and patterns
# To test hypotheses and validate assumptions
# To predict future outcomes

In [None]:
# Types of Data

# Qualitative (Categorical) Data – Describes qualities or categories
# Nominal: No order (e.g., gender, colors)
# Ordinal: Ordered categories (e.g., ranks, satisfaction level)


# Quantitative (Numerical) Data – Measured in numbers
# Discrete: Countable (e.g., number of students)
# Continuous: Measurable, infinite values (e.g., height, weight)
# Interval: Numeric scale, equal intervals, no true zero (e.g., temperature in °C, dates)
# Ratio: Numeric scale, equal intervals, true zero (e.g., weight, age, length)

# Sampling Technique
### 1.Random
### 2.Stratified 
### 3.Cluster
### 4.Systemic 

In [None]:
# Sampling technique refers to the method used to select a subset of individuals, items, or observations from 
# a larger population, so that this subset can represent the entire population for research or analysis.

#Why use sampling techniques?
#Studying the whole population is often time-consuming, costly, or impractical.
#Sampling helps in saving time, cost, and effort while still giving reliable results
#Ensures that the sample is representative, reducing bias in conclusions.

#Main types of sampling techniques
#Probability Sampling (every member has a known chance of selection)
#Simple Random Sampling – Equal chance for each member
#Systematic Sampling – Selecting every k-th member
#Stratified Sampling – Dividing into strata and sampling from each
#Cluster Sampling – Dividing into clusters and sampling entire clusters

In [24]:
# 1.Random
# Took random sample of 100 from data
random_sample = df.sample(100, random_state= 10)
random_sample.head()

Unnamed: 0,ID,Age,Gender,Income,Education,Region
841,842,32,Other,116767,PhD,West
956,957,28,Male,32012,Bachelors,East
544,545,48,Female,128088,Masters,South
173,174,61,Female,118410,High School,West
759,760,72,Male,32933,PhD,West


In [28]:
# 2.Stratified 
# We are taking samples using two methods
print('## Method 1 using Pandas ##')
print()
statified_sample_1 = df.groupby('Gender', group_keys= False).apply(lambda x: x.sample(25)) # In pandas, the group_keys parameter in groupby() 
                #controls whether the group labels (the keys you grouped by) are included in the index of the result when you apply a function.
statified_sample_1.head()

## Method 1 using Pandas ##



  statified_sample_1 = df.groupby('Gender', group_keys= False).apply(lambda x: x.sample(25)) # In pandas, the group_keys parameter in groupby()


Unnamed: 0,ID,Age,Gender,Income,Education,Region
12,13,54,Female,20833,Masters,East
829,830,43,Female,42535,High School,North
267,268,62,Female,23901,Masters,South
849,850,21,Female,81531,High School,West
890,891,64,Female,54431,Bachelors,North


In [32]:
# 2.Stratified 
# We are taking samples using two methods
print('##  Method 2 using train_test_split ##')
print()
df_train, df_test = train_test_split(df, test_size= 100, stratify=df['Gender'])
statified_sample_2 = df_test.head()
statified_sample_2

##  Method 2 using train_test_split ##



Unnamed: 0,ID,Age,Gender,Income,Education,Region
15,16,42,Other,22073,Masters,West
306,307,47,Male,40747,PhD,West
738,739,48,Other,39319,PhD,East
870,871,76,Female,23962,Bachelors,South
343,344,34,Female,34609,Bachelors,North


In [36]:
# 3. Cluster
main_clusters = list(df['Education'].unique())
print(main_clusters)
selected_cluster = np.random.choice(main_clusters, size = 2, replace= True)
cluster_sample = df.loc[df['Education'].isin(selected_cluster), :]
cluster_sample.head()

['Masters', 'PhD', 'Bachelors', 'High School']


Unnamed: 0,ID,Age,Gender,Income,Education,Region
2,3,71,Male,138887,Bachelors,East
3,4,18,Other,121643,Bachelors,South
5,6,77,Male,142869,High School,West
7,8,57,Female,140974,High School,North
11,12,68,Other,102293,Bachelors,West


In [40]:
# 4. Systemic 
Systematic_sample = df.iloc[::10,:]
Systematic_sample.head()

Unnamed: 0,ID,Age,Gender,Income,Education,Region
0,1,62,Other,96138,Masters,East
10,11,39,Male,25241,Masters,West
20,21,56,Other,121272,PhD,South
30,31,27,Female,58115,High School,East
40,41,53,Female,147482,Bachelors,North


In [42]:
print('Average of income from orignal dataset')
print(df['Income'].mean())
print()
print('Average income of from Randon sample')
print(random_sample['Income'].mean())
print()
print('Average income of from Statified sample method1')
print(statified_sample_1['Income'].mean())
print()
print('Average income of from Statified sample method2')
print(statified_sample_2['Income'].mean())
print()
print('Average income of from Clustered sample')
print(cluster_sample['Income'].mean())
print()
print('Average income of from Systematic sample')
print(Systematic_sample['Income'].mean())

Average of income from orignal dataset
85147.959

Average income of from Randon sample
89539.67

Average income of from Statified sample method1
80816.16

Average income of from Statified sample method2
32142.0

Average income of from Clustered sample
84277.49052631579

Average income of from Systematic sample
88961.4
