In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest

# Q1: z-test (one-tailed)

An e-commerce company claims their average delivery time is 5 days. You want to test if the actual delivery time is more than this claim.

1. Load the shipping dataset
2. Formulate hypotheses (one-tailed, upper-tail test):
   - $H_0: \mu = 5$ days
   - $H_a: \mu > 5$ days
3. Assume $\sigma = 2$ days (known from historical data)
4. Take a random sample of 50 deliveries
5. Implement both approaches:
   
   **P-value Approach:**
   - Calculate the Z-statistic
   - Calculate the p-value
   - Compare p-value with $\alpha = 0.05$
   - Make decision
   
   **Critical Value Approach:**
   - Determine the critical Z-value for $\alpha = 0.05$ (upper tail only)
   - Compare the test statistic with the critical value
   - Make decision
   
6. Verify both approaches give the same conclusion.

In [2]:
# Synthetic shipping dataset, read as CSV from a raw GitHub URL.
path = "https://raw.githubusercontent.com/Armagaan/noc26_cs86/refs/heads/main/data/shipping.csv"
shipping_data = pd.read_csv(path)
# Preview the first rows; the z-test below uses the `delivery_days` column.
shipping_data.head()

Unnamed: 0,order_id,delivery_days
0,1,6
1,2,5
2,3,7
3,4,8
4,5,5


In [3]:
# Test parameters for Q1.
# NOTE(review): the exercise asks for a random sample of 50 deliveries, but
# every row of the file is used here — confirm the file is that sample.
sample_size = len(shipping_data)
population_sigma = 2.0  # population standard deviation, treated as known
hypothesized_mean = 5.0  # mean delivery time claimed under H0
alpha = 0.05  # significance level

sample_mean = shipping_data.delivery_days.mean()
# Sigma is treated as known, so the standard error is built from it rather
# than from the sample standard deviation.
standard_error = population_sigma / np.sqrt(sample_size)

In [4]:
print("p-value approach")
print("-" * 50)

print(f"alpha: {alpha}")

# Standardise the sample mean under H0 (sigma known, hence a z-statistic).
z_stat = (sample_mean - hypothesized_mean) / standard_error

# Upper-tailed p-value P(Z >= z_stat). The survival function `sf` is used
# instead of `1 - cdf` because the subtraction loses precision (and can
# collapse to exactly 0.0) when the statistic lies far in the upper tail.
p_value = stats.norm.sf(z_stat)
print(f"P-value: {p_value:.4f}")

# Reject H0 only when the p-value is below the significance level.
decision_pvalue = "Reject H0" if p_value < alpha else "Fail to reject H0"
print(f"DECISION: {decision_pvalue}")

p-value approach
--------------------------------------------------
alpha: 0.05
P-value: 0.0896
DECISION: Fail to reject H0


In [5]:
print("Critical-value approach")
print("-" * 50)

# Standardised distance of the sample mean from the hypothesized mean.
z_stat = (sample_mean - hypothesized_mean) / standard_error
print(f"Z-statistic: {z_stat:.4f}")

# Upper-tailed test: all of alpha sits in the right tail, so the single
# critical value is the (1 - alpha) quantile of the standard normal.
z_critical = stats.norm.ppf(1 - alpha)
print(f"Z_critical: {z_critical:.4f}")

# H0 is rejected only when the statistic lies beyond the critical value.
if z_stat > z_critical:
    decision_critical = "Reject H0"
else:
    decision_critical = "Fail to reject H0"
print(f"DECISION: {decision_critical}")

Critical-value approach
--------------------------------------------------
Z-statistic: 1.3435
Z_critical: 1.6449
DECISION: Fail to reject H0


# Q2: z-test (two-tailed)

A restaurant manager believes that the average tip percentage has changed from the historical average of 15%.

1. Load the tips dataset
2. Calculate tip percentage (tip/total_bill × 100)
3. Assume historical population σ = 5%
4. Conduct a two-tailed test at α = 0.05:
   - H0: μ = 15%
   - Ha: μ ≠ 15%
5. Implement all three approaches:
   
   **A. P-value Approach:**
   - Calculate Z-statistic
   - Find p-value
   - Make decision
   
   **B. Critical Value Approach:**
   - Find critical values
   - Compare with test statistic
   - Make decision
   
   **C. Confidence Interval Approach:**
   - Construct 95% confidence interval
   - Check if hypothesized mean (15%) falls within interval
   - Make decision
   
6. Verify all three approaches yield identical conclusions

In [6]:
# Load the restaurant tips dataset (CSV) from a raw GitHub URL.
path = "https://raw.githubusercontent.com/Armagaan/noc26_cs86/refs/heads/main/data/tips.csv"
tips = pd.read_csv(path)
# Display the frame; `total_bill` and `tip` are the columns used below.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [7]:
# Q2 test parameters; sigma and the hypothesized mean are in percent.
population_sigma = 5.0
hypothesized_mean = 15.0
alpha = 0.05

# Tip expressed as a percentage of the bill, stored as a new column on `tips`.
tips["tip_percentage"] = tips["tip"].div(tips["total_bill"]).mul(100)

In [8]:
# Z-statistic of the mean tip percentage under H0 (sigma treated as known).
sample_mean = tips["tip_percentage"].mean()
sample_size = len(tips)
standard_error = population_sigma / np.sqrt(sample_size)
z_stat = (sample_mean - hypothesized_mean) / standard_error

# A. Two-tailed p-value: twice the tail area beyond |z|. The survival function
#    `sf` replaces `1 - cdf`, which loses precision when |z| is large.
p_value = 2 * stats.norm.sf(abs(z_stat))
decision_p = "Reject H0" if p_value < alpha else "Fail to reject H0"

# B. Critical values: alpha is split evenly between the two tails.
z_critical_lower = stats.norm.ppf(alpha / 2)
z_critical_upper = stats.norm.ppf(1 - alpha / 2)
decision_z = (
    "Reject H0"
    if (z_stat < z_critical_lower or z_stat > z_critical_upper)
    else "Fail to reject H0"
)

# C. (1 - alpha) confidence interval around the sample mean; H0 is rejected
#    when the hypothesized mean falls outside the interval.
ci_margin = z_critical_upper * standard_error
ci_lower = sample_mean - ci_margin
ci_upper = sample_mean + ci_margin
decision_ci = (
    "Reject H0"
    if (hypothesized_mean < ci_lower or hypothesized_mean > ci_upper)
    else "Fail to reject H0"
)

In [9]:
# Report the three approaches side by side; all three decisions should agree.
print('-' * 50)
print("APPROACH A: P-VALUE")
print('-' * 50)
print(f"Alpha: {alpha}")
print(f"P-value: {p_value:.4f}")
print(decision_p)

print()

print('-' * 50)
print("APPROACH B: Z-CRITICAL")
print('-' * 50)
print(f"Z-statistic: {z_stat:.4f}")
print(f"Z-critical: {{ {z_critical_lower:.4f}, {z_critical_upper:.4f} }}")
print(decision_z)

print()

# The confidence-interval approach is the third one (C); it was previously
# printed under a duplicate "APPROACH B" header.
print('-' * 50)
print("APPROACH C: 95% CONFIDENCE INTERVAL")
print('-' * 50)
print(f"Hypothesized mean: {hypothesized_mean}")
print(f"95% confidence interval: {{ {ci_lower:.4f}, {ci_upper:.4f} }}")
print(decision_ci)

--------------------------------------------------
APPROACH A: P-VALUE
--------------------------------------------------
Alpha: 0.05
P-value: 0.0007
Reject H0

--------------------------------------------------
APPROACH B: Z-CRITICAL
--------------------------------------------------
Z-statistic: 3.3748
Z-critical: { -1.9600, 1.9600 }
Reject H0

--------------------------------------------------
APPROACH B: 95% CONFIDENCE INTERVAL
--------------------------------------------------
Hypothesized mean: 15.0
95% confidence interval: { 15.4529, 16.7076 }
Reject H0


# Q3: t-test

A school claims that the average math score is 70. You want to test this claim but don't know the population standard deviation.

1. Load the student performance dataset
2. Extract math scores
3. Since σ is unknown, use the T-test
4. Set up hypotheses (two-tailed):
   - H0: μ = 70
   - Ha: μ ≠ 70
5. For different sample sizes (n = 50, 100, 200):
   
   **Part A: Implement T-test manually**
   - Calculate sample mean and sample standard deviation
   - Calculate t-statistic: t = (x̄ - μ₀) / (s / √n)
   - Determine degrees of freedom (n - 1)
   - Find critical t-values
   - Calculate p-value
   - Make decision
   
   **Part B: Use scipy.stats.ttest_1samp**
   - Perform the test using Python
   - Compare results with manual calculation

In [10]:
# Load the student performance dataset (CSV) from a raw GitHub URL.
path = "https://raw.githubusercontent.com/Armagaan/noc26_cs86/refs/heads/main/data/StudentsPerformance.csv"
students = pd.read_csv(path)
# Display the frame; only the "math score" column is used below.
students

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [11]:
# Column under test; the samples in Parts A and B are drawn from it.
math_scores = students["math score"]

alpha = 0.05  # significance level
hypothesized_mean = 70  # mean math score claimed under H0
sample_sizes = [50, 100, 200]  # sample sizes compared in Parts A and B

In [12]:
print("PART A: MANUAL T-TEST CALCULATION")
for sample_size in sample_sizes:
    print()
    print('-' * 50)
    print(f"Sample size: {sample_size}")
    print('-' * 50)

    # Fixed seed: Part B samples with the same random_state, so both parts
    # test exactly the same observations.
    sample = math_scores.sample(n=sample_size, random_state=7)

    sample_mean = sample.mean()
    sample_std = sample.std(ddof=1) # HW: What is ddof?
    # Sigma is unknown, so the standard error is estimated from the sample.
    standard_error = sample_std / np.sqrt(sample_size)
    df = len(sample) - 1

    t_statistic = (sample_mean - hypothesized_mean) / standard_error
    # Two-tailed test: alpha is split evenly between the two tails.
    t_critical_lower = stats.t.ppf(alpha/2, df)
    t_critical_upper = stats.t.ppf(1 - alpha/2, df)

    print(f"\nT-statistic = {t_statistic:.4f}")
    print(f"Critical t-values (alpha={alpha}, df={df}):")
    print(f"Lower: {t_critical_lower:.4f}")
    print(f"Upper: {t_critical_upper:.4f}")

    # Two-tailed p-value. The survival function `sf` replaces `1 - cdf`,
    # which loses precision when |t| is large.
    p_value = 2 * stats.t.sf(abs(t_statistic), df=df)
    print(f"\nP-value: {p_value:.4f}")

    decision_manual = "REJECT H0" if p_value < alpha else "FAIL TO REJECT H0"
    print(f"Decision: {decision_manual}")

PART A: MANUAL T-TEST CALCULATION

--------------------------------------------------
Sample size: 50
--------------------------------------------------

T-statistic = -0.7198
Critical t-values (alpha=0.05, df=49):
Lower: -2.0096
Upper: 2.0096

P-value: 0.4750
Decision: FAIL TO REJECT H0

--------------------------------------------------
Sample size: 100
--------------------------------------------------

T-statistic = -1.6531
Critical t-values (alpha=0.05, df=99):
Lower: -1.9842
Upper: 1.9842

P-value: 0.1015
Decision: FAIL TO REJECT H0

--------------------------------------------------
Sample size: 200
--------------------------------------------------

T-statistic = -2.2796
Critical t-values (alpha=0.05, df=199):
Lower: -1.9720
Upper: 1.9720

P-value: 0.0237
Decision: REJECT H0


In [13]:
print("PART B: LIBRARY T-TEST CALCULATION")
for sample_size in sample_sizes:
    print()
    print('-' * 50)
    print(f"Sample size: {sample_size}")
    print('-' * 50)

    # Same random_state as Part A, so the identical sample is tested here.
    sample = math_scores.sample(n=sample_size, random_state=7)

    # One-sample t-test against the hypothesized mean (scipy's default
    # alternative is two-sided).
    t_stat_scipy, p_value_scipy = stats.ttest_1samp(sample, popmean=hypothesized_mean)
    print(f"T-statistic (scipy): {t_stat_scipy:.4f}")
    print(f"P-value (scipy): {p_value_scipy:.4f}")

    if p_value_scipy < alpha:
        decision_scipy = "REJECT H0"
    else:
        decision_scipy = "FAIL TO REJECT H0"
    print(f"Decision: {decision_scipy}")

PART B: LIBRARY T-TEST CALCULATION

--------------------------------------------------
Sample size: 50
--------------------------------------------------
T-statistic (scipy): -0.7198
P-value (scipy): 0.4750
Decision: FAIL TO REJECT H0

--------------------------------------------------
Sample size: 100
--------------------------------------------------
T-statistic (scipy): -1.6531
P-value (scipy): 0.1015
Decision: FAIL TO REJECT H0

--------------------------------------------------
Sample size: 200
--------------------------------------------------
T-statistic (scipy): -2.2796
P-value (scipy): 0.0237
Decision: REJECT H0


# Q4: Proportion

A telecom company claims that their customer churn rate is 20%, i.e., 20% of its customers stopped using its service. You want to verify this claim.

- Load the customer churn dataset
- Calculate the sample proportion of churned customers
- Set up hypotheses:
    - H0: p = 0.20
    - Ha: p > 0.20
- Check assumptions:
    - Verify np₀ ≥ 5 and n(1-p₀) ≥ 5
- Calculate the test statistic:
    - Z = (p̂ - p₀) / √[p₀(1-p₀)/n]
    - Calculate p-value for one-tailed (upper-tail) test
    - Make decision at α = 0.05
- Using statsmodels:
    - Implement using proportions_ztest from statsmodels
    - Compare with manual calculations


In [14]:
# Customer churn dataset.
# No file is read: the data are simulated, one Bernoulli draw per customer
# with success probability `actual_churn_rate`. The seed fixes the draws.
np.random.seed(42)

n_customers = 500
actual_churn_rate = 0.25  # true rate, deliberately not the claimed 20%

churn_data = pd.DataFrame(
    {
        'customer_id': range(1, n_customers + 1),
        'churned': np.random.binomial(n=1, p=actual_churn_rate, size=n_customers),
    }
)

# Short summary of the simulated data.
print("=" * 70)
print("DATASET INFORMATION")
print("=" * 70)
print(f"Total customers: {len(churn_data)}")
print(f"Churned customers: {churn_data['churned'].sum()}")
print(f"Sample churn rate: {churn_data['churned'].mean():.4f}")

DATASET INFORMATION
Total customers: 500
Churned customers: 131
Sample churn rate: 0.2620


In [15]:
# Q4 test parameters.
# alpha is set here (value stated in the exercise) so that this question does
# not depend on whatever an earlier question left in the namespace.
alpha = 0.05
p_0 = 0.20  # churn rate claimed under H0
p_hat = churn_data.churned.mean()  # observed sample proportion
n = len(churn_data)

# The z-test for a proportion relies on the normal approximation, which the
# exercise checks with n*p_0 >= 5 and n*(1 - p_0) >= 5. Report both outcomes
# so a failed check is never silent.
if n * p_0 >= 5 and n * (1 - p_0) >= 5:
    print("Normal approximation holds")
else:
    print("Normal approximation does NOT hold; the z-test result is unreliable")

Normal approximation holds


In [16]:
# Under H0 the proportion is p_0, so the standard error is built from p_0
# (not from the sample proportion p_hat).
standard_error = np.sqrt((p_0 * (1 - p_0) / n))
z_statistic = (p_hat - p_0) / standard_error

# Upper-tailed p-value for Ha: p > p_0. The survival function `sf` replaces
# `1 - cdf`, which loses precision far in the upper tail.
p_value = stats.norm.sf(z_statistic)

decision_pvalue = "REJECT H0" if p_value < alpha else "FAIL TO REJECT H0"

print(f"Z-statistic: {z_statistic:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Decision: {decision_pvalue}")

Z-statistic: 3.4659
P-value: 0.0003
Decision: REJECT H0


In [17]:
# `proportions_ztest` is imported in the notebook's first cell.
# It needs: count of successes, nobs, value
count_churned = churn_data['churned'].sum()
z_stat_sm, p_value_sm = proportions_ztest(
    count=count_churned,
    nobs=n,
    value=p_0,
    alternative='larger',  # upper-tailed test, Ha: p > p_0
    # Important: by default statsmodels builds the standard error from the
    # sample proportion. Passing p_0 makes it use the null proportion, which
    # is what the manual calculation does, so the two z-statistics agree.
    prop_var=p_0,
)

print(f"Z-statistic (statsmodels): {z_stat_sm:.4f}")
print(f"P-value (statsmodels): {p_value_sm:.4f}")

decision_sm = "REJECT H0" if p_value_sm < alpha else "FAIL TO REJECT H0"
print(f"Decision: {decision_sm}")

Z-statistic (statsmodels): 3.4659
P-value (statsmodels): 0.0003
Decision: REJECT H0


# Q5: Errors

You are a healthcare analyst investigating whether the average medical cost for smokers is greater than USD 30,000.

1. Load the insurance dataset and filter for smokers only
2. Assume population $\sigma = 10,000$
3. Set up hypotheses:
   - $H_0 : \mu \leq 30000$ (average cost is at most USD 30,000)
   - $H_a : \mu > 30000$ (average cost is greater than USD 30,000)
4. Make a decision based on the p-value using $\alpha = 0.05$
5. Simulation study to understand Type I and Type II errors:
   - Scenario A: Assume the true population mean is actually USD 30,000 ($H_0$ is true)
     - Draw 1000 samples and conduct tests
     - Count how many times you incorrectly reject $H_0$ (Type I error)
     - Compare with theoretical $\alpha$
   - Scenario B: Assume the true population mean is actually USD 35,000 ($H_a$ is true)
     - Draw 1000 samples and conduct tests
     - Count how many times you incorrectly fail to reject $H_0$ (Type II error)
     - Calculate the power of the test ($1 - \beta$): The probability that the test statistic falls in the rejection region when the alternative hypothesis is true. Here, $\beta$ is the type II error rate.
   - Repeat scenario B theoretically.


In [18]:
# Load Medical Cost Personal Dataset
# Downloaded from: https://www.kaggle.com/mirichoi0218/insurance
# (read here as CSV from a raw GitHub URL)
path_insurance = "https://raw.githubusercontent.com/Armagaan/noc26_cs86/refs/heads/main/data/insurance.csv"
insurance = pd.read_csv(path_insurance)

# Preview the first rows; `smoker` and `charges` are the columns used below.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
# Known parameters for the test (sigma is assumed, not estimated).
hypothesized_mean = 30_000
population_sigma = 10_000

# Charges of the rows whose `smoker` flag is 'yes'.
mask = insurance['smoker'].eq('yes')
smokers = insurance.loc[mask, 'charges']
sample_size = len(smokers)

In [20]:
# Z-statistic of the smokers' mean charge against the hypothesized mean
# (sigma treated as known).
sample_mean = smokers.mean()
standard_error = population_sigma / np.sqrt(sample_size)
z_statistic = (sample_mean - hypothesized_mean) / standard_error

# One print call, one report line per argument.
print(
    "SAMPLE STATISTICS",
    '-' * 70,
    f"Sample size: {sample_size}",
    f"Sample mean: ${sample_mean:,.2f}",
    f"Standard error: ${standard_error:,.2f}",
    f"Z-statistic: {z_statistic:.4f}",
    sep="\n",
)

SAMPLE STATISTICS
----------------------------------------------------------------------
Sample size: 274
Sample mean: $32,050.23
Standard error: $604.12
Z-statistic: 3.3937


In [21]:
# Significance level stated in the exercise; set here so Q5 does not depend on
# a value left over from an earlier question.
alpha = 0.05

# P-value for the upper-tailed test (Ha: mu > 30000). The survival function
# `sf` replaces `1 - cdf`, which loses precision far in the upper tail.
p_value = stats.norm.sf(z_statistic)

# Decision is made on the p-value alone, as the exercise asks.
decision = "Reject H0" if p_value < alpha else "Fail to reject H0"
print(decision)

Reject H0


In [22]:
# SIMULATION STUDY FOR TYPE I AND TYPE II ERRORS
n_simulations = 1000  # number of simulated samples (one test each) per scenario
sample_size_sim = 50  # observations drawn in each simulated sample

In [23]:

# SCENARIO A: H0 is TRUE (μ = 30000)
# Every rejection in this scenario is a false positive, so the observed
# rejection rate estimates the Type I error rate.
np.random.seed(7)
print("SCENARIO A: Type I Error (H0 is True)")
print("True population mean = $30,000")
print(f"Running {n_simulations} simulations...\n")

# Neither quantity changes from one simulated sample to the next.
z_crit = stats.norm.ppf(1 - alpha)
se = population_sigma / np.sqrt(sample_size_sim)

type_i_errors = 0
for sim in range(n_simulations):
    # One sample from a population whose mean equals the H0 value.
    sample = np.random.normal(
        loc=hypothesized_mean,
        scale=population_sigma,
        size=sample_size_sim,
    )

    sample_mean_sim = sample.mean()
    z_stat = (sample_mean_sim - hypothesized_mean) / se

    # A rejection counts as 1, a non-rejection as 0.
    type_i_errors += int(z_stat > z_crit)

error_rate = type_i_errors / n_simulations

print(f"{'Alpha':<10} {'Type I Errors':<20} {'Type I Error Rate':<20} {'Expected Rate':<20}")
print('-' * 70)
print(f"{alpha:<10.2f} {type_i_errors:<20} {error_rate:<20.4f} {alpha:<20.2f}")

SCENARIO A: Type I Error (H0 is True)
True population mean = $30,000
Running 1000 simulations...

Alpha      Type I Errors        Type I Error Rate    Expected Rate       
----------------------------------------------------------------------
0.05       51                   0.0510               0.05                


In [24]:
# SCENARIO B: Ha is TRUE (μ = 35000)
# Every non-rejection in this scenario is a Type II error; power is the
# complementary rejection rate.
true_mean_under_ha = 35000

print("SCENARIO B: Type II Error and Power (Ha is True)")
np.random.seed(7)
# Printed from the variable so the message cannot drift from the simulation.
print(f"True population mean = ${true_mean_under_ha:,}")
print(f"Running {n_simulations} simulations...\n")

# Neither quantity changes from one simulated sample to the next.
z_crit = stats.norm.ppf(1 - alpha)
se = population_sigma / np.sqrt(sample_size_sim)

type_ii_errors = 0
for sim in range(n_simulations):
    # One sample from a population whose mean is the alternative value.
    sample = np.random.normal(
        loc=true_mean_under_ha,
        scale=population_sigma,
        size=sample_size_sim,
    )

    # The statistic is still standardised against the H0 mean: that is the
    # test being carried out, whatever the true mean is.
    sample_mean_sim = sample.mean()
    z_stat = (sample_mean_sim - hypothesized_mean) / se

    # Failing to reject H0 here is a Type II error.
    if z_stat <= z_crit:
        type_ii_errors += 1

beta = type_ii_errors / n_simulations
power_val = 1 - beta

print(f"{'Alpha':<10} {'Type II Errors':<20} {'β (Type II Rate)':<20} {'Power (1-β)':<20}")
print('-' * 70)
print(f"{alpha:<10.2f} {type_ii_errors:<20} {beta:<20.4f} {power_val:<20.4f}")


SCENARIO B: Type II Error and Power (Ha is True)
True population mean = $35,000
Running 1000 simulations...

Alpha      Type II Errors       β (Type II Rate)     Power (1-β)         
----------------------------------------------------------------------
0.05       20                   0.0200               0.9800              


In [25]:
def cal_t2_errors(mu_null, mu_alt, population_sigma, sample_size, alpha):
    """Return the theoretical Type II error rate of an upper-tailed z-test.

    The test rejects H0 when the sample mean exceeds
    ``mu_null + z_(1 - alpha) * sigma / sqrt(n)``. Beta is the probability
    that the sample mean stays at or below that threshold when the true mean
    is ``mu_alt``.

    Parameters
    ----------
    mu_null : float
        Mean under the null hypothesis.
    mu_alt : float
        True mean under the alternative hypothesis.
    population_sigma : float
        Known population standard deviation; must be positive.
    sample_size : int
        Number of observations per sample; must be positive.
    alpha : float
        Significance level; must lie strictly between 0 and 1.

    Returns
    -------
    float
        The Type II error rate (beta). Power is ``1 - beta``.

    Raises
    ------
    ValueError
        If ``sample_size`` or ``population_sigma`` is not positive, or if
        ``alpha`` is outside (0, 1). Without these checks the function would
        silently return NaN.
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be positive")
    if population_sigma <= 0:
        raise ValueError("population_sigma must be positive")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")

    z_critical = stats.norm.ppf(1 - alpha)

    standard_error = population_sigma / np.sqrt(sample_size)

    # Rejection threshold expressed on the sample-mean scale.
    x_bar_crit = mu_null + z_critical * standard_error

    # The same threshold as a z-score under the alternative distribution.
    z_alt = (x_bar_crit - mu_alt) / standard_error

    # Type II error: probability of landing below the threshold under Ha.
    beta = stats.norm.cdf(z_alt)

    return float(beta)

In [26]:
print("SCENARIO B: Theoretical calculation - Type II Error and Power (Ha is True)")
print()

# Reuse the settings of the simulated Scenario B instead of repeating them as
# literals, so the theoretical figure always matches the simulation and the
# alpha printed below is the one actually used.
beta = cal_t2_errors(
    mu_null=hypothesized_mean,
    mu_alt=true_mean_under_ha,
    population_sigma=population_sigma,
    sample_size=sample_size_sim,
    alpha=alpha,
)
power = 1 - beta

print(f"{'Alpha':<10} {'β (Type II Rate)':<20} {'Power (1-β)':<20}")
print('-' * 50)
print(f"{alpha:<10.2f} {beta:<20.4f} {power:<20.4f}")

SCENARIO B: Theoretical calculation - Type II Error and Power (Ha is True)

Alpha      β (Type II Rate)     Power (1-β)         
--------------------------------------------------
0.05       0.0293               0.9707              
