In [3]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, ttest_rel, ttest_ind_from_stats

In [4]:
# Read the Titanic CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.shape

(891, 15)

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [14]:
# Survival flags of every passenger whose "who" label is "man", as a NumPy array.
df.loc[df["who"] == "man", "survived"].to_numpy()

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,

In [7]:
# Per "who" category: number of rows (n) and mean of the survived column.
df.groupby(["who"], dropna=False, observed=False).agg(
    n=("who", "size"),
    survived_rate=("survived", "mean"),
)

Unnamed: 0_level_0,n,survived_rate
who,Unnamed: 1_level_1,Unnamed: 2_level_1
child,83,0.590361
man,537,0.163873
woman,271,0.756458


In [10]:
# Draw two reproducible samples of 100 binary (0/1) outcomes each.
np.random.seed(42)  # fix the legacy global RNG so the draws repeat on every run
group1 = np.random.binomial(n=1, p=0.6, size=100)  # success probability 0.6
group2 = np.random.binomial(n=1, p=0.5, size=100)  # success probability 0.5

group1

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1], dtype=int32)

In [15]:
# Observed success rate of each simulated group (mean of its 0/1 outcomes).
mean1, mean2 = group1.mean(), group2.mean()
print(f"Group 1 mean: {mean1:.2f}, Group 2 mean: {mean2:.2f}")

Group 1 mean: 0.63, Group 2 mean: 0.51


### t test

Interpretation

This is a test for the null hypothesis that two independent samples have identical average (expected) values.

If the p-value is greater than the common significance level of 0.05,  
then there is insufficient evidence to reject the null hypothesis of equal means.  
This means there is not enough evidence to conclude that the mean signal differs significantly between the two groups.

### example 1

In [None]:
# 2. Welch's t-test: two-sided comparison of the two simulated groups' means
#    without assuming that the groups share a common variance.
t_stat_welch, p_val_welch = ttest_ind(
    group1,
    group2,
    equal_var=False,
    alternative="two-sided",
)
print(f"t={t_stat_welch:.3f}, p={p_val_welch:.3f}")

t=1.718, p=0.087


In [28]:
# 4. Welch's t-test rebuilt from summary statistics (for demonstration): only
#    each group's mean, sample standard deviation and size are passed in.
#
# The means are recomputed here instead of being reused from the earlier
# "mean signal" cell: the Titanic example further down rebinds mean1/mean2, so a
# re-run of this cell after that one would otherwise silently combine Titanic
# means with the simulated groups' std and size.
mean1 = np.mean(group1)
mean2 = np.mean(group2)
std1 = np.std(group1, ddof=1)  # ddof=1 -> sample (n - 1) standard deviation
std2 = np.std(group2, ddof=1)
t_stat_stats, p_val_stats = ttest_ind_from_stats(
    mean1,
    std1,
    len(group1),
    mean2,
    std2,
    len(group2),
    equal_var=False,
    alternative="two-sided",
)
print(f"t={t_stat_stats:.3f}, p={p_val_stats:.3f}")

t=1.718, p=0.087


In [26]:
# 1. Independent two-sample t-test, two-sided, run with equal_var=True
#    (i.e. assuming both groups have the same variance).
t_stat, p_val = ttest_ind(
    group1,
    group2,
    equal_var=True,
    alternative="two-sided",
)
print(f"t={t_stat:.3f}, p={p_val:.3f}")

t=1.718, p=0.087


In [21]:
# 3. Paired t-test, shown for demonstration only: the result is meaningful
#    only when the two samples are genuinely paired observations.
t_stat_paired, p_val_paired = ttest_rel(group1, group2, alternative="two-sided")
print(f"t={t_stat_paired:.3f}, p={p_val_paired:.3f}")

t=1.830, p=0.070


### example 2

In [34]:
# 2. Welch's t-test (no equal-variance assumption) comparing the survived
#    column of "man" rows against that of "child" rows.
arr1 = df.loc[df["who"] == "man", "survived"].to_numpy()
arr2 = df.loc[df["who"] == "child", "survived"].to_numpy()

t_stat_welch, p_val_welch = ttest_ind(
    arr1,
    arr2,
    equal_var=False,
    alternative="two-sided",
)
print(f"t={t_stat_welch:.3f}, p={p_val_welch:.3f}")

t=-7.534, p=0.000


In [None]:
# 4. The same "man" vs "child" Welch test, computed from summary statistics
#    (mean, sample standard deviation, group size) for demonstration.
arr1 = df.loc[df["who"] == "man", "survived"].to_numpy()
arr2 = df.loc[df["who"] == "child", "survived"].to_numpy()

mean1, mean2 = arr1.mean(), arr2.mean()
std1, std2 = arr1.std(ddof=1), arr2.std(ddof=1)  # ddof=1 -> sample std
t_stat_stats, p_val_stats = ttest_ind_from_stats(
    mean1, std1, len(arr1),
    mean2, std2, len(arr2),
    equal_var=False,
    alternative="two-sided",
)
print(f"t={t_stat_stats:.3f}, p={p_val_stats:.3f}")

t=-7.534, p=0.000
