# 02 — Probability Distributions
**Author:** Ebenezer Adjartey

Covers: Binomial, Poisson, Geometric, Hypergeometric, Negative Binomial (discrete);
Normal, t, Chi-square, F, Exponential, Gamma, Beta, Weibull, Uniform (continuous);
PDF/PMF, CDF, quantile functions, distribution fitting, goodness-of-fit tests.

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import (binom, poisson, geom, hypergeom, nbinom,
                          norm, t, chi2, f as f_dist, expon, gamma,
                          beta as beta_dist, weibull_min, uniform)
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide setup: plot theme first, then seed NumPy's global random state
# (the two calls are independent of each other), and finally confirm the cell ran.
sns.set_theme(style='whitegrid')
np.random.seed(42)
print('Libraries loaded.')

## Part A: Discrete Distributions

### 1. Binomial Distribution B(n=10, p=0.5)

In [None]:
# Scenario: 10 fair coin flips; X = number of heads
n, p = 10, 0.5
k = np.arange(0, n+1)  # full support 0..n

# PMF and CDF side by side, rounded to 4 decimals for display
tb = pd.DataFrame({
    'k':           k,
    'PMF P(X=k)':  binom.pmf(k, n, p).round(4),
    'CDF P(X<=k)': binom.cdf(k, n, p).round(4)
})
print(tb.to_string(index=False))
print(f'\nMean = {n*p}   Variance = {n*p*(1-p)}')
print(f'P(X <= 5) = {binom.cdf(5, n, p):.4f}')

# Upper tail through the survival function: P(X >= 7) = P(X > 6) = sf(6).
# sf avoids the cancellation error of 1 - cdf and matches the expon.sf usage
# in the continuous-distribution cell.
p_ge_7 = binom.sf(6, n, p)
print(f'P(X >= 7) = {p_ge_7:.4f}')
print(f'90th percentile = {binom.ppf(0.90, n, p):.0f}')

### 2. Poisson Distribution (lambda=3)

In [None]:
# Scenario: 3 customers/minute arrive at a shop
lam = 3
k = np.arange(0, 13)

# PMF and CDF for k = 0..12, rounded to 4 decimals for display
tb = pd.DataFrame({
    'k': k,
    'P(X=k)': poisson.pmf(k, lam).round(4),
    'CDF': poisson.cdf(k, lam).round(4),
})
print(tb.to_string(index=False))
print(f'\nMean = Variance = {lam}')
print(f'P(X = 0) = {poisson.pmf(0, lam):.4f}')

# Upper tail through the survival function (P(X > 5) = sf(5)) rather than
# 1 - cdf, which loses precision by cancellation when the tail is small.
p_gt_5 = poisson.sf(5, lam)
print(f'P(X > 5) = {p_gt_5:.4f}')

### 3. Geometric Distribution (p=0.3)

In [None]:
# Scenario: repeated trials with P(success)=0.3;
# X = number of the trial on which the first success occurs (so k starts at 1)
p = 0.3
k = np.arange(1, 11)
pmf = geom.pmf(k, p)

# Display table of point probabilities for the first ten trials
geom_table = pd.DataFrame({'k': k, 'P(X=k)': pmf.round(4)})
print(geom_table.to_string(index=False))

expected_trials = 1/p
print(f'\nMean (expected trials until success) = {expected_trials:.2f}')

within_five = geom.cdf(5, p)
print(f'P(X <= 5) = {within_five:.4f}')

### 4. Hypergeometric Distribution (N=25, K=10, n=8)

In [None]:
# Scenario: Urn with 10 red + 15 blue; draw 8 without replacement
# SciPy's positional order is (k, population size, successes in population, draws),
# so the calls below pass N (25 balls), K (10 red), n (8 draws) in that order.
N, K, n = 25, 10, 8
k = np.arange(0, n+1)  # possible numbers of red balls in the draw
pmf = hypergeom.pmf(k, N, K, n)
print(pd.DataFrame({'k': k, 'P(X=k)': pmf.round(4)}).to_string(index=False))
print(f'\nP(exactly 3 red) = {hypergeom.pmf(3, N, K, n):.4f}')

# Upper tail through the survival function: P(X >= 4) = P(X > 3) = sf(3).
# This avoids the cancellation error of 1 - cdf.
p_at_least_4 = hypergeom.sf(3, N, K, n)
print(f'P(at least 4 red) = {p_at_least_4:.4f}')
print(f'Mean = {n*K/N:.3f}')

### 5. Negative Binomial (r=3, p=0.4)

In [None]:
# Scenario: each trial succeeds with probability 0.4;
# X = number of failures observed before the r-th success
r, p = 3, 0.4
k = np.arange(0, 16)
pmf = nbinom.pmf(k, r, p)

# Only the first 12 rows of the table are shown
nb_table = pd.DataFrame({'failures_k': k, 'P(X=k)': pmf.round(4)})
print(nb_table.head(12).to_string(index=False))

five_failures = nbinom.pmf(5, r, p)
print(f'\nP(5 failures before 3rd success) = {five_failures:.4f}')

mean_failures = r*(1-p)/p
print(f'Mean failures before {r} successes = {mean_failures:.2f}')

## Part B: Continuous Distributions

### 6. Normal Distribution N(μ=100, σ=15)

In [None]:
# X ~ Normal with mean 100 and standard deviation 15 (passed to SciPy as loc, scale)
mu, sigma = 100, 15
print(f'Normal N(mu={mu}, sigma={sigma})')
print(f'P(X < 115)      = {norm.cdf(115, mu, sigma):.4f}')
print(f'P(85 < X < 115) = {norm.cdf(115,mu,sigma) - norm.cdf(85,mu,sigma):.4f}')

# Upper tail through the survival function instead of 1 - cdf: for far tails
# 1 - cdf cancels to zero, while sf keeps full precision.
p_gt_130 = norm.sf(130, mu, sigma)
print(f'P(X > 130)      = {p_gt_130:.4f}')
print(f'95th percentile = {norm.ppf(0.95, mu, sigma):.2f}')
print(f'z-score for 120 = {(120-mu)/sigma:.3f}')

# Empirical rule, evaluated on the standard normal (it does not depend on mu or sigma)
print('\n68-95-99.7 rule:')
for z in [1, 2, 3]:
    p_val = norm.cdf(z) - norm.cdf(-z)
    print(f'  P(mu +/- {z}*sigma) = {p_val:.4f}')

### 7. Student's t-Distribution

In [None]:
# Two-tailed critical values of Student's t at alpha = 0.05 for growing df
print('Critical values (alpha=0.05, two-tailed):')
upper_quantile = 0.975  # 1 - alpha/2
for df in (1, 5, 10, 30, 100, 1000):
    t_crit = t.ppf(upper_quantile, df)
    print(f'  df={df:5d}: t_crit = {t_crit:.4f}')
print('\n(As df increases, t_crit approaches z_crit = 1.9600)')

### 8. Chi-Square Distribution

In [None]:
# Upper 5% critical values of the chi-square distribution for several df
print('95th percentiles (chi2 critical values at alpha=0.05):')
for df in (1, 2, 3, 5, 10, 20, 30):
    chi2_crit = chi2.ppf(0.95, df)
    print(f'  df={df:2d}: chi2_crit = {chi2_crit:.3f}')

### 9. F-Distribution

In [None]:
# Upper 5% critical values of F for each (numerator df, denominator df) pair
print('F critical values at alpha=0.05:')
numerator_dfs = (2, 3, 5)
denominator_dfs = (10, 20, 60)
for d1 in numerator_dfs:
    for d2 in denominator_dfs:
        f_crit = f_dist.ppf(0.95, d1, d2)
        print(f'  F({d1:1d},{d2:2d}): F_crit = {f_crit:.3f}')

### 10-14. Exponential, Gamma, Beta, Weibull, Uniform

In [None]:
# Exponential(rate=0.5, mean=2): SciPy takes scale = 1/rate = 2
print('Exponential(rate=0.5, mean=2):')
exp_tail_3 = expon.sf(3, scale=2)
exp_below_1 = expon.cdf(1, scale=2)
exp_median = expon.ppf(.5, scale=2)
print(f'  P(X>3)={exp_tail_3:.4f}  P(X<1)={exp_below_1:.4f}  Median={exp_median:.4f}')
# Memorylessness: the conditional tail beyond 2 equals the unconditional tail at 3
exp_cond_tail = expon.sf(5, scale=2) / expon.sf(2, scale=2)
print(f'  Memoryless: P(X>5|X>2) = {exp_cond_tail:.4f} == P(X>3) = {exp_tail_3:.4f}')

# Gamma(shape=3, rate=0.5): mean = shape/rate, variance = shape/rate^2, scale = 1/rate = 2
print('\nGamma(shape=3, rate=0.5):')
gamma_mean = 3/0.5
gamma_var = 3/0.5**2
gamma_below_5 = gamma.cdf(5, a=3, scale=2)
print(f'  Mean={gamma_mean}  Var={gamma_var}  P(X<5)={gamma_below_5:.4f}')

# Beta(alpha=2, beta=5): mean = alpha/(alpha+beta) = 2/7
print('\nBeta(alpha=2, beta=5):')
beta_mean = 2/7
beta_below_03 = beta_dist.cdf(.3, 2, 5)
beta_between = beta_dist.cdf(.5, 2, 5) - beta_dist.cdf(.2, 2, 5)
print(f'  Mean={beta_mean:.4f}  P(X<0.3)={beta_below_03:.4f}  P(.2<X<.5)={beta_between:.4f}')

# Weibull(shape=2, scale=10)
print('\nWeibull(shape=2, scale=10) — failure-time model:')
weib_fail_8 = weibull_min.cdf(8, c=2, scale=10)
weib_median = weibull_min.ppf(.5, c=2, scale=10)
print(f'  P(failure before t=8)={weib_fail_8:.4f}  Median={weib_median:.4f}')

# Uniform U(0,10): loc=0, scale=10; variance = (10-0)^2/12
print('\nUniform U(0,10):')
unif_between = uniform.cdf(7, 0, 10) - uniform.cdf(2, 0, 10)
unif_var = 100/12
print(f'  P(2<X<7)={unif_between:.4f}  Mean=5  Var={unif_var:.4f}')

## Part C: Distribution Fitting & Goodness-of-Fit Tests

In [None]:
# Generate sample from Gamma(shape=2, scale=2); random_state makes the draw reproducible
sample = gamma.rvs(a=2, scale=2, size=200, random_state=42)
# ddof=1 gives the sample (n-1) standard deviation; NumPy's default ddof=0 is the
# population formula and understates the spread of a sample.
sample_sd = sample.std(ddof=1)
print(f'Sample: n={sample.size}, mean={sample.mean():.3f}, sd={sample_sd:.3f}')

# Fit Normal by maximum likelihood (2 free parameters: mu, sigma)
n_params_norm = 2
mu_fit, sig_fit = norm.fit(sample)
ll_norm = norm.logpdf(sample, mu_fit, sig_fit).sum()
aic_norm = -2*ll_norm + 2*n_params_norm
print(f'\nNormal fit: mu={mu_fit:.3f}, sigma={sig_fit:.3f}')
print(f'  Log-likelihood={ll_norm:.2f}  AIC={aic_norm:.2f}')

# Fit Gamma with the location fixed at 0, leaving 2 free parameters (shape, scale)
n_params_gamma = 2
sh_fit, loc_fit, sc_fit = gamma.fit(sample, floc=0)
ll_gamma = gamma.logpdf(sample, sh_fit, loc_fit, sc_fit).sum()
aic_gamma = -2*ll_gamma + 2*n_params_gamma
print(f'\nGamma fit: shape={sh_fit:.3f}, scale={sc_fit:.3f}')
print(f'  Log-likelihood={ll_gamma:.2f}  AIC={aic_gamma:.2f}')

print(f'\nAIC comparison: Normal={aic_norm:.2f}  Gamma={aic_gamma:.2f}')
print(f'  Winner: {"Normal" if aic_norm < aic_gamma else "Gamma"} (lower AIC = better fit)')

# KS tests against each fitted distribution.
# NOTE: the reference parameters were estimated from this same sample, so the
# standard KS p-values are too large (the test is conservative). They are
# reported for comparison only; a Lilliefors-type or parametric-bootstrap
# correction is needed for a valid test.
ks_n = stats.kstest(sample, norm.cdf, args=(mu_fit, sig_fit))
ks_g = stats.kstest(sample, gamma.cdf, args=(sh_fit, loc_fit, sc_fit))
print(f'\nKS test vs Normal: D={ks_n.statistic:.4f}, p={ks_n.pvalue:.4f}')
print(f'KS test vs Gamma:  D={ks_g.statistic:.4f}, p={ks_g.pvalue:.4f}')
print('(Higher p-value = better fit to distribution; parameters were estimated from')
print(' this same sample, so these p-values are too large and only descriptive)')

# Shapiro-Wilk normality test on the first 50 observations only (as the label states)
sw_stat, sw_p = stats.shapiro(sample[:50])
print(f'\nShapiro-Wilk (n=50): W={sw_stat:.4f}, p={sw_p:.4f}')
print('Verdict:', 'Not normal (reject H0)' if sw_p < 0.05 else 'Cannot reject normality')

## Visualization Gallery

In [None]:
# 3x3 gallery of the distributions covered in this notebook.
# The last panel reads `sample`, `mu_fit`, `sig_fit`, `sh_fit`, `loc_fit` and
# `sc_fit`, which are produced by the distribution-fitting cell.
# Names local to this cell are suffixed so they do not overwrite notebook
# globals such as `k` and `r` used by the earlier cells.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))

# Binomial PMF
k1 = np.arange(0, 11)
ax = axes[0, 0]
ax.bar(k1, binom.pmf(k1, 10, .5), color='steelblue', edgecolor='white')
ax.set_title('Binomial B(10, 0.5)')
ax.set_xlabel('k')
ax.set_ylabel('P(X=k)')

# Poisson PMF
k2 = np.arange(0, 13)
ax = axes[0, 1]
ax.bar(k2, poisson.pmf(k2, 3), color='coral', edgecolor='white')
ax.set_title('Poisson(lambda=3)')
ax.set_xlabel('k')

# Normal family
x = np.linspace(-5, 5, 300)
ax = axes[0, 2]
for mu_, s_, lbl in [(0,1,'N(0,1)'), (0,2,'N(0,2)'), (1,1,'N(1,1)')]:
    ax.plot(x, norm.pdf(x, mu_, s_), label=lbl, lw=2)
ax.legend(fontsize=8)
ax.set_title('Normal Family')

# t vs Normal
x2 = np.linspace(-4, 4, 300)
ax = axes[1, 0]
ax.plot(x2, norm.pdf(x2), label='N(0,1)', lw=2)
for df_ in [2, 5, 30]:
    ax.plot(x2, t.pdf(x2, df_), ls='--', label=f't(df={df_})')
ax.legend(fontsize=8)
ax.set_title('t vs Normal')

# Chi-square (y-axis clipped at 0.5)
xc = np.linspace(0, 30, 300)
ax = axes[1, 1]
for df_ in [1, 3, 5, 10]:
    ax.plot(xc, chi2.pdf(xc, df_), label=f'df={df_}', lw=2)
ax.set_ylim(0, .5)
ax.legend(fontsize=8)
ax.set_title('Chi-Square')

# Exponential (SciPy takes scale = 1/rate)
xe = np.linspace(0, 10, 300)
ax = axes[1, 2]
for rate_ in [0.5, 1, 2]:
    ax.plot(xe, expon.pdf(xe, scale=1/rate_), label=f'rate={rate_}', lw=2)
ax.legend(fontsize=8)
ax.set_title('Exponential')

# Gamma
xg = np.linspace(0, 20, 300)
ax = axes[2, 0]
for sh_ in [1, 2, 5]:
    ax.plot(xg, gamma.pdf(xg, a=sh_, scale=2), label=f'shape={sh_}', lw=2)
ax.legend(fontsize=8)
ax.set_title('Gamma (scale=2)')

# Beta (grid stops short of 0 and 1)
xb = np.linspace(0.001, 0.999, 300)
ax = axes[2, 1]
for a_, b_ in [(.5,.5), (2,5), (5,2), (2,2)]:
    ax.plot(xb, beta_dist.pdf(xb, a_, b_), label=f'B({a_},{b_})', lw=2)
ax.legend(fontsize=8)
ax.set_title('Beta')

# Distribution fitting: histogram of the sample with both fitted densities
xp = np.linspace(sample.min(), sample.max(), 300)
ax = axes[2, 2]
ax.hist(sample, bins=30, density=True, color='grey', alpha=.7, label='Data')
ax.plot(xp, norm.pdf(xp, mu_fit, sig_fit), 'r-', lw=2, label='Normal fit')
ax.plot(xp, gamma.pdf(xp, sh_fit, loc_fit, sc_fit), 'b-', lw=2, label='Gamma fit')
ax.legend(fontsize=8)
ax.set_title('Distribution Fitting')

fig.suptitle('Probability Distribution Gallery', fontsize=14)
fig.tight_layout()

# Save next to the notebook, creating the output folder on first run
out_dir = '02_probability_distributions'
os.makedirs(out_dir, exist_ok=True)
fig.savefig(f'{out_dir}/dist_gallery.png', dpi=100, bbox_inches='tight')
plt.show()
print('Gallery saved.')

## Key Takeaways

- **Discrete**: PMF gives point probabilities; CDF gives cumulative probabilities
- **Continuous**: PDF is the density function; the area under the curve over an interval is the probability of that interval
- **Normal** is central to statistics via the Central Limit Theorem
- **AIC and goodness-of-fit tests (KS, Shapiro-Wilk)** help choose the best-fitting distribution for real data
