# Mathematics for Data Analysis — Detailed Examples & Step-by-step Implementations

This notebook contains Python code and explanations for the topics covered in the PDF:
- Descriptive statistics
- Probability & Bayes
- Distributions
- Linear Algebra / Normal Equation
- Calculus & Gradient Descent
- Hypothesis Testing (Permutation test)
- PCA (Dimensionality Reduction)


## 1. Descriptive Statistics

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Toy sample for this section, stored as floats so every statistic below is
# computed in floating point.
desc_data = np.asarray([12, 15, 12, 18, 20, 22, 12], dtype=float)

# Central tendency: mean built from the raw sum and count, median via NumPy.
n = desc_data.size
_sum = np.sum(desc_data)
mean = _sum / n
median = np.median(desc_data)

# Mode: the distinct value with the highest count. np.unique returns sorted
# values and argmax returns the first maximum, so ties go to the smallest value.
unique, counts = np.unique(desc_data, return_counts=True)
mode_val = unique[counts.argmax()]

# Spread: population variance (divides by n, not n - 1) and its square root.
variance = np.sum(np.square(desc_data - mean)) / n
std = np.sqrt(variance)

print("Dataset:", desc_data)
print("n:", n, "Sum:", _sum)
print("Mean:", mean, "Median:", median, "Mode:", mode_val)
print("Variance:", variance, "Std:", std)

# Unit-wide bins centred on each integer from the smallest to the largest
# observation, with the mean and median marked as vertical lines.
bin_edges = np.arange(desc_data.min() - 0.5, desc_data.max() + 1.5)
plt.hist(desc_data, bins=bin_edges)
plt.axvline(mean, label=f"Mean={mean:.2f}", linestyle='--')
plt.axvline(median, label=f"Median={median:.2f}", linestyle=':')
plt.title("Histogram with Mean & Median")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.legend()
plt.show()

## 2. Probability & Bayes

In [None]:
# Bayes' theorem for a diagnostic test:
#   P(D | +) = P(+ | D) P(D) / [ P(+ | D) P(D) + P(+ | not D) P(not D) ]
prev = 0.01   # prior P(D): prevalence of the disease
sens = 0.99   # sensitivity, P(+ | D)
spec = 0.95   # specificity, P(- | not D)

# Numerator: probability of a true positive.
num = sens * prev
# Denominator: total probability of a positive (true + false positives).
false_positive = (1 - spec) * (1 - prev)
den = num + false_positive
posterior = num / den

print("Posterior probability P(Disease|Positive):", posterior)

import matplotlib.pyplot as plt

# Two bars side by side: the probability of disease before the test result
# (prevalence) and after a positive result (posterior).
bar_labels = ['Prior (Prevalence)', 'Posterior P(D|+)']
bar_heights = [prev, posterior]
plt.bar(bar_labels, bar_heights)
plt.title("Prior vs Posterior Probability")
plt.ylabel("Probability")
plt.show()

## 3. Distributions

In [None]:
# Normal Distribution
# Draw the sample from a dedicated, seeded generator so the histogram is the
# same on every Restart & Run All and the global NumPy random state is untouched.
mu, sigma = 50, 5
normal_rng = np.random.default_rng(42)
samples_normal = normal_rng.normal(mu, sigma, size=500)

# Evaluation grid spanning +/- 4 standard deviations around the mean, and the
# analytic density of N(mu, sigma^2) on that grid.
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 200)
pdf = (1/(sigma*np.sqrt(2*np.pi))) * np.exp(-0.5*((x-mu)/sigma)**2)

# Density-normalised histogram of the draws with the analytic curve on top,
# so both share the same vertical scale.
plt.hist(samples_normal, density=True, bins=20)
plt.plot(x, pdf)
plt.title("Normal Distribution: Sample & PDF")
plt.show()

# Binomial Distribution PMF: P(X = k) = C(n, k) * p**k * (1 - p)**(n - k)
from math import comb

n_binom, p_binom = 10, 0.3
k_values = np.arange(n_binom + 1)


def _binom_term(k):
    """Return P(X = k) for X ~ Binomial(n_binom, p_binom)."""
    return comb(n_binom, k) * (p_binom ** k) * ((1 - p_binom) ** (n_binom - k))


# One probability per possible success count, kept as a plain list.
pmf_binom = list(map(_binom_term, k_values))

# Bar chart of P(X = k) for every possible success count k.
plt.bar(k_values, pmf_binom)
plt.title(f"Binomial PMF (n={n_binom},p={p_binom})")
plt.xlabel("k")
plt.ylabel("P(X=k)")
plt.show()

# Poisson Distribution PMF: P(X = k) = exp(-lambda) * lambda**k / k!
# `np.math` was only an accidental alias of the standard-library `math` module;
# it is deprecated since NumPy 1.25 and removed in NumPy 2.0, so the factorial
# is taken from the standard library directly.
from math import factorial

lam = 3
k_vals = np.arange(0,11)
# int(k) hands math.factorial a plain Python integer rather than a NumPy scalar.
poisson_pmf = [np.exp(-lam)*lam**k/factorial(int(k)) for k in k_vals]

# Bar chart of P(X = k) over the evaluated counts k.
plt.bar(k_vals, poisson_pmf)
plt.title(f"Poisson PMF (lambda={lam})")
plt.xlabel("k")
plt.ylabel("P(X=k)")
plt.show()

## 4. Linear Algebra — Normal Equation

In [None]:
# Linear Regression via Normal Equation
x_vals = np.array([1,2,3,4], dtype=float)
y_vals = np.array([2,3,5,4], dtype=float)

# Design matrix: a column of ones for the intercept next to the feature column.
X = np.column_stack((np.ones_like(x_vals), x_vals))

# Normal equation: (X^T X) w = X^T y. Solve the linear system directly rather
# than forming inv(X^T X); the answer is the same in exact arithmetic, but the
# solve is cheaper and less sensitive to round-off.
w = np.linalg.solve(X.T @ X, X.T @ y_vals)

print("Weights (intercept, slope):", w)

# Raw points plus the fitted line evaluated on a dense grid over the x-range.
plt.scatter(x_vals, y_vals, label='Data')
x_line = np.linspace(x_vals.min(), x_vals.max(), num=100)
intercept, slope = w
y_line = intercept + slope*x_line
plt.plot(x_line, y_line, color='orange', label='Fit')
plt.title("Linear Regression Fit (Normal Equation)")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

## 5. Calculus & Optimization — Gradient Descent

In [None]:
# Gradient Descent for Linear Regression
# Minimises the mean squared error of the line params[0] + params[1]*x on the
# x_vals / y_vals arrays, which must already be defined by an earlier cell.
alpha = 0.1     # learning rate (step size)
n_iters = 20    # number of update steps; the loop, the report and the plot all follow this
params = np.array([0.0,0.0])
m = x_vals.size
losses = []

for i in range(n_iters):
    preds = params[0] + params[1]*x_vals
    error = preds - y_vals
    loss = (error**2).mean()
    # Partial derivatives of the MSE with respect to intercept and slope.
    grad0 = 2*error.mean()
    grad1 = 2*(error*x_vals).mean()
    params -= alpha*np.array([grad0,grad1])
    # The loss was evaluated before this update, so entry k is the error at
    # the start of step k+1.
    losses.append(loss)

print(f"Final params after {n_iters} iterations:", params)

# The x-axis is derived from the recorded losses so it always matches the
# number of iterations actually run.
plt.plot(range(1, len(losses) + 1), losses, marker='o')
plt.title("Gradient Descent Loss")
plt.xlabel("Iteration"); plt.ylabel("MSE Loss")
plt.show()

## 6. Hypothesis Testing — Permutation Test

In [None]:
# Permutation Test
# Two-sided test on the difference of group means: pool both groups, reshuffle
# the pool repeatedly, and count how often a random split is at least as
# extreme as the split that was actually observed.
np.random.seed(42)  # fixed seed so the shuffles are the same on every run
group_A = np.array([85,90,88,75,95], dtype=float)
group_B = np.array([80,78,85,82,79], dtype=float)
obs_diff = np.mean(group_A) - np.mean(group_B)

combined = np.concatenate((group_A, group_B))
n_perms = 1000
split_at = group_A.size
diffs = np.empty(n_perms)  # every slot is overwritten in the loop below
for i in range(n_perms):
    shuffled = np.random.permutation(combined)
    diffs[i] = shuffled[:split_at].mean() - shuffled[split_at:].mean()

# Share of shuffles whose absolute difference matches or exceeds the observed one.
is_extreme = np.abs(diffs) >= np.abs(obs_diff)
p_value = is_extreme.mean()
print("Observed difference:", obs_diff, "p-value:", p_value)

# Null distribution of the mean difference, with the observed value marked.
plt.hist(diffs, bins=30)
plt.axvline(obs_diff, color='red', linestyle='--', label='Observed diff')
plt.title("Permutation Test Distribution")
plt.xlabel("Mean difference")
plt.ylabel("Frequency")
plt.legend()
plt.show()

## 7. PCA — Dimensionality Reduction

In [None]:
# PCA Example
pca_data = np.array([[2.5,2.4],[0.5,0.7],[2.2,2.9],[1.9,2.2],[3.1,3.0],
                     [2.3,2.7],[2,1.6],[1,1.1],[1.5,1.6],[1.1,0.9]])
# Centre each feature on zero, then form the sample covariance matrix
# (bias=False divides by n - 1).
mean_vec = pca_data.mean(axis=0)
centered = pca_data - mean_vec
cov = np.cov(centered.T, bias=False)
# A covariance matrix is symmetric, so use eigh rather than eig: it guarantees
# real eigenvalues and orthonormal eigenvectors. The sign of each eigenvector
# is arbitrary and may differ from what eig returns.
eigvals, eigvecs = np.linalg.eigh(cov)
# Reorder so the component with the largest variance comes first; the columns
# of eigvecs follow the same order.
order = np.argsort(eigvals)[::-1]
eigvals = eigvals[order]
eigvecs = eigvecs[:,order]
print("Eigenvalues:", eigvals)
print("Eigenvectors:", eigvecs)

# Scatter of the mean-centred points, with the first principal direction drawn
# from the origin at twice its unit length.
plt.scatter(centered[:, 0], centered[:, 1])
pc1 = eigvecs[:, 0]
arrow_dx, arrow_dy = pc1[0]*2, pc1[1]*2
plt.arrow(0, 0, arrow_dx, arrow_dy, head_width=0.05, color='red')
plt.title("PCA: Centered Data & 1st Principal Component")
plt.xlabel("X1 centered")
plt.ylabel("X2 centered")
plt.show()