# 01 — Descriptive Statistics
**Author:** Ebenezer Adjartey

Covers: central tendency, dispersion, skewness, kurtosis, summary tables, frequency tables, cross-tabulations (with a chi-square test), grouped summaries, correlations, and visualizations.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
# Seed NumPy's global random generator so the np.random draws below are repeatable.
np.random.seed(42)
# Apply seaborn's 'whitegrid' theme to the plots drawn later in the notebook.
sns.set_theme(style='whitegrid')
print('Libraries loaded.')

## 1. Synthetic Dataset

In [None]:
# Build a 200-row synthetic survey-style dataset.
# The columns are drawn in a fixed order (age, income, score, education,
# gender) so that a seeded global RNG always yields the same table.
n = 200

# Age: normal(35, 10), rounded to whole years and limited to the range 18-70.
age_values = np.random.normal(35, 10, n).round().astype(int).clip(18, 70)
# Income: log-normal, rounded to whole units.
income_values = np.random.lognormal(10, 0.5, n).round().astype(int)
# Score: normal(70, 15), rounded to whole points.
score_values = np.random.normal(70, 15, n).round().astype(int)
# Education: three levels with probabilities 0.2 / 0.4 / 0.4.
education_values = np.random.choice(['Primary', 'Secondary', 'Tertiary'], n, p=[.2, .4, .4])
# Gender: two levels, equally likely.
gender_values = np.random.choice(['Male', 'Female'], n)

df = pd.DataFrame({
    'age': age_values,
    'income': income_values,
    'score': score_values,
    'education': education_values,
    'gender': gender_values,
})

print(df.head(6))
print(f'Shape: {df.shape}')

## 2. Measures of Central Tendency

In [None]:
def get_mode(s):
    """Return the most frequent value of a pandas Series.

    Args:
        s: The Series to summarise.

    Returns:
        The first entry of ``s.mode()`` (pandas returns the modes in sorted
        order, so ties resolve to the smallest value), or ``nan`` when the
        Series has no non-missing values and therefore no mode.
    """
    modes = s.mode()
    # An empty or all-NaN Series yields an empty result; indexing it with [0]
    # would raise KeyError, so report "no mode" as NaN instead.
    if modes.empty:
        return float('nan')
    # Positional access: take the first mode regardless of index labels.
    return modes.iloc[0]

# Report mean, median and mode for each numeric column, one line per column.
for name in ('age', 'income', 'score'):
    series = df[name]
    mean_val = series.mean()
    median_val = series.median()
    mode_val = get_mode(series)
    print(f'{name:10} | mean={mean_val:.2f}  median={median_val:.1f}  mode={mode_val}')

## 3. Measures of Dispersion

In [None]:
# Spread of each numeric column: variance, standard deviation, range,
# interquartile range and coefficient of variation (sd as a % of the mean).
numeric_cols = ['age', 'income', 'score']

for name in numeric_cols:
    series = df[name]
    variance = series.var()
    sd = series.std()
    spread = series.max() - series.min()
    q1 = series.quantile(.25)
    q3 = series.quantile(.75)
    iqr = q3 - q1
    cv = 100 * sd / series.mean()
    print(f'{name:10} | var={variance:.2f}  sd={sd:.2f}  range={spread}  IQR={iqr:.2f}  CV={cv:.2f}%')

# Five-number summary (min, quartiles, max) for the same columns.
quantile_table = df[numeric_cols].quantile([0, .25, .5, .75, 1])
print('\nQuantiles:')
print(quantile_table.round(2))

## 4. Skewness and Kurtosis

In [None]:
# Shape of each numeric distribution. Both SciPy functions are called with
# their default arguments; scipy.stats.kurtosis defaults to Fisher's
# definition, i.e. excess kurtosis (a normal distribution scores 0).
for name in ('age', 'income', 'score'):
    values = df[name]
    skewness = stats.skew(values)
    excess_kurt = stats.kurtosis(values)
    print(f'{name:10} | skewness={skewness:.4f}  excess_kurtosis={excess_kurt:.4f}')

print('\nRule of thumb: |skewness|>1 = highly skewed; excess kurtosis>0 = heavy tails')

## 5. Complete Summary Table

In [None]:
# pandas' built-in summary (count, mean, std, min, quartiles, max) for the
# numeric columns, rounded to two decimals for display.
numeric_view = df[['age', 'income', 'score']]
summary_table = numeric_view.describe()
print(summary_table.round(2))

## 6. Frequency Table

In [None]:
# Frequency table for education: count, percentage and cumulative percentage
# per level, ordered from the most to the least frequent level.
counts = df['education'].value_counts()
percents = df['education'].value_counts(normalize=True) * 100
ft = pd.concat([counts, percents.round(1)], axis=1, keys=['Count','Pct%'])
# Accumulate the unrounded percentages and round once at the end. Summing the
# already-rounded 'Pct%' column lets rounding error build up, so the final row
# could read e.g. 99.9 or 100.1 instead of 100.0.
ft['Cum%'] = percents.cumsum().round(1)
print('Education Frequency Table:')
print(ft)

## 7. Cross-Tabulation and Chi-Square Test

In [None]:
# Contingency table: gender in rows, education level in columns.
ct = pd.crosstab(df['gender'], df['education'])
print('Observed counts:')
print(ct)

# Express each cell as a percentage of its row (gender) total.
row_totals = ct.sum(axis=1)
row_pct = ct.div(row_totals, axis=0) * 100
print('\nRow percentages:')
print(row_pct.round(1))

# Chi-square test of independence on the observed counts, judged at the
# 5% significance level.
chi2_stat, p_val, dof, expected = stats.chi2_contingency(ct)
print(f'\nChi-square = {chi2_stat:.4f},  p-value = {p_val:.4f},  df = {dof}')
if p_val < 0.05:
    verdict = 'Dependent (reject H0)'
else:
    verdict = 'Independent (fail to reject H0)'
print('Verdict:', verdict)

## 8. Grouped Summary Statistics

In [None]:
# Summary statistics of score within each education level. The dict keys
# become the output column names; the values are the aggregations applied.
score_aggregations = {
    'n': 'count',
    'mean': 'mean',
    'std': 'std',
    'median': 'median',
    'min': 'min',
    'max': 'max',
}
score_by_education = df.groupby('education')['score']
grp = score_by_education.agg(**score_aggregations).round(2)
print('Score by Education:')
print(grp)

## 9. Correlations

In [None]:
# Pairwise correlation matrices for the numeric columns: Pearson (linear)
# followed by Spearman (rank-based), each rounded to three decimals.
numeric_frame = df[['age', 'income', 'score']]

pearson_matrix = numeric_frame.corr(method='pearson')
print('Pearson:')
print(pearson_matrix.round(3))

spearman_matrix = numeric_frame.corr(method='spearman')
print('\nSpearman:')
print(spearman_matrix.round(3))

## 10. Visualizations

In [None]:
# 2x2 panel of descriptive plots: age histogram, score boxplots by education,
# score densities by gender, and a normal Q-Q plot of score. The figure is
# written to 01_descriptive_statistics/descriptive_plots.png and then shown.
fig, axes = plt.subplots(2, 2, figsize=(13, 9))

# Histogram with mean/median
age_mean = df['age'].mean()
age_median = df['age'].median()
axes[0,0].hist(df['age'], bins=20, color='steelblue', edgecolor='white', alpha=.8)
axes[0,0].axvline(age_mean,   color='red',   linestyle='--', lw=1.5, label=f"Mean={age_mean:.1f}")
axes[0,0].axvline(age_median, color='green', linestyle='--', lw=1.5, label=f"Median={age_median:.1f}")
axes[0,0].set_title('Age Distribution'); axes[0,0].set_xlabel('Age'); axes[0,0].legend()

# Boxplot by education
edu_order = ['Primary','Secondary','Tertiary']
data_grp = [df[df['education']==e]['score'].values for e in edu_order]
axes[0,1].boxplot(data_grp, patch_artist=True)
# Label the boxes through the axis rather than boxplot's `labels=` keyword,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`; setting the
# tick labels directly works on both older and newer releases.
axes[0,1].set_xticklabels(edu_order)
axes[0,1].set_title('Score by Education'); axes[0,1].set_ylabel('Score')

# KDE density by gender
# The loop variable is deliberately not called `grp`, so it does not overwrite
# the grouped summary table of that name built earlier in the notebook.
for g, gender_rows in df.groupby('gender'):
    gender_rows['score'].plot.kde(ax=axes[1,0], label=g, linewidth=2)
axes[1,0].set_title('Score Density by Gender'); axes[1,0].set_xlabel('Score'); axes[1,0].legend()

# Q-Q plot
stats.probplot(df['score'], dist='norm', plot=axes[1,1])
axes[1,1].set_title('Q-Q Plot: Score vs Normal')

plt.suptitle('Descriptive Statistics Visualizations', fontsize=14)
plt.tight_layout()
# Create the output folder if needed; relies on `os` from the notebook's
# import cell.
os.makedirs('01_descriptive_statistics', exist_ok=True)
plt.savefig('01_descriptive_statistics/descriptive_plots.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved.')

## Key Takeaways

- **Age**: roughly normal; slight right skew from clipping at 18
- **Income**: log-normally distributed (high right skew, large CV)
- **Score**: approximately normal
- Chi-square test evaluates independence between categorical variables
- Q-Q plot visually checks normality assumption
