# 09 — Multivariate Analysis
**Author:** Ebenezer Adjartey

Covers: PCA, Exploratory Factor Analysis (EFA), k-means and hierarchical clustering, Linear Discriminant Analysis (LDA), MANOVA, Canonical Correlation Analysis.

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import silhouette_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
# Silence every Python warning for the rest of the session (blanket filter).
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so the np.random draws in this notebook are reproducible.
np.random.seed(42)
# Apply seaborn's 'whitegrid' theme to the figures drawn afterwards.
sns.set_theme(style='whitegrid')
print('Libraries loaded.')

## 1. Synthetic Dataset (8 Variables)

In [None]:
# Number of simulated observations.
n = 200

# Three independent standard-normal latent abilities drive the observed scores.
f1 = np.random.normal(0, 1, n)  # verbal ability
f2 = np.random.normal(0, 1, n)  # quantitative ability
f3 = np.random.normal(0, 1, n)  # memory

# Each observed variable is a weighted mix of the latent abilities plus its own
# noise term (sd 0.3); the last two variables are pure noise (sd 1).
# The dict literal is evaluated top to bottom, so the random draws happen in
# column order.
features = {
    'reading':    0.8*f1 + 0.1*f2 + np.random.normal(0, .3, n),
    'vocabulary': 0.7*f1 + 0.2*f2 + np.random.normal(0, .3, n),
    'math':       0.1*f1 + 0.8*f2 + np.random.normal(0, .3, n),
    'statistics': 0.2*f1 + 0.9*f2 + np.random.normal(0, .3, n),
    'memory1':    0.1*f1 + 0.1*f2 + 0.8*f3 + np.random.normal(0, .3, n),
    'memory2':    0.2*f1 + 0.1*f2 + 0.7*f3 + np.random.normal(0, .3, n),
    'noise1':     np.random.normal(0, 1, n),
    'noise2':     np.random.normal(0, 1, n),
}
cols = list(features)
X = np.column_stack(list(features.values()))
df = pd.DataFrame(X, columns=cols)

# Group labels for LDA/MANOVA: cut the combined verbal + quantitative ability
# into three bins coded 0, 1, 2.
ability = f1 + f2
group = pd.cut(ability, bins=3, labels=[0, 1, 2]).astype(int)
df['group'] = group

print(df.head())
print('\nCorrelation matrix:')
print(df.loc[:, cols].corr().round(2))

## 2. Principal Component Analysis (PCA)

In [None]:
# Put every variable on a common scale before extracting components.
scaler = StandardScaler()
X_sc = scaler.fit_transform(df[cols])

# Fit a full PCA (one component per input variable) and summarise the variance.
pca = PCA().fit(X_sc)
explained = pca.explained_variance_ratio_
cumulative = explained.cumsum()

ev_df = pd.DataFrame({
    'PC':           [f'PC{k}' for k in range(1, len(explained) + 1)],
    'Eigenvalue':   pca.explained_variance_.round(4),
    'Variance_%':   (100 * explained).round(2),
    'Cumulative_%': (100 * cumulative).round(2),
})
print('Explained Variance Table:')
print(ev_df.to_string(index=False))

# Index of the first cumulative share that reaches 80%, converted to a count.
n_components = (cumulative >= 0.80).argmax() + 1
print(f'\nComponents needed for 80% variance: {n_components}')

# Weights of each variable on the first four components, taken directly from
# pca.components_ (rows are components, so transpose to get variables as rows).
loadings = pd.DataFrame(
    pca.components_[:4],
    index=[f'PC{k}' for k in range(1, 5)],
    columns=cols,
).T.round(3)
print('\nPC Loadings (first 4 PCs):')
print(loadings)

In [None]:
# PCA visualization: scree plot, biplot (PC1 vs PC2) and cumulative variance.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Component numbers 1..k, derived from the fitted model so the plots keep
# working if the number of input variables changes.
pc_numbers = range(1, len(pca.explained_variance_) + 1)

# Scree plot
axes[0].bar(pc_numbers, pca.explained_variance_, color='steelblue')
axes[0].axhline(1, color='red', linestyle='--', label='Kaiser criterion (eigenvalue=1)')
axes[0].set_title('Scree Plot'); axes[0].set_xlabel('PC'); axes[0].set_ylabel('Eigenvalue')
axes[0].legend(fontsize=8)

# Biplot: observations in PC space, coloured by group, with one arrow per
# variable showing its weights on PC1/PC2 (scaled by 2 for visibility).
pcs = pca.transform(X_sc)
axes[1].scatter(pcs[:,0], pcs[:,1], c=df['group'], cmap='Set1', alpha=.5, s=20)
for i, col in enumerate(cols):
    axes[1].arrow(0,0, pca.components_[0,i]*2, pca.components_[1,i]*2,
                  head_width=.05, color='red', alpha=.7)
    # Place the label slightly beyond the arrow tip.
    axes[1].text(pca.components_[0,i]*2.1, pca.components_[1,i]*2.1, col, fontsize=7)
axes[1].set_title('PCA Biplot (PC1 vs PC2)')
axes[1].set_xlabel(f'PC1 ({explained[0]*100:.1f}%)')
axes[1].set_ylabel(f'PC2 ({explained[1]*100:.1f}%)')

# Cumulative variance
axes[2].plot(pc_numbers, cumulative*100, 'bo-', lw=2)
axes[2].axhline(80, color='red', linestyle='--', label='80% threshold')
axes[2].set_title('Cumulative Explained Variance')
axes[2].set_xlabel('Number of PCs'); axes[2].set_ylabel('%')
axes[2].legend()

plt.tight_layout()
# Create the output folder if needed before writing the figure.
os.makedirs('09_multivariate_analysis', exist_ok=True)
plt.savefig('09_multivariate_analysis/pca_plots.png', dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## 3. Exploratory Factor Analysis (EFA)

In [None]:
# Fit a three-factor model to the standardized variables.
n_factors = 3
fa = FactorAnalysis(n_components=n_factors, random_state=42).fit(X_sc)

# fa.components_ has one row per factor; transpose so each row is a variable.
fa_loadings = pd.DataFrame(
    fa.components_,
    index=[f'Factor{k}' for k in range(1, n_factors + 1)],
    columns=cols,
).T.round(3)
print('Factor Loadings (|loading| > 0.4 = strong):')
print(fa_loadings)

# Communality is taken as 1 minus the estimated noise (unique) variance of
# each variable, i.e. communality = 1 - uniqueness on the standardized scale.
communality = 1 - fa.noise_variance_
print('\nCommunalities (proportion of variance explained):')
for name, value in zip(cols, communality):
    print('  {:12}: {:.4f}'.format(name, value))

## 4. K-Means Clustering

In [None]:
# Scan k = 2..8, recording inertia (for the elbow) and the silhouette score.
k_range = range(2, 9)
inertias = []
sil_scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_sc)
    labels = km.labels_
    inertias.append(km.inertia_)
    sil_scores.append(silhouette_score(X_sc, labels))

# Pick the k whose silhouette score is highest.
best_k = k_range[int(np.argmax(sil_scores))]
print(f'Best k by silhouette score: {best_k}')

# Refit with the chosen k and store the assignments on the data frame.
km_final = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_sc)
df['km_cluster'] = km_final.labels_
print('\nCluster sizes:')
print(df['km_cluster'].value_counts().sort_index())

print('\nCluster centers (standardized):')
centers = pd.DataFrame(km_final.cluster_centers_, columns=cols).round(3)
print(centers)

## 5. Hierarchical Clustering

In [None]:
# Ward linkage dendrogram (on subset of 50 obs for clarity)
Z = linkage(X_sc[:50], method='ward')

fig, ax = plt.subplots(figsize=(12, 5))
# Show only the last 15 merged clusters; leaf labels give the cluster sizes.
dendrogram(Z, ax=ax, truncate_mode='lastp', p=15,
            show_leaf_counts=True, leaf_rotation=90)
ax.set_title('Hierarchical Clustering Dendrogram (Ward)')
ax.set_xlabel('Observations'); ax.set_ylabel('Distance')
plt.tight_layout()
# Make sure the output folder exists so this cell also works when run on its own.
os.makedirs('09_multivariate_analysis', exist_ok=True)
plt.savefig('09_multivariate_analysis/dendrogram.png', dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## 6. Linear Discriminant Analysis (LDA)

In [None]:
# Fit LDA on the standardized variables and project onto the discriminant axes.
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_sc, df['group'])

# Number of discriminant functions actually produced by the projection.
n_ld = X_lda.shape[1]
ld_names = [f'LD{i+1}' for i in range(n_ld)]

print('LDA Explained variance ratio:', lda.explained_variance_ratio_.round(4))
print(f"Coefficients ({', '.join(ld_names)}):")
# lda.coef_ holds one row of decision-function coefficients per class, not per
# discriminant function, so it must not be labelled LD1, LD2, ...
# lda.scalings_ holds the weights used to project onto the discriminant axes;
# keep only the columns that correspond to the columns of X_lda.
coef_df = pd.DataFrame(lda.scalings_[:, :n_ld], index=cols,
                       columns=ld_names).round(4)
print(coef_df)

# LDA plot
fig, ax = plt.subplots(figsize=(7, 5))
scatter = ax.scatter(X_lda[:,0], X_lda[:,1], c=df['group'], cmap='Set1', alpha=.6, s=30)
ax.set_title('LDA: First Two Discriminant Functions')
ax.set_xlabel('LD1'); ax.set_ylabel('LD2')
plt.colorbar(scatter, label='Group')
plt.tight_layout()
# Make sure the output folder exists so this cell also works when run on its own.
os.makedirs('09_multivariate_analysis', exist_ok=True)
plt.savefig('09_multivariate_analysis/lda_plot.png', dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## 7. MANOVA

In [None]:
from statsmodels.multivariate.manova import MANOVA

# Dependent variables tested jointly against the group factor.
dependent_vars = ['reading', 'vocabulary', 'math', 'statistics']
df_manova = df[dependent_vars + ['group']].copy()

# Formula: all dependent variables on the left, group as a categorical predictor.
manova_formula = ' + '.join(dependent_vars) + ' ~ C(group)'
manova = MANOVA.from_formula(manova_formula, data=df_manova)
result = manova.mv_test()
print('MANOVA Results:')
print(result)

## Key Takeaways

- **PCA**: reduces dimensionality; components are uncorrelated linear combinations
- **EFA**: identifies latent factors; loadings show variable-factor relationships
- **K-means**: minimizes within-cluster variance; use elbow/silhouette to choose k
- **Hierarchical**: no need to pre-specify k; dendrogram shows structure
- **LDA**: maximizes the ratio of between-group to within-group variance; good for classification
- **MANOVA**: multivariate extension of ANOVA for multiple dependent variables
