# Biol 359A | Principal component analysis

### Spring 2025, Week 9

Objectives:
1. Understand what PCA is and why it's useful in biology
2. Learn how to perform PCA on synthetic data
3. Interpret PCA results and visualizations
4. Apply PCA to real biological datasets
5. Understand common applications in bioinformatics

In [None]:
! rm -r week9_principalComponentAnalysis/
!git clone https://github.com/BIOL359A-FoundationsOfQBio-Spr25/week9_principalComponentAnalysis.git
! cp -r week9_principalComponentAnalysis/* .
! ls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from mpl_toolkits.mplot3d import Axes3D
import warnings
from ipywidgets import interact
# Silence every warning for the rest of the session to keep the notebook
# output uncluttered. NOTE: this also hides deprecation warnings.
warnings.filterwarnings('ignore')
# Set plotting style
plt.style.use('seaborn-v0_8')
# Use the "husl" palette as seaborn's default colour cycle.
sns.set_palette("husl")

## Revisit the penguin dataset!

A team of researchers did collect a bunch of data about penguins! The [Palmer Penguin dataset](https://allisonhorst.github.io/palmerpenguins/) was collected by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) at the [Palmer Research Station](https://pallter.marine.rutgers.edu/) in Antarctica. The dataset is commonly used for teaching data analysis techniques.

In [None]:
# Raw CSV of the Palmer Penguins data hosted in the palmerpenguins GitHub repository.
url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins.csv"

# Use Pandas to read the CSV file directly from the URL
# (requires network access when the cell runs).
penguins_df = pd.read_csv(url)

In [None]:
# Load seaborn's copy of the penguins dataset.
# NOTE(review): this runs unconditionally and replaces any `penguins_df`
# assigned earlier (e.g. the one read from the URL) - confirm which source
# is intended.
penguins_df = sns.load_dataset("penguins")

# Drop rows with missing values.
# dropna() keeps the original row labels, so the index of `penguins_cleaned`
# is no longer a contiguous 0..n-1 range.
penguins_cleaned = penguins_df.dropna()
# Names of the numeric columns; these are the traits offered for plotting/PCA.
trait_names = penguins_cleaned.select_dtypes(include='number').columns

# Interactive explorer: one dropdown per axis, both listing the numeric traits
@interact(x_feature=trait_names,
          y_feature=trait_names)
def plot_scatter(x_feature, y_feature):
    """Draw the trait correlation heatmap beside a scatter plot of two traits.

    Parameters
    ----------
    x_feature : column name in ``penguins_cleaned`` plotted on the x axis.
    y_feature : column name in ``penguins_cleaned`` plotted on the y axis.

    The heatmap (left) always covers every column in ``trait_names``; only the
    scatter plot (right) depends on the selected features.
    """
    _, (heatmap_ax, scatter_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: pairwise correlations between all numeric traits.
    trait_correlations = penguins_cleaned[trait_names].corr()
    sns.heatmap(trait_correlations, annot=True, cmap='coolwarm', center=0,
                square=True, ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Matrix of Traits')

    # Right panel: the two selected traits against each other.
    scatter_ax.scatter(penguins_cleaned[x_feature], penguins_cleaned[y_feature],
                       color='blue')
    scatter_ax.set_title('Scatter plot of selected features')
    scatter_ax.set_xlabel(f'{x_feature}')
    scatter_ax.set_ylabel(f'{y_feature}')
    scatter_ax.grid(True)

    plt.show()

### Perform PCA on the penguin dataset

In [None]:
# Step 1: Prepare the data (rows = penguins, columns = numeric traits)
X = penguins_cleaned[trait_names].values
print(f"Original data shape: {X.shape}")

# Step 2: Standardize the data (important for PCA!)
# Without this, traits measured on large scales would dominate the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data standardized (mean=0, std=1 for each feature)")
# `.2f` prints two decimals; the earlier `2f` spec only set a minimum width.
print(f"Standardized mean = {np.mean(X_scaled):.2f}, std = {np.std(X_scaled):.2f}")

# Step 3: Apply PCA
# Keeping 4 components on the 4 traits reduces nothing: it only re-expresses
# the data along the principal axes so we can see how the variance is split.
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X_scaled)

# Step 4: Examine the results
explained_ratio = pca.explained_variance_ratio_
cumulative_ratio = explained_ratio.cumsum()
component_numbers = range(1, len(explained_ratio) + 1)

print("\nPCA Results:")
print(f"Number of components: {pca.n_components_}")
print(f"Explained variance ratio: {explained_ratio.round(3)}")
print(f"Cumulative explained variance: {cumulative_ratio.round(3)}")

# Visualize explained variance
plt.figure(figsize=(12, 4))

# Left: share of total variance captured by each component
plt.subplot(1, 2, 1)
plt.bar(component_numbers, explained_ratio, alpha=0.7, color='skyblue')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance by Component')
plt.xticks(component_numbers)

# Right: running total, with a reference line at 80%
plt.subplot(1, 2, 2)
plt.plot(component_numbers, cumulative_ratio, 'o-', color='red')
plt.axhline(y=0.8, color='gray', linestyle='--', alpha=0.7, label='80% threshold')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Interpret the PCA result

In [None]:
pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
df_pca = pd.DataFrame(X_pca, columns=pc_labels)
# Assign by position (.values), not by index label: `penguins_cleaned` keeps
# its original row labels after dropna(), so a label-aligned assignment onto
# df_pca's fresh 0..n-1 index would yield NaN / shifted species.
df_pca['species'] = penguins_cleaned['species'].values

# Loadings: contribution of each original trait (rows) to each PC (columns).
loadings = pd.DataFrame(pca.components_.T, columns=pc_labels, index=trait_names)

# Plot PCA results
plt.figure(figsize=(18, 6))
plt.subplot(1, 2, 1)
# Plot data points
colors = ["red", "blue", "green"]
species_colors_dict = {species: color for (species, color) in zip(penguins_cleaned['species'].unique(), colors)}

for species in df_pca['species'].unique():
    mask = df_pca['species'] == species
    plt.scatter(df_pca.loc[mask, 'PC1'], df_pca.loc[mask, 'PC2'],
                c=species_colors_dict[species], alpha=0.6, label=species)

# Plot loading vectors
# Loadings are unit-length directions; stretch them so they are visible on
# the scale of the scores.
scale_factor = 3
for trait in trait_names:
    arrow_x = loadings.loc[trait, 'PC1'] * scale_factor
    arrow_y = loadings.loc[trait, 'PC2'] * scale_factor
    plt.arrow(0, 0, arrow_x, arrow_y,
              head_width=0.1, head_length=0.1, fc='black', ec='black', alpha=0.8)
    # Strip unit suffixes and underscores for a readable label
    modified_text = trait.replace('_mm', '').replace('_g', '').replace('_', ' ')
    # Place the label slightly beyond the arrow tip
    plt.text(arrow_x * 1.15, arrow_y * 1.15,
             modified_text, ha='center', va='center', fontsize=12)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
plt.title('PCA Biplot\n(Data + Loading Vectors)')
plt.legend()
plt.grid(True, alpha=0.7)

# 3D plot for first three components
ax = plt.subplot(1, 2, 2, projection='3d')
for species in df_pca['species'].unique():
    mask = df_pca['species'] == species
    ax.scatter(df_pca.loc[mask, 'PC1'], df_pca.loc[mask, 'PC2'], df_pca.loc[mask, 'PC3'],
               c=species_colors_dict[species], alpha=0.6, label=species)
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
ax.set_zlabel(f'PC3 ({pca.explained_variance_ratio_[2]:.1%})')
ax.set_title('3D PCA Plot')
ax.legend()

plt.tight_layout()
# Applied after tight_layout so these manual margins take effect
plt.subplots_adjust(left=0.05, right=0.90, wspace=0.01)
plt.show()

## Real breast cancer data from the [Wisconsin Diagnostic Breast Cancer Database (WDBC)](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic).

In [None]:
# Project-local helper module; it is not defined in this notebook.
import clean_data

# Build the breast cancer DataFrame via the helper.
# NOTE(review): presumably the cleaned WDBC table indexed by patient ID and
# diagnosis - confirm against clean_data.generate_clean_dataframe.
cancer_dataset = clean_data.generate_clean_dataframe()
# Bare expression as the last line of the cell so the notebook displays it.
cancer_dataset

In [None]:
import ipywidgets as widgets
# Move the index level(s) into ordinary columns so they can be used by name.
cancer_dataset_reset = cancer_dataset.reset_index()
# Rename the first two columns to 'ID' and 'diagnosis'; all others keep their names.
# NOTE(review): assumes reset_index() produced exactly those two leading
# columns, in that order - TODO confirm against clean_data.
cancer_dataset_reset.columns = ['ID', 'diagnosis'] + list(cancer_dataset_reset.columns[2:])

def plot_column(column):
    """Box-plot one feature split by diagnosis, with means and a summary table.

    Parameters
    ----------
    column : name of a column in ``cancer_dataset_reset`` to plot.

    Draws a horizontal box plot per diagnosis (malignant first, then benign),
    marks each group's mean with a black diamond, and places the
    ``describe()`` statistics for both groups in a table under the axes.
    """
    # Define colors for consistent use in plots
    colors = {'B': 'skyblue', 'M': 'salmon'}

    # Fixed display order; also used to position the mean markers
    ordered_diagnoses = ['M', 'B']
    diagnoses_abbr_name = {'M': 'malignant', 'B': 'benign'}

    # Calculate summary statistics for each diagnosis
    grouped = cancer_dataset_reset.groupby('diagnosis')[column]
    stats_df = grouped.describe().reindex(ordered_diagnoses)

    # Plotting
    plt.figure(figsize=(10, 4))

    # Plot boxplot. `order` pins the boxes to `ordered_diagnoses`; without it
    # seaborn picks the order from the data, and the mean markers below
    # (placed by position in `ordered_diagnoses`) could land on the wrong box.
    sns.boxplot(y='diagnosis', x=column, hue='diagnosis', data=cancer_dataset_reset,
                orient='h', palette=colors, dodge=False, order=ordered_diagnoses)
    plt.title(f'Box plot of {column} by Diagnosis')
    plt.ylabel('Diagnosis')
    plt.xlabel(column)

    # Plot the mean as a black diamond ('D')
    means = grouped.mean().reindex(ordered_diagnoses)
    for y_pos, (diagnosis, mean) in enumerate(means.items()):
        label = f'Mean {diagnoses_abbr_name[diagnosis]}'
        plt.plot(mean, y_pos, 'D', color='black', markersize=3, label=label)

    # Display summary statistics table below the boxplot
    plt.table(cellText=stats_df.round(2).values,
              colLabels=stats_df.columns,
              rowLabels=stats_df.index,
              cellLoc='center', rowLoc='center',
              loc='bottom', bbox=[0, -0.5, 1, 0.3])
    plt.legend()
    # Leave room on the left for row labels and at the bottom for the table
    plt.subplots_adjust(left=0.2, bottom=0.2)

    plt.show()

# Every column except the identifier and the label can be plotted
_non_feature_columns = ('ID', 'diagnosis')
_feature_options = [
    name for name in cancer_dataset_reset.columns
    if name not in _non_feature_columns
]

# Dropdown menu for choosing which column to plot
dropdown_columns = widgets.Dropdown(
    description='Column:',
    options=_feature_options,
    disabled=False,
)

# Show the dropdown and redraw plot_column whenever the selection changes
_ = widgets.interact(plot_column, column=dropdown_columns)


### Perform PCA on the breast cancer dataset

In [None]:
# Prepare data
# Use the numeric columns as features, but never the 'ID' column: it is an
# identifier, and feeding it to PCA would add a meaningless "feature".
# (errors='ignore' makes this a no-op if 'ID' is not numeric.)
feature_names = (
    cancer_dataset_reset.select_dtypes(include='number')
    .columns.drop(['ID'], errors='ignore')
)
X_cancer = cancer_dataset_reset[feature_names].values
y_cancer = cancer_dataset_reset['diagnosis'].values

# Standardize
scaler = StandardScaler()
X_cancer_scaled = scaler.fit_transform(X_cancer)

# Apply PCA (no n_components given, so every component is kept)
pca_cancer = PCA()
X_cancer_pca = pca_cancer.fit_transform(X_cancer_scaled)
df_cancer_pca = pd.DataFrame(X_cancer_pca[:, :2], columns=['PC1', 'PC2'])
df_cancer_pca['diagnosis'] = y_cancer
print("PCA Results for Breast Cancer Dataset:")
print(f"Explained variance ratio: {pca_cancer.explained_variance_ratio_.round(3)}")
print(f"Cumulative explained variance: {pca_cancer.explained_variance_ratio_.cumsum().round(3)}")

# How many components needed for 95% variance?
# argmax on a boolean array returns the first True position (0-based), hence +1.
cumvar = pca_cancer.explained_variance_ratio_.cumsum()
n_components_95 = np.argmax(cumvar >= 0.95) + 1
print(f"Components needed for 95% variance: {n_components_95}")

# Loadings: rows = original features, columns = principal components
loadings_cancer = pd.DataFrame(
    pca_cancer.components_.T,
    columns=[f'PC{i+1}' for i in range(pca_cancer.n_components_)],
    index=feature_names
)

# Visualize breast cancer PCA results
plt.figure(figsize=(16, 4))

# Explained variance
plt.subplot(1, 4, 1)
plt.bar(range(1, len(pca_cancer.explained_variance_ratio_) + 1),
        pca_cancer.explained_variance_ratio_, alpha=0.7)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance - Cancer Dataset')

# Cumulative variance
plt.subplot(1, 4, 2)
plt.plot(range(1, len(cumvar) + 1), cumvar, 'o-')
plt.axhline(y=0.95, color='red', linestyle='--', alpha=0.7, label='95%')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.legend()
plt.grid(True, alpha=0.3)

# PC1 vs PC2
diag_colors_dict = {"B": "blue", "M": "red"}
plt.subplot(1, 4, 3)
# Plot points
for diagnosis in ["M", "B"]:
    mask = df_cancer_pca['diagnosis'] == diagnosis
    plt.scatter(df_cancer_pca.loc[mask, 'PC1'], df_cancer_pca.loc[mask, 'PC2'],
                c=diag_colors_dict[diagnosis], alpha=0.6, label=diagnosis)

# Plot loading vectors
# Off by default (as in the original notebook); set to True to overlay one
# arrow per feature on the PC1/PC2 scatter.
SHOW_LOADING_VECTORS = False
if SHOW_LOADING_VECTORS:
    scale = 2.5
    for i, feature in enumerate(feature_names):
        plt.arrow(0, 0, loadings_cancer.iloc[i, 0] * scale, loadings_cancer.iloc[i, 1] * scale,
                  head_width=0.1, head_length=0.1, fc='black', ec='black', alpha=0.8)
        plt.text(loadings_cancer.iloc[i, 0] * scale * 1.1,
                 loadings_cancer.iloc[i, 1] * scale * 1.1,
                 feature.replace(' (cm)', '').replace(' ', '\n'),
                 ha='center', va='center', fontsize=8)

plt.xlabel(f'PC1 ({pca_cancer.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC2 ({pca_cancer.explained_variance_ratio_[1]:.1%})')
plt.title('Cancer PCA (PC1 vs PC2)')
plt.legend()
plt.grid(True, alpha=0.3)

# PC1 vs PC3
plt.subplot(1, 4, 4)
for diagnosis in ["M", "B"]:
    # Plain boolean array: X_cancer_pca is a NumPy array, not a DataFrame
    mask = (df_cancer_pca['diagnosis'] == diagnosis).values
    plt.scatter(X_cancer_pca[mask, 0], X_cancer_pca[mask, 2],
                c=diag_colors_dict[diagnosis], alpha=0.7, label=diagnosis, s=50)

plt.xlabel(f'PC1 ({pca_cancer.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC3 ({pca_cancer.explained_variance_ratio_[2]:.1%})')
plt.title('PC1 vs PC3')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
print("\nComponent Loadings:")
print(loadings_cancer.round(3))