# PCA

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>).
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 13.01.2023
- **Date of Last Modification:** 22.02.2023

- Run PCA on log-normalized counts to obtain a baseline latent representation for method benchmarking.

## 1. Setup

### 1.1 Import Libraries

In [None]:
import time
from datetime import datetime

import os
import scanpy as sc
import squidpy as sq
import matplotlib.pyplot as plt
import numpy as np

### 1.2 Define Parameters

In [None]:
# Identifier of the benchmarked method; used in artifact names and output paths
model_name = "pca"

# Prefix of the keys under which the latent representation of each run is stored
latent_key = model_name + "_latent"

# Number of principal components to construct latent nearest neighbor graph
n_pcs = 50

# Resolution used for Leiden clustering of latent space
leiden_resolution = 0.5

# Random seed used for Leiden clustering
random_seed = 0

### 1.3 Run Notebook Setup

In [None]:
# Set the default scanpy figure size to 6 x 6 for all subsequent plots
sc.set_figure_params(
    figsize=(6, 6),
)

In [None]:
# Capture the notebook execution time once so that all artifacts saved during
# this execution share the same timestamp (format: DDMMYYYY_HHMMSS)
now = datetime.now()
current_timestamp = f"{now:%d%m%Y_%H%M%S}"

### 1.4 Configure Paths and Directories

In [None]:
# Folder from which the input datasets are read and to which the result
# files are written (relative to the notebook location)
data_folder_path = "../../../datasets/srt_data/gold/"

# Root folder below which the figures of each dataset are saved
figure_folder_path = "../../../figures"

## 2. Log Normalized Gene Expression PCA

### 2.1 Define Run Function

In [None]:
def run_pca(dataset,
            cell_type_key,
            n_runs=10,
            n_neighbor_list=(4, 4, 8, 8, 12, 12, 16, 16, 20, 20)):
    """
    Run the PCA baseline on a dataset and store the results of all runs.

    For each run, the dataset is loaded from ´data_folder_path´, a PCA is
    computed with the scanpy defaults, and a latent nearest neighbor graph,
    a UMAP and a Leiden clustering are computed on the PCA representation.
    Two figures are saved per run (cell types in latent space; Leiden clusters
    in latent and physical space). The PCA representation, the PCA duration
    and the names of the latent dimensions are stored per run in a result
    adata, which is written to ´{dataset}_{model_name}.h5ad´ in
    ´data_folder_path´ after every run and once more at the end.

    Parameters
    ----------
    dataset:
        Name of the dataset; ´{dataset}.h5ad´ is read from
        ´data_folder_path´.
    cell_type_key:
        Key in ´adata.obs´ used to color the latent UMAP by cell type.
    n_runs:
        Number of runs to execute.
    n_neighbor_list:
        Number of neighbors per run. This is only used for the latent
        neighbor graph construction used for UMAP generation and clustering as
        PCA is not a spatial method. If it contains fewer entries than
        ´n_runs´, only as many runs as entries are executed (a warning is
        issued); surplus entries are ignored.
    """
    import warnings

    # Materialize once so that the length can be checked for any iterable
    n_neighbor_list = list(n_neighbor_list)
    if len(n_neighbor_list) < n_runs:
        warnings.warn(
            f"Only {len(n_neighbor_list)} neighbor counts were given for "
            f"{n_runs} requested runs; only {len(n_neighbor_list)} runs will "
            "be executed.",
            stacklevel=2)

    # Configure figure folder path
    dataset_figure_folder_path = f"{figure_folder_path}/{dataset}/method_benchmarking/" \
                                 f"{model_name}/{current_timestamp}"
    os.makedirs(dataset_figure_folder_path, exist_ok=True)

    # Build input and output file paths in one consistent way (this also avoids
    # a doubled path separator when the data folder path ends with a slash)
    input_file_path = os.path.join(data_folder_path, f"{dataset}.h5ad")
    output_file_path = os.path.join(data_folder_path, f"{dataset}_{model_name}.h5ad")

    # Key under which the latent Leiden cluster annotations are stored
    leiden_key = f"latent_log_normalized_counts_pca_leiden_{leiden_resolution!s}"

    # Create original adata to store results from training runs
    adata_original = sc.read_h5ad(input_file_path)

    for run_number, n_neighbors in zip(range(1, n_runs + 1), n_neighbor_list):
        # n_neighbors is here only used for the latent neighbor graph construction used for
        # UMAP generation and clustering as PCA is not a spatial method

        # Load a fresh copy of the data so that runs do not affect each other
        adata = sc.read_h5ad(input_file_path)

        # Only the PCA itself is timed as 'model training'
        start_time = time.time()
        sc.tl.pca(adata)
        end_time = time.time()

        # Measure time for model training
        elapsed_time = end_time - start_time
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print(f"Duration of model training in run {run_number}: "
              f"{int(hours)} hours, {int(minutes)} minutes and {int(seconds)} seconds.")
        adata_original.uns[f"{model_name}_model_training_duration_run{run_number}"] = (
            elapsed_time)

        # Compute latent neighbor graph and UMAP, and plot cell types in latent space
        sc.pp.neighbors(adata,
                        n_pcs=n_pcs,
                        n_neighbors=n_neighbors)
        sc.tl.umap(adata)
        fig = sc.pl.umap(adata,
                         color=[cell_type_key],
                         title="Latent Space with Cell Types: Log Normalized Counts PCA",
                         return_fig=True)
        fig.savefig(f"{dataset_figure_folder_path}/latent_log_normalized_counts_pca"
                    f"_cell_types_run{run_number}.png",
                    bbox_inches="tight")

        # Compute latent Leiden clustering
        sc.tl.leiden(adata=adata,
                     resolution=leiden_resolution,
                     random_state=random_seed,
                     key_added=leiden_key)

        # Create subplot of latent Leiden cluster annotations in physical and latent space
        fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(6, 12))
        title = fig.suptitle(t="Latent and Physical Space with Leiden Clusters: Log Normalized Counts PCA")
        sc.pl.umap(adata=adata,
                   color=[leiden_key],
                   title="Latent Space with Leiden Clusters",
                   ax=axs[0],
                   show=False)
        sq.pl.spatial_scatter(adata=adata,
                              color=[leiden_key],
                              title="Physical Space with Leiden Clusters",
                              shape=None,
                              ax=axs[1])

        # Create and position shared legend (replaces the two per-axis legends)
        handles, labels = axs[0].get_legend_handles_labels()
        lgd = fig.legend(handles, labels, bbox_to_anchor=(1.25, 0.9185))
        axs[0].get_legend().remove()
        axs[1].get_legend().remove()

        # Adjust, save and display plot
        plt.subplots_adjust(wspace=0, hspace=0.2)
        fig.savefig(f"{dataset_figure_folder_path}/latent_physical_comparison_"
                    f"log_normalized_counts_pca_run{run_number}.png",
                    bbox_extra_artists=(lgd, title),
                    bbox_inches="tight")
        plt.show()

        # Store X_pca as latent representation
        run_latent_key = latent_key + f"_run{run_number}"
        adata_original.obsm[run_latent_key] = adata.obsm["X_pca"]

        # Label all 'gene programs' as active gene programs for subsequent benchmarking
        n_latent_dims = adata_original.obsm[run_latent_key].shape[1]
        adata_original.uns[f"pca_active_gp_names_run{run_number}"] = (
            np.array([f"latent_{i}" for i in range(n_latent_dims)]))

        # Store intermediate adata to disk
        adata_original.write(output_file_path)

    # Store final adata to disk
    adata_original.write(output_file_path)

### 2.2 Run PCA on Benchmarking Datasets

In [None]:
# seqFISH mouse organogenesis (embryo 2): default number of runs and neighbors
run_pca(
    dataset="seqfish_mouse_organogenesis_embryo2",
    cell_type_key="celltype_mapped_refined",
)

In [None]:
# Vizgen MERFISH mouse liver: a single run with 20 neighbors
run_pca(
    dataset="vizgen_merfish_mouse_liver",
    cell_type_key="Cell_Type",
    n_runs=1,
    n_neighbor_list=[20],
)

In [None]:
# STARmap PLUS mouse CNS: default number of runs and neighbors
run_pca(
    dataset="starmap_plus_mouse_cns",
    cell_type_key="Main_molecular_cell_type",
)

In [None]:
# NanoString CosMx human NSCLC: default number of runs and neighbors
run_pca(
    dataset="nanostring_cosmx_human_nsclc",
    cell_type_key="cell_type",
)