In [None]:
!pip install scanpy

Collecting scanpy
  Downloading scanpy-1.11.1-py3-none-any.whl.metadata (9.9 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.11.4-py3-none-any.whl.metadata (9.3 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting scikit-learn<1.6.0,>=1.1 (from scanpy)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.12.0-py3-none-any.whl.metadata (2.5 kB)
Downloading scanpy-1.11.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading anndata-0.11.4-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.

In [None]:
# Block PCA-1: Setup, Mount Drive, Define Paths, and Load Preprocessed Data

import os
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA # For PCA
from sklearn.preprocessing import StandardScaler # For scaling data before PCA
import matplotlib.pyplot as plt # For any potential plotting
import seaborn as sns # For any potential plotting
from google.colab import drive

# --- Google Drive ---
# Colab-only step: (re)attach Drive so the project folder is visible under /content/drive.
drive.mount('/content/drive', force_remount=True)
print("Google Drive mounted successfully.")

# --- Project root on Drive ---
gdrive_base_path = '/content/drive/My Drive/CMML_ICA2/'

# --- Inputs: outputs of the Seurat/PCA preprocessing step ---
preprocessed_for_s_pca_dir = os.path.join(
    gdrive_base_path,
    'data_pbmc10k_mtx/processed/pbmc10k_for_SeuratPCA_preprocessed/',
)
rna_mtx_input_dir = os.path.join(
    preprocessed_for_s_pca_dir, 'rna_hvg_counts_mtx/'
)
protein_csv_input_path = os.path.join(
    preprocessed_for_s_pca_dir, 'protein_counts_raw.csv'
)
metadata_csv_input_path = os.path.join(
    preprocessed_for_s_pca_dir, 'cell_metadata_filtered.csv'
)

# --- Outputs: one root folder with separate sub-folders for figures and data ---
pca_output_dir = os.path.join(gdrive_base_path, 'pca_pbmc_model/')
pca_figure_dir = os.path.join(pca_output_dir, 'figures/')
pca_data_dir = os.path.join(pca_output_dir, 'data/')

# Create the output tree up front; exist_ok=True keeps re-runs harmless.
os.makedirs(pca_output_dir, exist_ok=True)
os.makedirs(pca_figure_dir, exist_ok=True)
os.makedirs(pca_data_dir, exist_ok=True)

# Echo every resolved location so a wrong path is obvious before any data is read.
print(
    f"Input RNA MTX directory: {rna_mtx_input_dir}",
    f"Input Protein CSV path: {protein_csv_input_path}",
    f"Input Metadata CSV path: {metadata_csv_input_path}",
    f"PCA output main directory: {pca_output_dir}",
    f"PCA figures will be saved to: {pca_figure_dir}",
    f"PCA data outputs will be saved to: {pca_data_dir}",
    sep="\n",
)


Mounted at /content/drive
Google Drive mounted successfully.
Input RNA MTX directory: /content/drive/My Drive/CMML_ICA2/data_pbmc10k_mtx/processed/pbmc10k_for_SeuratPCA_preprocessed/rna_hvg_counts_mtx/
Input Protein CSV path: /content/drive/My Drive/CMML_ICA2/data_pbmc10k_mtx/processed/pbmc10k_for_SeuratPCA_preprocessed/protein_counts_raw.csv
Input Metadata CSV path: /content/drive/My Drive/CMML_ICA2/data_pbmc10k_mtx/processed/pbmc10k_for_SeuratPCA_preprocessed/cell_metadata_filtered.csv
PCA output main directory: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/
PCA figures will be saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/figures/
PCA data outputs will be saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/


In [None]:
# --- Load Preprocessed Data ---
# Reads the three preprocessed inputs (RNA matrix, protein counts, cell metadata),
# restricts all of them to the barcodes present in every input, and attaches the
# metadata to the RNA AnnData, which is the main object for the PCA workflow.
print("\n--- Loading preprocessed data for PCA ---")

# 1. Load RNA data (expected to hold raw counts for the HVGs)
if not os.path.isdir(rna_mtx_input_dir):
    raise FileNotFoundError(f"RNA MTX input directory not found: {rna_mtx_input_dir}")
adata_rna_pca = sc.read_10x_mtx(rna_mtx_input_dir, var_names='gene_symbols', cache=False)
adata_rna_pca.var_names_make_unique()
print(f"Loaded RNA data for PCA: {adata_rna_pca.shape[0]} cells x {adata_rna_pca.shape[1]} HVGs")
print(adata_rna_pca)

# 2. Load Protein data (raw counts)
if not os.path.exists(protein_csv_input_path):
    raise FileNotFoundError(f"Protein CSV input file not found: {protein_csv_input_path}")
# Assumes the first CSV column holds the cell barcode — it becomes the index.
protein_df_pca = pd.read_csv(protein_csv_input_path, index_col=0)
print(f"\nLoaded Protein data for PCA: {protein_df_pca.shape[0]} cells x {protein_df_pca.shape[1]} proteins")
print("Protein data head:")
print(protein_df_pca.head())

# 3. Load Cell Metadata (expected to include ground_truth_cell_type)
if not os.path.exists(metadata_csv_input_path):
    raise FileNotFoundError(f"Metadata CSV input file not found: {metadata_csv_input_path}")
# Assumes the first CSV column holds the cell barcode — it becomes the index.
cell_metadata_pca = pd.read_csv(metadata_csv_input_path, index_col=0)
print(f"\nLoaded Cell Metadata for PCA: {cell_metadata_pca.shape[0]} cells x {cell_metadata_pca.shape[1]} annotations")
print("Cell metadata head:")
print(cell_metadata_pca.head())
if "ground_truth_cell_type" in cell_metadata_pca.columns:
    print("\nGround truth cell type distribution in loaded metadata:")
    print(cell_metadata_pca["ground_truth_cell_type"].value_counts(dropna=False))
else:
    print("WARNING: 'ground_truth_cell_type' not found in loaded metadata!")


# --- Align cells across RNA, Protein, and Metadata ---
# Every object is indexed with the same barcode list below, so all three end up
# with the same cells in the same order.
common_cells = adata_rna_pca.obs_names.intersection(protein_df_pca.index).intersection(cell_metadata_pca.index)
print(f"\nNumber of common cells across RNA, Protein, and Metadata: {len(common_cells)}")

# Fail here with a clear message instead of carrying empty objects into
# normalization and PCA, where the resulting error would be much harder to trace.
if len(common_cells) == 0:
    raise ValueError(
        "No common cell barcodes between the RNA, Protein, and Metadata inputs; "
        "check that all three files index cells with the same barcode format."
    )

if len(common_cells) < adata_rna_pca.n_obs or \
   len(common_cells) < protein_df_pca.shape[0] or \
   len(common_cells) < cell_metadata_pca.shape[0]:
    print("Warning: Not all cells are common across loaded RNA, Protein, and Metadata. Subsetting to common cells.")

adata_rna_pca = adata_rna_pca[common_cells, :].copy()
protein_df_pca = protein_df_pca.loc[common_cells, :].copy()
cell_metadata_pca = cell_metadata_pca.loc[common_cells, :].copy()

# Replace .obs wholesale with the aligned metadata table (same barcodes, same order).
adata_rna_pca.obs = cell_metadata_pca.copy()

print(f"\nData aligned. Final shape for adata_rna_pca: {adata_rna_pca.shape}")
print("adata_rna_pca.obs head after merging metadata:")
print(adata_rna_pca.obs.head())


# Set Scanpy settings for this notebook
sc.settings.verbosity = 3
sc.settings.figdir = pca_figure_dir # Save PCA figures here
sc.set_figure_params(dpi=100, frameon=False, figsize=(5, 5), facecolor='white')

print("\nBlock PCA-1 finished: Setup, Paths, and Data Loading.")


--- Loading preprocessed data for PCA ---
Loaded RNA data for PCA: 7611 cells x 4000 HVGs
AnnData object with n_obs × n_vars = 7611 × 4000
    var: 'gene_ids', 'feature_types'

Loaded Protein data for PCA: 7611 cells x 17 proteins
Protein data head:
                    CD3_TotalSeqB  CD4_TotalSeqB  CD8a_TotalSeqB  \
AAACCCACATCGGTTA-1           30.0          119.0            19.0   
AAACCCAGTACCGCGT-1           18.0          207.0            10.0   
AAACCCAGTATCGAAA-1           18.0           11.0            17.0   
AAACCCAGTCGTCATA-1            5.0           14.0            14.0   
AAACCCAGTCTACACA-1           21.0         1014.0            29.0   

                    CD14_TotalSeqB  CD15_TotalSeqB  CD16_TotalSeqB  \
AAACCCACATCGGTTA-1           472.0           102.0           155.0   
AAACCCAGTACCGCGT-1          1289.0           128.0            72.0   
AAACCCAGTATCGAAA-1            20.0           124.0          1227.0   
AAACCCAGTCGTCATA-1            19.0           156.0          

In [None]:
# Block PCA-2: Data Normalization, Scaling, and Concatenation
# Normalizes and scales each modality separately, then stacks the two scaled
# matrices column-wise into `concatenated_features` (cells x [genes + proteins]).

print("\n--- Normalizing and Scaling Data for PCA ---")

# --- Normalize and Scale RNA data (HVG raw counts) ---
# The steps below overwrite adata_rna_pca.X in place, so running this cell a
# second time would normalize already-scaled values. Keep the raw counts in a
# layer on the first run and restore them on any re-run so the cell is idempotent.
if "counts" in adata_rna_pca.layers:
    adata_rna_pca.X = adata_rna_pca.layers["counts"].copy()
    # Drop the 'log1p' entry the previous run left in .uns.
    adata_rna_pca.uns.pop("log1p", None)
else:
    adata_rna_pca.layers["counts"] = adata_rna_pca.X.copy()
# Normalize per cell and log1p transform
sc.pp.normalize_total(adata_rna_pca, target_sum=1e4)
sc.pp.log1p(adata_rna_pca)
# Scale RNA data (to unit variance and zero mean)
sc.pp.scale(adata_rna_pca, max_value=10) # max_value clips extreme values
print("RNA data normalized and scaled.")
print(adata_rna_pca)

# --- Normalize and Scale Protein data ---
# CLR normalization is the common choice for protein counts; for this basic PCA
# baseline a plain log1p followed by scaling is used instead.

# Wrap the protein table in an AnnData so the same Scanpy functions can be used.
# It is rebuilt from protein_df_pca on every run, so this part is already idempotent.
adata_protein_pca = sc.AnnData(X=protein_df_pca.values,
                               obs=pd.DataFrame(index=protein_df_pca.index),
                               var=pd.DataFrame(index=protein_df_pca.columns))
adata_protein_pca.var_names_make_unique() # Just in case protein names had issues

print("\nProtein AnnData for PCA processing (adata_protein_pca):")
print(adata_protein_pca)

# 1. Log1p transform protein counts
sc.pp.log1p(adata_protein_pca)
# 2. Scale Protein data
sc.pp.scale(adata_protein_pca, max_value=10)
print("Protein data log1p transformed and scaled.")
print(adata_protein_pca)


# --- Concatenate Scaled RNA (HVGs) and Scaled Protein features ---
# Rows are matched purely by position in hstack, so the barcode order must be identical.
if not adata_rna_pca.obs_names.equals(adata_protein_pca.obs_names):
    raise ValueError("Cell order mismatch between RNA and Protein AnnData objects before concatenation!")

# Get the scaled data matrices
scaled_rna_matrix = adata_rna_pca.X # This is now scaled HVG RNA data
scaled_protein_matrix = adata_protein_pca.X # This is now scaled protein data

# Concatenate horizontally (features); densify first in case either matrix is sparse.
rna_data_for_concat = scaled_rna_matrix.toarray() if hasattr(scaled_rna_matrix, "toarray") else np.array(scaled_rna_matrix)
protein_data_for_concat = scaled_protein_matrix.toarray() if hasattr(scaled_protein_matrix, "toarray") else np.array(scaled_protein_matrix)

concatenated_features = np.hstack((rna_data_for_concat, protein_data_for_concat))
print(f"\nShape of concatenated RNA and Protein features: {concatenated_features.shape}")
# Expected: (n_common_cells, n_hvg_genes + n_proteins)
assert concatenated_features.shape == (
    adata_rna_pca.n_obs,
    adata_rna_pca.n_vars + adata_protein_pca.n_vars,
), "concatenated matrix does not have cells x (genes + proteins) shape"



--- Normalizing and Scaling Data for PCA ---
normalizing counts per cell
    finished (0:00:00)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
RNA data normalized and scaled.
AnnData object with n_obs × n_vars = 7611 × 4000
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_protein_counts', 'n_proteins', 'ground_truth_cell_type'
    var: 'gene_ids', 'feature_types', 'mean', 'std'
    uns: 'log1p'
    layers: 'counts'

Protein AnnData for PCA processing (adata_protein_pca):
AnnData object with n_obs × n_vars = 7611 × 17
Protein data log1p transformed and scaled.
AnnData object with n_obs × n_vars = 7611 × 17
    var: 'mean', 'std'
    uns: 'log1p'

Shape of concatenated RNA and Protein features: (7611, 4017)


In [None]:
# --- Joint re-scaling of the concatenated matrix (optional but recommended) ---
# Each modality was scaled on its own earlier; standardizing the combined matrix
# once more brings every column, gene or protein, to zero mean and unit variance
# before PCA.
print("\nScaling the concatenated feature matrix...")
scaler_concatenated = StandardScaler(with_mean=True)
scaled_concatenated_features = scaler_concatenated.fit_transform(concatenated_features)
print("Concatenated matrix scaled.")


# Feature names for the combined matrix: gene names first, then the protein
# names with an "ADT_" prefix, in the same column order used for concatenation.
concatenated_var_names = [
    *adata_rna_pca.var_names,
    *(f"ADT_{p}" for p in adata_protein_pca.var_names),
]

# Wrap the scaled matrix in its own AnnData, carrying over the per-cell
# metadata held in adata_rna_pca.obs.
adata_concatenated_pca = sc.AnnData(
    scaled_concatenated_features,
    obs=adata_rna_pca.obs.copy(),
)
adata_concatenated_pca.var_names = concatenated_var_names
adata_concatenated_pca.var_names_make_unique()


print("\nAnnData object for PCA (adata_concatenated_pca):", adata_concatenated_pca, sep="\n")

print("\nBlock PCA-2 finished: Data Normalization, Scaling, and Concatenation.")


Scaling the concatenated feature matrix...
Concatenated matrix scaled.

AnnData object for PCA (adata_concatenated_pca):
AnnData object with n_obs × n_vars = 7611 × 4017
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_protein_counts', 'n_proteins', 'ground_truth_cell_type'

Block PCA-2 finished: Data Normalization, Scaling, and Concatenation.


In [None]:
!pip3 install igraph
!pip3 install leidenalg

Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: leidenalg
Successfully installed leidenalg-0.10.2


In [None]:
# Block PCA-3: Perform PCA, UMAP, and Clustering
# Runs PCA on the scaled concatenated matrix, builds a neighbour graph on the
# PCA embedding, and derives both the UMAP layout and the Leiden clusters from
# that same graph. The annotated AnnData is written to disk at the end.

print("\n--- Performing PCA, UMAP, and Clustering ---")

# One seed shared by every stochastic step. 0 is the value the PCA call already
# used. NOTE(review): scanpy documents random_state=0 as the default for both
# umap and leiden, so passing it explicitly should not change results — confirm
# against the installed version.
pca_random_state = 0

# --- Perform PCA ---
# Decide on the number of principal components
n_pcs_pca = 50 # A common number, can be tuned based on elbow plot or variance explained
print(f"Performing PCA and keeping top {n_pcs_pca} components...")

# sklearn's PCA expects samples x features (which adata_concatenated_pca.X is)
pca_operator = PCA(n_components=n_pcs_pca, random_state=pca_random_state)
adata_concatenated_pca.obsm['X_pca'] = pca_operator.fit_transform(adata_concatenated_pca.X) # .X is the scaled concatenated data

# Keep the loadings and variance spectrum next to the embedding, under the keys
# Scanpy's own PCA uses, so they are saved with the object and can be inspected
# later when tuning n_pcs_pca.
adata_concatenated_pca.varm['PCs'] = pca_operator.components_.T # (n_features, n_components)
adata_concatenated_pca.uns['pca'] = {
    'variance': pca_operator.explained_variance_,
    'variance_ratio': pca_operator.explained_variance_ratio_,
}
cumulative_variance_ratio = np.cumsum(pca_operator.explained_variance_ratio_)

print("PCA performed. Latent space stored in adata_concatenated_pca.obsm['X_pca']")
print(f"Shape of X_pca: {adata_concatenated_pca.obsm['X_pca'].shape}")
print(f"Cumulative explained variance of the top {n_pcs_pca} PCs: {cumulative_variance_ratio[-1]:.3f}")


# --- UMAP based on PCA latent space ---
print("\nCalculating UMAP based on PCA embedding...")
sc.pp.neighbors(adata_concatenated_pca, n_pcs=n_pcs_pca, use_rep='X_pca', key_added="neighbors_pca")
sc.tl.umap(adata_concatenated_pca, neighbors_key="neighbors_pca", min_dist=0.3,
           random_state=pca_random_state) # Stores in .obsm['X_umap']
print("UMAP calculated and stored in .obsm['X_umap']")


# --- Leiden Clustering based on PCA latent space (using the same neighbors graph as UMAP) ---
# NOTE: this call emits a FutureWarning that Scanpy's default Leiden backend will
# change (to flavor="igraph", n_iterations=2). The current backend is kept on
# purpose so the cluster labels consumed by the benchmark do not change silently;
# switch deliberately and regenerate the downstream outputs when migrating.
pca_leiden_resolution = 0.5 # Choose a resolution, can be tuned
pca_clusters_key = "leiden_pca"
print(f"\nPerforming Leiden clustering with resolution {pca_leiden_resolution}...")
sc.tl.leiden(adata_concatenated_pca, resolution=pca_leiden_resolution, neighbors_key="neighbors_pca",
             key_added=pca_clusters_key, random_state=pca_random_state)
print(f"Leiden clustering performed. Results in .obs['{pca_clusters_key}']")
print("Cluster distribution:")
print(adata_concatenated_pca.obs[pca_clusters_key].value_counts())


# --- Save the AnnData object with PCA, UMAP, and Leiden results ---
adata_pca_results_path = os.path.join(pca_data_dir, "pbmc10k_pca_results.h5ad") # Save in pca_output_dir/data/
adata_concatenated_pca.write_h5ad(adata_pca_results_path, compression="lzf")
print(f"AnnData with PCA results saved to: {adata_pca_results_path}")

print("\nBlock PCA-3 finished: PCA, UMAP, and Clustering.")
print(adata_concatenated_pca)


--- Performing PCA, UMAP, and Clustering ---
Performing PCA and keeping top 50 components...
PCA performed. Latent space stored in adata_concatenated_pca.obsm['X_pca']
Shape of X_pca: (7611, 50)

Calculating UMAP based on PCA embedding...
computing neighbors
    finished: added to `.uns['neighbors_pca']`
    `.obsp['neighbors_pca_distances']`, distances for each pair of neighbors
    `.obsp['neighbors_pca_connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm)
    'umap', UMAP parameters (adata.uns) (0:00:31)
UMAP calculated and stored in .obsm['X_umap']

Performing Leiden clustering with resolution 0.5...
running Leiden clustering



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata_concatenated_pca, resolution=pca_leiden_resolution, neighbors_key="neighbors_pca", key_added=pca_clusters_key)


    finished: found 11 clusters and added
    'leiden_pca', the cluster labels (adata.obs, categorical) (0:00:04)
Leiden clustering performed. Results in .obs['leiden_pca']
Cluster distribution:
leiden_pca
0     1886
1     1267
2     1188
3      923
4      585
5      570
6      423
7      410
8      138
9      113
10     108
Name: count, dtype: int64
AnnData with PCA results saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/pbmc10k_pca_results.h5ad

Block PCA-3 finished: PCA, UMAP, and Clustering.
AnnData object with n_obs × n_vars = 7611 × 4017
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_protein_counts', 'n_proteins', 'ground_truth_cell_type', 'leiden_pca'
    uns: 'neighbors_pca', 'umap', 'leiden_pca'
    obsm: 'X_pca', 'X_umap'
    obsp: 'neighbors_pca_distances', 'neighbors_pca_connectivities'


In [None]:
# Block PCA-4: Visualize PCA Results and Save Outputs for Benchmark
# Saves two UMAP figures (ground truth / Leiden clusters) and exports the PCA
# embedding, the cluster labels, and the per-modality scaled matrices as CSV.


def _save_umap(adata, color_key, title, out_path, legend_loc="on data"):
    """Draw a UMAP coloured by ``color_key`` on its own 8x7 figure and save it as PNG.

    The axes are created here and handed to Scanpy through ``ax=``. Calling
    ``plt.figure(...)`` and then ``sc.pl.umap`` without ``ax`` makes Scanpy
    open a second figure: the requested size is ignored and the first figure
    is left behind empty (shown as "<Figure size 800x700 with 0 Axes>").

    Parameters
    ----------
    adata : AnnData with ``.obsm['X_umap']`` and ``color_key`` in ``.obs``.
    color_key : name of the ``.obs`` column used for colouring.
    title : figure title.
    out_path : destination file path.
    legend_loc : legend placement forwarded to ``sc.pl.umap``.
    """
    fig, ax = plt.subplots(figsize=(8, 7))
    sc.pl.umap(
        adata,
        color=color_key,
        title=title,
        ax=ax,
        show=False, frameon=False,
        legend_loc=legend_loc,
        legend_fontsize=8
    )
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # close exactly this figure so nothing leaks into the cell output


def _dense_frame(adata):
    """Return ``adata.X`` as a cells x features DataFrame, densifying sparse input."""
    matrix = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
    return pd.DataFrame(matrix, index=adata.obs_names, columns=adata.var_names)


print("\n--- Visualizing PCA Integration Results ---")

# --- 1. UMAP colored by Ground Truth Cell Type ---
if "ground_truth_cell_type" in adata_concatenated_pca.obs and "X_umap" in adata_concatenated_pca.obsm:
    umap_pca_gt_path = os.path.join(pca_figure_dir, "pca_umap_by_ground_truth.png")
    # Labels drawn on the data get unreadable with many categories; use a side legend then.
    n_ground_truth_types = adata_concatenated_pca.obs["ground_truth_cell_type"].nunique()
    _save_umap(
        adata_concatenated_pca,
        color_key="ground_truth_cell_type",
        title="PCA UMAP by Ground Truth Cell Type",
        out_path=umap_pca_gt_path,
        legend_loc="on data" if n_ground_truth_types < 28 else "right margin",
    )
    print(f"PCA UMAP by Ground Truth saved to {umap_pca_gt_path}")
else:
    print("Skipping PCA UMAP by Ground Truth: missing 'ground_truth_cell_type' in obs or 'X_umap' in obsm.")

# --- 2. UMAP colored by PCA Leiden Clusters ---
if pca_clusters_key in adata_concatenated_pca.obs and "X_umap" in adata_concatenated_pca.obsm:
    umap_pca_leiden_path = os.path.join(pca_figure_dir, "pca_umap_by_leiden.png")
    _save_umap(
        adata_concatenated_pca,
        color_key=pca_clusters_key,
        title=f"PCA UMAP by Leiden (res={pca_leiden_resolution})",
        out_path=umap_pca_leiden_path,
        legend_loc="on data",
    )
    print(f"PCA UMAP by Leiden clusters saved to {umap_pca_leiden_path}")
else:
    print("Skipping PCA UMAP by Leiden: missing Leiden clusters in obs or 'X_umap' in obsm.")


# --- Save Core Outputs for Benchmark ---
print("\n--- Saving PCA Core Outputs for Benchmark ---")

# 1. PCA Latent Space (X_pca)
if "X_pca" in adata_concatenated_pca.obsm:
    pca_latent_space_df = pd.DataFrame(adata_concatenated_pca.obsm["X_pca"], index=adata_concatenated_pca.obs_names)
    pca_latent_save_path = os.path.join(pca_data_dir, "pca_latent_space.csv")
    pca_latent_space_df.to_csv(pca_latent_save_path)
    print(f"PCA latent space saved to: {pca_latent_save_path}")
else:
    print("PCA latent space ('X_pca') not found. Cannot save.")


# 2. PCA Leiden Cluster Labels
if pca_clusters_key in adata_concatenated_pca.obs:
    pca_leiden_labels_df = pd.DataFrame({
        "cell_barcode": adata_concatenated_pca.obs_names,
        pca_clusters_key: adata_concatenated_pca.obs[pca_clusters_key]
    })
    pca_leiden_labels_save_path = os.path.join(pca_data_dir, "pca_leiden_labels.csv")
    # The barcode is already a regular column, so the index is not written again.
    pca_leiden_labels_df.to_csv(pca_leiden_labels_save_path, index=False)
    print(f"PCA Leiden cluster labels saved to: {pca_leiden_labels_save_path}")
else:
    print(f"PCA Leiden clusters ('{pca_clusters_key}') not found. Cannot save labels.")


# 3. Expression matrices for RNA-Protein Correlation
#    The individually normalized/scaled RNA (HVG) and protein matrices are saved
#    (rather than the concatenated matrix) for computing correlations after
#    clustering on the PCA embedding.

#    Scaled RNA HVG data lives in adata_rna_pca.X
if 'adata_rna_pca' in globals() and hasattr(adata_rna_pca, 'X'):
    scaled_rna_hvg_df = _dense_frame(adata_rna_pca)
    scaled_rna_hvg_save_path = os.path.join(pca_data_dir, "pca_scaled_rna_hvg.csv")
    scaled_rna_hvg_df.to_csv(scaled_rna_hvg_save_path)
    print(f"Scaled RNA HVG data (for PCA context) saved to: {scaled_rna_hvg_save_path}")
else:
    print("adata_rna_pca or its .X not found. Cannot save scaled RNA.")

#    Scaled Protein data lives in adata_protein_pca.X
if 'adata_protein_pca' in globals() and hasattr(adata_protein_pca, 'X'):
    scaled_protein_df = _dense_frame(adata_protein_pca)
    scaled_protein_save_path = os.path.join(pca_data_dir, "pca_scaled_protein.csv")
    scaled_protein_df.to_csv(scaled_protein_save_path)
    print(f"Scaled Protein data (for PCA context) saved to: {scaled_protein_save_path}")
else:
    print("adata_protein_pca or its .X not found. Cannot save scaled Protein.")


print("\nBlock PCA-4 finished: PCA Visualization and Benchmark Output Saving.")


--- Visualizing PCA Integration Results ---
PCA UMAP by Ground Truth saved to /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/figures/pca_umap_by_ground_truth.png
PCA UMAP by Leiden clusters saved to /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/figures/pca_umap_by_leiden.png

--- Saving PCA Core Outputs for Benchmark ---
PCA latent space saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/pca_latent_space.csv
PCA Leiden cluster labels saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/pca_leiden_labels.csv
Scaled RNA HVG data (for PCA context) saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/pca_scaled_rna_hvg.csv
Scaled Protein data (for PCA context) saved to: /content/drive/My Drive/CMML_ICA2/pca_pbmc_model/data/pca_scaled_protein.csv

Block PCA-4 finished: PCA Visualization and Benchmark Output Saving.


<Figure size 800x700 with 0 Axes>

<Figure size 800x700 with 0 Axes>