# This script loads the publicly available Tabula Sapiens AnnData object, which contains gene expression as well as cell metadata.

### The main purpose is to subset the regtools junction output files we generated from the raw data down to the cells in the published dataset (there are far more raw BAM files than the final number of cells reported in the paper and in the publicly available gene expression AnnData file)

### The final number is 40884 cells that have both the metadata/gene expression reported in the paper and splice junction data

In [1]:
import os

# Look up the directory this notebook is executing from and print it for the run log
current_dir = os.getcwd()
print("Current working directory:", current_dir)

import pandas as pd
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import matplotlib.pyplot as plt # import matplotlib to visualize our qc metrics
import subprocess
import sys
import seaborn as sns
import numpy as np
from scipy.sparse import csr_matrix
import scanpy.external as sce
from sklearn.metrics import silhouette_score
import datetime
import numpy as np
import harmonypy as hm
import scanorama
import gffutils
from collections import defaultdict
import scipy.sparse as sp
from collections import defaultdict
import re

Current working directory: /gpfs/commons/home/kisaev/Leaflet-analysis/Human_Splicing_Foundation/metadata


In [2]:
# Load the Tabula Sapiens gene expression AnnData object.
# The original (v1) release is at
#   /gpfs/commons/datasets/controlled/CZI/tabula-sapiens/TS_figshare/TabulaSapiens.h5ad
# but the merged v2 object is the one read here.
print("Now reading in Tabula Sapiens V2 data...")
tabsap_adata = sc.read_h5ad(
    "/gpfs/commons/datasets/controlled/CZI/tabula-sapiens/TabulaSapiens_v2/GeneExpressionMatrices/merged_tabula_sapiens.h5ad"
)

# Add a constant `dataset` column to obs so rows can be identified by source
tabsap_adata.obs["dataset"] = "tabula_sapiens"

# Store an independent copy of X, as loaded, in its own layer so it is not
# affected by later in-place changes to X.
# NOTE(review): the layer is named "raw_counts" -- confirm that X in this file
# really holds un-normalized counts.
tabsap_adata.layers["raw_counts"] = tabsap_adata.X.copy()

Now reading in Tabula Sapiens V2 data...


In [3]:
# Save the Tabula Sapiens metadata columns needed to link cells back to their
# BAM files and junction files
output_dir = "/gpfs/commons/datasets/controlled/CZI/tabula-sapiens/TabulaSapiens_v2/"
metadata_columns = [
    "old_index",
    "donor",
    "tissue",
    "anatomical_position",
    "library_plate",
    "cdna_well",
    "assay",
]
ts_metadata = tabsap_adata.obs[metadata_columns]

# Only the selected columns are written; the obs index is left out of the file
metadata_csv = os.path.join(output_dir, "metadata.csv")
ts_metadata.to_csv(metadata_csv, index=False)

In [4]:
# Read the list of junction files: a header-less, tab-separated text file
# loaded as a single column named "full_path"
juncs = "/gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/HUMAN_SPLICING_FOUNDATION/ATSE_mapper/junction_files_TS_20250920.txt"
juncs_df = pd.read_csv(juncs, sep="\t", header=None).set_axis(["full_path"], axis=1)

def extract_core_id(path):
    """Return the identifier embedded in a junction file path.

    The identifier is the text found between ``_raw_per_`` and the next
    ``_junctions_with_barcodes`` (at least one character, matched lazily).
    If the path does not contain that pattern, a warning is printed and
    ``None`` is returned.
    """
    found = re.search(r"_raw_per_(.+?)_junctions_with_barcodes", path)
    if found is None:
        print(f"Warning: Could not extract from {path}")
        return None
    return found.group(1)

# Derive the 'old_index' join key for every junction file from its path
juncs_df["old_index"] = juncs_df["full_path"].map(extract_core_id)

# Preview the first few rows
preview = juncs_df.head()
print(preview)

                                           full_path  \
0  /gpfs/commons/projects/CZI-tabula-sapiens/Leaf...   
1  /gpfs/commons/projects/CZI-tabula-sapiens/Leaf...   
2  /gpfs/commons/projects/CZI-tabula-sapiens/Leaf...   
3  /gpfs/commons/projects/CZI-tabula-sapiens/Leaf...   
4  /gpfs/commons/projects/CZI-tabula-sapiens/Leaf...   

                                           old_index  
0  TSP3_smartseq2_B114669_G3_B133703_G3_Eye_noCor...  
1  TSP14_smartseq2_B134540_L9_B002625_L9_LI_proxi...  
2  TSP14_smartseq2_B134101_K21_D102110_K21_Saliva...  
3  TSP14_smartseq2_B134025_F7_D101267_F7_LymphNod...  
4  TSP7_smartseq2_B134144_O8_B133911_O8_Tongue_an...  


In [5]:
# Keep only the junction files whose 'old_index' appears in the Tabula Sapiens
# metadata (inner join), attaching the metadata columns to each path.
# pandas matches null join keys against each other, so junction files whose ID
# could not be parsed (old_index is None) are dropped first; otherwise they
# would be paired with any metadata rows that have no old_index.
juncs_with_id = juncs_df.dropna(subset=["old_index"])
full_paths = juncs_with_id.merge(ts_metadata, how="inner", on="old_index")

In [6]:
# Write only the matched junction file paths (one row per path, no header and
# no index column)
subset_paths_file = "/gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/HUMAN_SPLICING_FOUNDATION/ATSE_mapper/junction_files_TS_subset.txt"
full_paths[["full_path"]].to_csv(
    subset_paths_file,
    sep="\t",
    index=False,
    header=False,
)

In [7]:
# Report the (rows, columns) size of the merged junction/metadata table
n_rows, n_cols = full_paths.shape
print((n_rows, n_cols))

(40884, 8)
