In [1]:
import numpy as np
import pandas as pd
import gget
import glob
import networkx as nx
import os
import time
import seaborn as sns
import gget
import scanpy as sc
import anndata as an
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import scipy
import scanpy.external as sce

sc.settings.verbosity = 3  

# Load sc data

In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/scanpy/merged_scfib.h5ad"
adata = sc.read_h5ad(fpath)
adata.obs['UMAP 1'] = adata.obsm['X_umap'][:, 0]
adata.obs['UMAP 2'] = adata.obsm['X_umap'][:, 1]
sc.logging.print_memory_usage()
adata

Memory usage: current 7.93 GB, difference +7.93 GB


AnnData object with n_obs × n_vars = 16130 × 13579
    obs: 'cluster_str', 'dataset', 'pred_phase', 'leiden', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'new_cluster', 'UMAP 1', 'UMAP 2'
    var: 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'cluster_str_colors', 'dataset_colors', 'hvg', 'log1p', 'neighbors', 'new_cluster', 'new_cluster_colors', 'pca', 'pred_phase_colors', 'umap'
    obsm: 'X_pca', 'X_scanorama', 'X_umap'
    varm: 'PCs'
    layers: 'combat', 'counts', 'log_norm', 'magic', 'norm'
    obsp: 'connectivities', 'distances'

In [3]:
adata.obs_names[-10]

'TGTGCTGGTTGCTTAA_scFib'

# Load isoquant data

In [4]:
fpaths = {
    'scFib' : '/scratch/indikar_root/indikar1/shared_data/single_cell_fibroblast/isoforms/isoform_data.parquet',
    'iHSC' : '/scratch/indikar_root/indikar1/cstansbu/HSC/isoforms/isoform_data.parquet',
}

columns = [
    'cell_id',
    'UMI',
    'gene_name',
    'transcript_name',
]


result = []

for k, v in fpaths.items():
    start_time = time.time()
    print(f"\nProcessing: {k}")
    print(f"Loading parquet file: {v}")
    df = pd.read_parquet(v, columns=columns)
    print(f"  - Initial shape: {df.shape}")

    print("Dropping duplicate rows...")
    df = df.drop_duplicates()
    print(f"  - Shape after deduplication: {df.shape}")

    print("Creating 'barcode' column...")
    df['barcode'] = df['cell_id'] + f"_{k}"
    del df['cell_id']

    print("Filtering rows based on 'adata.obs_names'...") 
    df = df[df['barcode'].isin(adata.obs_names)]
    print(f"  - Final shape: {df.shape}")
    
    
    print("Grouping transcripts by UMI...") 
    x = df.groupby(['barcode', 'transcript_name'])['UMI'].nunique().reset_index()
    
    print("Pivoting transcripts as matrix...") 
    x = pd.pivot_table(
        x, 
        index='barcode', 
        columns='transcript_name',
        values='UMI',
        fill_value=0.0,
    )

    print(f"{x.shape=}")
    result.append(x)

    end_time = time.time()
    print(f"Time taken for {k}: {end_time - start_time:.2f} seconds")
    
    
result = pd.concat(result)
result = result.fillna(0.0)
print(f"{result.shape=}")
result.head()


Processing: scFib
Loading parquet file: /scratch/indikar_root/indikar1/shared_data/single_cell_fibroblast/isoforms/isoform_data.parquet
  - Initial shape: (75710721, 4)
Dropping duplicate rows...
  - Shape after deduplication: (75710721, 4)
Creating 'barcode' column...
Filtering rows based on 'adata.obs_names'...
  - Final shape: (75004573, 4)
Grouping transcripts by UMI...
Pivoting transcripts as matrix...
x.shape=(7748, 82648)
Time taken for scFib: 421.52 seconds

Processing: iHSC
Loading parquet file: /scratch/indikar_root/indikar1/cstansbu/HSC/isoforms/isoform_data.parquet
  - Initial shape: (47827803, 4)
Dropping duplicate rows...
  - Shape after deduplication: (47827803, 4)
Creating 'barcode' column...
Filtering rows based on 'adata.obs_names'...
  - Final shape: (47634864, 4)
Grouping transcripts by UMI...
Pivoting transcripts as matrix...
x.shape=(8382, 84314)
Time taken for iHSC: 317.88 seconds
result.shape=(16130, 99720)


transcript_name,A1BG-201,A1BG-202,A1BG-203,A1BG-204,A2M-201,A2M-202,A2M-204,A2M-205,A2M-206,A2M-207,...,ZW10-202,ZWILCH-204,ZWILCH-206,ZWILCH-213,ZWILCH-214,ZZEF1-203,ZZEF1-205,ZZEF1-207,ZZEF1-209,ZZZ3-207
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCAAAGGGTAGCA_scFib,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCAAAGTAAGGGT_scFib,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCATTCAGGTAGG_scFib,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCATTCCAGCCCT_scFib,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCATTCGTGACCG_scFib,0,0,0,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# BUILD BDATA

In [5]:
bdata = an.AnnData(csr_matrix(result.to_numpy()))
bdata.obs_names = result.index
bdata.var_names = result.columns

bdata.layers['raw_counts'] = csr_matrix(bdata.X.copy())

bdata

AnnData object with n_obs × n_vars = 16130 × 99720
    layers: 'raw_counts'

In [6]:
var = result.sum(axis=0).reset_index()
var.columns = ['transcript_name', 'transcript_count']
var['gene_name'] = var['transcript_name'].apply(lambda x: x.split("-")[0])
var['gene_count'] = var.groupby('gene_name')['transcript_count'].transform('sum')
var['transcript_percent'] = var['transcript_count'] / var['gene_count']
var = var.set_index('transcript_name')
var.head()

Unnamed: 0_level_0,transcript_count,gene_name,gene_count,transcript_percent
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1BG-201,4.0,A1BG,11582.0,0.000345
A1BG-202,9.0,A1BG,11582.0,0.000777
A1BG-203,68.0,A1BG,11582.0,0.005871
A1BG-204,11501.0,A1BG,11582.0,0.993006
A2M-201,1368.0,A2M,1514.0,0.903567


In [7]:
obs = result.sum(axis=1).reset_index()
obs.columns = ['barcode', 'transcript_count']
obs = obs.set_index('barcode')

obs = pd.merge(
    obs, 
    adata.obs.copy(),
    how='left',
    right_index=True,
    left_index=True,
)

obs.head()

Unnamed: 0_level_0,transcript_count,cluster_str,dataset,pred_phase,leiden,n_genes,n_genes_by_counts,total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,pct_counts_in_top_500_genes,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,total_counts_hb,pct_counts_hb,new_cluster,UMAP 1,UMAP 2
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCAAAGGGTAGCA_scFib,14275.0,,scFib,G2M,0,3700,3700,16000,29.8875,40.80625,...,65.70625,0,0.0,47,0.29375,0,0.0,1,0.286714,14.078166
AAACCAAAGTAAGGGT_scFib,7402.0,,scFib,G1,1,2278,2278,7862,34.991096,46.731112,...,71.712033,0,0.0,14,0.178072,0,0.0,2,3.464725,5.875223
AAACCATTCAGGTAGG_scFib,13329.0,,scFib,G2M,0,3713,3713,15607,28.384699,38.75825,...,64.64407,0,0.0,49,0.313962,1,0.006407,1,0.63837,12.075577
AAACCATTCCAGCCCT_scFib,10586.0,,scFib,S,1,2446,2446,10504,38.442498,51.266184,...,75.361767,0,0.0,16,0.152323,0,0.0,2,3.479215,7.809453
AAACCATTCGTGACCG_scFib,7787.0,,scFib,G1,1,2580,2580,8774,29.735582,40.733987,...,67.996353,0,0.0,28,0.319125,0,0.0,2,3.956977,6.376747


In [8]:
bdata.var = var
bdata.obs = obs

bdata

AnnData object with n_obs × n_vars = 16130 × 99720
    obs: 'transcript_count', 'cluster_str', 'dataset', 'pred_phase', 'leiden', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'new_cluster', 'UMAP 1', 'UMAP 2'
    var: 'transcript_count', 'gene_name', 'gene_count', 'transcript_percent'
    layers: 'raw_counts'

In [9]:
# Normalizing to median total counts
sc.pp.normalize_total(bdata, target_sum=1e6)
bdata.layers["norm"] = csr_matrix(bdata.X.astype('float32').copy())
sc.pp.log1p(bdata)
bdata.layers["log_norm"] = csr_matrix(bdata.X.astype('float32').copy())

normalizing counts per cell
    finished (0:00:00)


In [10]:
sc.pp.highly_variable_genes(
    bdata,
)

sc.tl.pca(
    bdata,
    mask_var='highly_variable',
)

sc.pp.neighbors(bdata,)

sc.tl.umap(
    bdata,
)

extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
computing PCA
    with n_comps=50
    finished (0:00:05)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:28)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)


In [11]:
# combat
sc.pp.combat(
    bdata, 
    key='dataset',
    covariates=['total_counts', 'n_genes'],
)

bdata.X = csr_matrix(bdata.X.astype('float32'))
bdata.layers["combat"] = bdata.X.copy()

# set current layer
bdata.X = bdata.layers["raw_counts"]
bdata

... storing 'gene_name' as categorical


Standardizing Data across genes.

Found 2 batches

Found 2 numerical variables:
	total_counts, n_genes

Fitting L/S model and finding priors

Finding parametric adjustments

Adjusting data



AnnData object with n_obs × n_vars = 16130 × 99720
    obs: 'transcript_count', 'cluster_str', 'dataset', 'pred_phase', 'leiden', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'new_cluster', 'UMAP 1', 'UMAP 2'
    var: 'transcript_count', 'gene_name', 'gene_count', 'transcript_percent', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'raw_counts', 'norm', 'log_norm', 'combat'
    obsp: 'distances', 'connectivities'

# MAGIC

In [12]:
sce.pp.magic(
    bdata,
    name_list='all_genes',
    solver='approximate',
    copy=False,
)

bdata.layers["magic"] = csr_matrix(bdata.X.astype('float32').copy())

computing MAGIC
    finished (0:01:59)


In [13]:
bdata

AnnData object with n_obs × n_vars = 16130 × 99720
    obs: 'transcript_count', 'cluster_str', 'dataset', 'pred_phase', 'leiden', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'new_cluster', 'UMAP 1', 'UMAP 2'
    var: 'transcript_count', 'gene_name', 'gene_count', 'transcript_percent', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'raw_counts', 'norm', 'log_norm', 'combat', 'magic'
    obsp: 'distances', 'connectivities'

In [14]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [15]:
outpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/scanpy/merged_isoforms.h5ad"
bdata.write(outpath)

bdata

AnnData object with n_obs × n_vars = 16130 × 99720
    obs: 'transcript_count', 'cluster_str', 'dataset', 'pred_phase', 'leiden', 'n_genes', 'n_genes_by_counts', 'total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'new_cluster', 'UMAP 1', 'UMAP 2'
    var: 'transcript_count', 'gene_name', 'gene_count', 'transcript_percent', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'raw_counts', 'norm', 'log_norm', 'combat', 'magic'
    obsp: 'distances', 'connectivities'

In [None]:
break

In [None]:
df.head()

In [None]:
x = df.groupby(['barcode', 'transcript_name'])['UMI'].nunique().reset_index()

x = pd.pivot_table(
    x, 
    index='barcode', 
    columns='transcript_name',
    values='UMI',
    fill_value=0.0,
)

print(f"{x.shape=}")
x.head()

In [None]:
list(df.columns)

In [None]:
break

In [None]:
break

In [None]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/isoforms/isoform_data.parquet"

df = pd.read_parquet(fpath)
df = df[df['cell_id'].isin(adata.obs_names)] # filter out undetected CBs

# add nulti-map flag
df['n_isoforms'] = df.groupby('gene_name')['transcript_name'].transform('nunique')

print(f"{df.shape=}")
df = pd.merge(
    df, 
    adata.obs,
    how='left',
    left_on='cell_id',
    right_index=True,
)

outpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/isoforms/isoquant.parquet"
df.to_parquet(outpath, index=False)
df.head()