In [2]:
import scanpy as sc
import pandas as pd

# LongBench Gene Data

In [2]:
################################################################
# DATA
# data source: # https://www.biorxiv.org/content/10.1101/2025.09.11.675724v1.full - PacBio long reads
# files paths 
CARMELLE_DATA_DIR = "/mnt/lareaulab/carmelle/longread_sc/lung"
AATH_DATA_DIR = "/Users/aathreyakadambi/Documents/school/berkeley/fa25/cs194/final_project/data"
AATH_LAMBDA_DATA_DIR = "/home/ubuntu/workspace/data"
DATA_DIR = AATH_LAMBDA_DATA_DIR

# sn
sn_gene = f'{DATA_DIR}/longbench_data/sn/GSM9135509_pb_sn_genes_count_matrix.tsv.gz'
# sc
sc_gene = f'{DATA_DIR}/longbench_data/sc/GSM9135508_pb_sc_genes_count_matrix.tsv.gz'

# cell line data
sc_cell_lines_path = f'{DATA_DIR}/longbench_data/cell_line_labels/sc_cell_line_labels.csv'
sn_cell_lines_path = f'{DATA_DIR}/longbench_data/cell_line_labels/sn_cell_line_labels.csv'
################################################################

In [3]:
adata_sn_gene = sc.read_csv(sn_gene).T
adata_sc_gene = sc.read_csv(sc_gene).T

In [15]:
# cell metadata
print(adata_sn_gene.n_obs)
adata_sn_gene.obs.head()

10001


AACGAAAGTCGTCGGT
AAGAACACAGTACTAC
AATGACCAGCAGCCCT
AATGACCGTCCAATCA
AATGCCACAATTGCGT


In [None]:
# gene metadata
print(adata_sn_gene.n_vars)
adata_sn_gene.var.head()

49671


ENSG00000211952.3
ENSG00000211955.2
ENSG00000211956.2
ENSG00000211957.2
ENSG00000211958.2


In [12]:
adata_sn_gene.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(10001, 49671), dtype=float32)

In [17]:
# cell metadata
print(adata_sc_gene.n_obs)
adata_sc_gene.obs.head()

5002


AACAAGAGTGCCGGTT
ACAACCAGTTCCTAGA
ACCATTTCACAAGCTT
ACTACGAAGTTCGGTT
ACTTAGGAGCCTATCA


In [18]:
# gene metadata
print(adata_sc_gene.n_vars)
adata_sc_gene.var.head()

50394


ENSG00000212380.1
ENSG00000270814.1
ENSG00000280234.1
ENSG00000211938.2
ENSG00000211943.2


# LongBench Isoform Data

In [19]:
################################################################
# DATA
# data source: # https://www.biorxiv.org/content/10.1101/2025.09.11.675724v1.full - PacBio long reads
# files paths 
CARMELLE_DATA_DIR = "/mnt/lareaulab/carmelle/longread_sc/lung"
AATH_DATA_DIR = "/Users/aathreyakadambi/Documents/school/berkeley/fa25/cs194/final_project/data"
AATH_LAMBDA_DATA_DIR = "/home/ubuntu/workspace/data"
DATA_DIR = AATH_LAMBDA_DATA_DIR

# sn
sn_isoform = f'{DATA_DIR}/longbench_data/sn/GSM9135509_pb_sn_transcript_count_matrix.mtx.gz'
# sc
sc_isoform = f'{DATA_DIR}/longbench_data/sc/GSM9135508_pb_sc_transcript_count_matrix.mtx.gz'

sn_features_path = f'{DATA_DIR}/longbench_data/sn/GSM9135509_pb_sn_transcript_count_features.tsv.gz'
sn_cells_path = f'{DATA_DIR}/longbench_data/sn/GSM9135509_pb_sn_transcript_count_barcodes.tsv.gz'

sc_features_path = f'{DATA_DIR}/longbench_data/sc/GSM9135508_pb_sc_transcript_count_features.tsv.gz'
sc_cells_path = f'{DATA_DIR}/longbench_data/sc/GSM9135508_pb_sc_transcript_count_barcodes.tsv.gz'

# cell line data
sc_cell_lines_path = f'{DATA_DIR}/longbench_data/cell_line_labels/sc_cell_line_labels.csv'
sn_cell_lines_path = f'{DATA_DIR}/longbench_data/cell_line_labels/sn_cell_line_labels.csv'
################################################################

In [20]:
adata_sn_isoform = sc.read_mtx(sn_isoform)
adata_sc_isoform = sc.read_mtx(sc_isoform)

In [21]:
# cell metadata
print(adata_sn_isoform.n_obs)
adata_sn_isoform.obs.head()

10001


0
1
2
3
4


In [22]:
# gene metadata
print(adata_sn_isoform.n_vars)
adata_sn_isoform.var.head()

127013


0
1
2
3
4


In [23]:
adata_sn_isoform.X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 37515983 stored elements and shape (10001, 127013)>

In [24]:
# cell metadata
print(adata_sc_isoform.n_obs)
adata_sc_isoform.obs.head()

5002


0
1
2
3
4


In [25]:
# gene metadata
print(adata_sc_isoform.n_vars)
adata_sc_isoform.var.head()

128580


0
1
2
3
4


# LongBench Cell Lines Data

In [26]:
sc_cell_lines = pd.read_csv(sc_cell_lines_path)
sn_cell_lines = pd.read_csv(sn_cell_lines_path)

In [27]:
sc_cell_lines

Unnamed: 0,cell_id,assigned_cell_line,correlation
0,AACAAGAGTGCCGGTT,H2228,0.716725
1,ACAACCAGTTCCTAGA,H69,0.735068
2,ACCATTTCACAAGCTT,H2228,0.755043
3,ACTACGAAGTTCGGTT,H211,0.553623
4,ACTTAGGAGCCTATCA,H69,0.739172
...,...,...,...
4997,TTTCATGCAACTTGGT,HCC827,0.155921
4998,GCATGATGTTTGAAAG,H211,0.140278
4999,TGCTCGTCATCCTTCG,HCC827,0.167219
5000,AGTCACATCATTCTTG,HCC827,0.153815


In [28]:
sn_cell_lines

Unnamed: 0,cell_id,assigned_cell_line,correlation
0,AACGAAAGTCGTCGGT,H69,0.411415
1,AAGAACACAGTACTAC,SHP77,0.467653
2,AATGACCAGCAGCCCT,H146,0.397853
3,AATGACCGTCCAATCA,SHP77,0.495637
4,AATGCCACAATTGCGT,H211,0.505111
...,...,...,...
9996,GTGTAACCAATCGCAT,H146,0.358700
9997,TACCGAAAGACCTCCG,H2228,0.412986
9998,TCGACCTTCAAAGAAC,H2228,0.410189
9999,TCTTTGACAAATTAGG,H146,0.393097


# CRC Gene Data

In [33]:
################################################################
# DATA
# data source: # https://www.biorxiv.org/content/10.1101/2025.09.11.675724v1.full - PacBio long reads
# files paths 
CARMELLE_DATA_DIR = "/mnt/lareaulab/carmelle/longread_sc/lung"
AATH_DATA_DIR = "/Users/aathreyakadambi/Documents/school/berkeley/fa25/cs194/final_project/data"
AATH_LAMBDA_DATA_DIR = "/home/ubuntu/workspace/data"
DATA_DIR = AATH_LAMBDA_DATA_DIR

crc_gene_counts = f'{DATA_DIR}/crc/PacBio-gene_counts_matrix_ad.h5ad'
crc_gene_data_mat =  f'{DATA_DIR}/crc/PacBio-gene_data_matrix_ad.h5ad'
################################################################

In [35]:
adata_gene_counts = sc.read_h5ad(crc_gene_counts)
adata_gene_data_mat = sc.read_h5ad(crc_gene_data_mat)

In [36]:
adata_gene_counts.X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 810861 stored elements and shape (287458, 20151)>

In [37]:
# cell metadata
print(adata_gene_counts.n_obs)
print(adata_gene_counts.obs.columns)
adata_gene_counts.obs.head()

287458
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'library_id_IL',
       'condition', 'ClusterTop', 'ClusterMidway', 'ClusterMidway2',
       'ClusterFull', 'ClusterFull2', 'library_id_PB'],
      dtype='object')


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,library_id_IL,condition,ClusterTop,ClusterMidway,ClusterMidway2,ClusterFull,ClusterFull2,library_id_PB
TGATCTTCATAATGCC-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
TTCACCGAGCGGTATG-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
CTTACCGTCAATCGGT-6,,,,,,,,_PB_CB_filtered,,,PS018-T
ATTGGTCAGGATTCTA-2,,,,,,,,_PB_CB_filtered,,,PS017-T1.1
ACATCGATCCGTTGAA-4,,,,,,,,_PB_CB_filtered,,,PS017-T2


In [38]:
# gene metadata
print(adata_gene_counts.n_vars)
adata_gene_counts.var.head()

20151


AKAP17A
ANAPC1P2
ANP32A-IT1
ARHGAP27P1
ASAP1-IT1


In [40]:
adata_gene_data_mat.X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 810861 stored elements and shape (287458, 20151)>

In [None]:
adata_gene_data_mat.X - adata_gene_counts.X # not the same

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 810861 stored elements and shape (287458, 20151)>

In [41]:
# cell metadata
print(adata_gene_data_mat.n_obs)
print(adata_gene_data_mat.obs.columns)
adata_gene_data_mat.obs.head()

287458
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'library_id_IL',
       'condition', 'ClusterTop', 'ClusterMidway', 'ClusterMidway2',
       'ClusterFull', 'ClusterFull2', 'library_id_PB'],
      dtype='object')


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,library_id_IL,condition,ClusterTop,ClusterMidway,ClusterMidway2,ClusterFull,ClusterFull2,library_id_PB
TGATCTTCATAATGCC-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
TTCACCGAGCGGTATG-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
CTTACCGTCAATCGGT-6,,,,,,,,_PB_CB_filtered,,,PS018-T
ATTGGTCAGGATTCTA-2,,,,,,,,_PB_CB_filtered,,,PS017-T1.1
ACATCGATCCGTTGAA-4,,,,,,,,_PB_CB_filtered,,,PS017-T2


In [42]:
# gene metadata
print(adata_gene_data_mat.n_vars)
adata_gene_data_mat.var.head()

20151


AKAP17A
ANAPC1P2
ANP32A-IT1
ARHGAP27P1
ASAP1-IT1


# CRC Isoform Data

In [59]:
################################################################
# DATA
# data source: # https://www.biorxiv.org/content/10.1101/2025.09.11.675724v1.full - PacBio long reads
# files paths 
CARMELLE_DATA_DIR = "/mnt/lareaulab/carmelle/longread_sc/lung"
AATH_DATA_DIR = "/Users/aathreyakadambi/Documents/school/berkeley/fa25/cs194/final_project/data"
AATH_LAMBDA_DATA_DIR = "/home/ubuntu/workspace/data"
DATA_DIR = AATH_LAMBDA_DATA_DIR

crc_isoform_counts = f'{DATA_DIR}/crc/PacBio-isoform_counts_matrix_ad.h5ad'
crc_isoform_data_mat =  f'{DATA_DIR}/crc/PacBio-isoform_data_matrix_ad.h5ad'
################################################################

In [60]:
adata_isoform_counts = sc.read_h5ad(crc_isoform_counts)
adata_isoform_data_mat = sc.read_h5ad(crc_isoform_data_mat)

In [61]:
adata_isoform_counts.X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 847698 stored elements and shape (287458, 125205)>

In [55]:
# cell metadata
print(adata_isoform_counts.n_obs)
print(adata_isoform_counts.obs.columns)
adata_isoform_counts.obs.head()

287458
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'library_id_IL',
       'condition', 'ClusterTop', 'ClusterMidway', 'ClusterMidway2',
       'ClusterFull', 'ClusterFull2', 'library_id_PB'],
      dtype='object')


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,library_id_IL,condition,ClusterTop,ClusterMidway,ClusterMidway2,ClusterFull,ClusterFull2,library_id_PB
TGATCTTCATAATGCC-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
TTCACCGAGCGGTATG-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
CTTACCGTCAATCGGT-6,,,,,,,,_PB_CB_filtered,,,PS018-T
ATTGGTCAGGATTCTA-2,,,,,,,,_PB_CB_filtered,,,PS017-T1.1
ACATCGATCCGTTGAA-4,,,,,,,,_PB_CB_filtered,,,PS017-T2


In [57]:
# gene metadata
print(adata_isoform_counts.n_vars)
adata_isoform_counts.var.head()

125205


Unnamed: 0,chrom,start,end,GeneSymbol,SupportingStructCategory,WithinCAGEPeak,WithinPolyAPeak,NExons
TCONS_00011259,chr1,356239,358036,novelGene_RefSeq__LOC124900618_AS,antisense,False,False,2
TCONS_00011260,chr1,629083,629431,GENCODE__ENSG00000225972.1,full-splice_match,False,True,1
TCONS_00007975,chr1,629315,629431,GENCODE__ENSG00000225972.1,full-splice_match,False,True,1
TCONS_00000001,chr1,629639,629997,GENCODE__ENSG00000225630.1,full-splice_match,True,True,1
TCONS_00009624,chr1,631859,632028,GENCODE__ENSG00000237973.1,full-splice_match,False,False,1


In [62]:
adata_isoform_data_mat.X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 847698 stored elements and shape (287458, 125205)>

In [63]:
# cell metadata
print(adata_isoform_data_mat.n_obs)
print(adata_isoform_data_mat.obs.columns)
adata_isoform_data_mat.obs.head()

287458
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'library_id_IL',
       'condition', 'ClusterTop', 'ClusterMidway', 'ClusterMidway2',
       'ClusterFull', 'ClusterFull2', 'library_id_PB'],
      dtype='object')


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,library_id_IL,condition,ClusterTop,ClusterMidway,ClusterMidway2,ClusterFull,ClusterFull2,library_id_PB
TGATCTTCATAATGCC-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
TTCACCGAGCGGTATG-4,,,,,,,,_PB_CB_filtered,,,PS017-T2
CTTACCGTCAATCGGT-6,,,,,,,,_PB_CB_filtered,,,PS018-T
ATTGGTCAGGATTCTA-2,,,,,,,,_PB_CB_filtered,,,PS017-T1.1
ACATCGATCCGTTGAA-4,,,,,,,,_PB_CB_filtered,,,PS017-T2


In [64]:
# gene metadata
print(adata_isoform_data_mat.n_vars)
adata_isoform_data_mat.var.head()

125205


Unnamed: 0,chrom,start,end,GeneSymbol,SupportingStructCategory,WithinCAGEPeak,WithinPolyAPeak,NExons
TCONS_00011259,chr1,356239,358036,novelGene_RefSeq__LOC124900618_AS,antisense,False,False,2
TCONS_00011260,chr1,629083,629431,GENCODE__ENSG00000225972.1,full-splice_match,False,True,1
TCONS_00007975,chr1,629315,629431,GENCODE__ENSG00000225972.1,full-splice_match,False,True,1
TCONS_00000001,chr1,629639,629997,GENCODE__ENSG00000225630.1,full-splice_match,True,True,1
TCONS_00009624,chr1,631859,632028,GENCODE__ENSG00000237973.1,full-splice_match,False,False,1
