This is a test notebook for checking that notebooks can be run remotely.

### Global Imports

In [2]:
# Third-party dependencies used throughout the notebook.
import torch
import anndata as ad
import pandas as pd

# Report whether PyTorch can see a CUDA device in this kernel.
print("GPU available: ", torch.cuda.is_available())

GPU available:  True


### Set up global settings

In [3]:
# Directory that holds the input data.
# NOTE(review): this is an absolute, machine-specific path — adjust when running elsewhere.
DATA_FOLDER = r"/home/bar/Data"

# Full path of the .h5ad file that the notebook loads.
data_file_path = "/".join(
    (DATA_FOLDER, "allsamples_genelevel_RNAandChIC_counts.filt-norm.h5ad")
)

### Reading the data

In [4]:
# Load the AnnData object from the .h5ad file configured above.
adata = ad.read_h5ad(filename=data_file_path)

In [8]:
# Explore the data: print the AnnData summary
# (dimensions plus the names of the obs / var columns and the layers).
print(adata)

AnnData object with n_obs × n_vars = 37265 × 24877
    obs: 'obs_names', 'sample', 'plate_batch', 'barcode_vasa', 'day', 'rep', 'mark', 'plate_grp', 'barcode_nla', 'seqrun', 'exp_batch', 'day_rep', 'batch_day_rep', 'sizefactors_deconv_spliced', 'sizefactors_deconv_unspliced', 'sizefactors_deconv_chic', 'sizefactors_deconv_total'
    var: 'gene_maxlen', 'tx_maxlen', 'gene_meanlen', 'tx_meanlen', 'Symbol', 'Biotype', 'Chromosome', 'sum_exon_length', 'sum_intron_length', 'sum_amb_length', 'total_gene_length', 'eff_gene_length', 'eff_tx_length'
    layers: 'chic', 'spliced', 'total', 'unspliced'


In [9]:
# Dimensions of the dataset as (number of cells, number of genes).
print((adata.n_obs, adata.n_vars))

(37265, 24877)


In [10]:
# Gene identifiers, i.e. the index of the per-gene (var) table.
print(adata.var.index)

Index(['ENSMUSG00000030917.14', 'ENSMUSG00000044948.18',
       'ENSMUSG00000001014.7', 'ENSMUSG00000035296.15', 'ENSMUSG00000038916.8',
       'ENSMUSG00000034216.13', 'ENSMUSG00000049303.11',
       'ENSMUSG00000040648.16', 'ENSMUSG00000095576.6',
       'ENSMUSG00000037012.19',
       ...
       'ENSMUSG00000028797.21', 'ENSMUSG00000039787.14',
       'ENSMUSG00000022313.11', 'ENSMUSG00000028940.3', 'ENSMUSG00000101930.2',
       'ENSMUSG00000025867.9', 'ENSMUSG00000103833.2', 'ENSMUSG00000020891.12',
       'ENSMUSG00000036526.9', 'ENSMUSG00000045854.5'],
      dtype='object', length=24877)


In [11]:
# Cell identifiers, i.e. the index of the per-cell (obs) table.
print(adata.obs.index)

Index(['Gastd5-rep2-H3K27me3-1::AACCTGCT', 'Gastd5-rep2-H3K27me3-1::AATGGTGG',
       'Gastd5-rep2-H3K27me3-1::AGTACGTG', 'Gastd5-rep2-H3K27me3-1::ATTTAGCG',
       'Gastd5-rep2-H3K27me3-1::AGCAGAAC', 'Gastd5-rep2-H3K27me3-1::AAACAGGC',
       'Gastd5-rep2-H3K27me3-1::ACGTATCC', 'Gastd5-rep2-H3K27me3-1::AATCATGC',
       'Gastd5-rep2-H3K27me3-1::AACTCTGG', 'Gastd5-rep2-H3K27me3-1::AAAGCGGA',
       ...
       'Gastd7-rep3-H3K4me3-6::TGTTAAGC', 'Gastd7-rep3-H3K4me3-6::TGATCCTC',
       'Gastd7-rep3-H3K4me3-6::TTTGAGGG', 'Gastd7-rep3-H3K4me3-6::TCAGCACT',
       'Gastd7-rep3-H3K4me3-6::TCCGAACA', 'Gastd7-rep3-H3K4me3-6::TTCAGCGT',
       'Gastd7-rep3-H3K4me3-6::TGACCAAG', 'Gastd7-rep3-H3K4me3-6::TCTGTGAG',
       'Gastd7-rep3-H3K4me3-6::TGTACCAA', 'Gastd7-rep3-H3K4me3-6::TTCGGAAA'],
      dtype='object', length=37265)


In [12]:
# Preview the per-cell metadata (first five rows).
# Use the notebook's rich display rather than print(): the plain-text form of
# this wide table is wrapped into several backslash-continued chunks, which is
# hard to read.
display(adata.obs.head())

                                 obs_names                  sample  \
Gastd5-rep2-H3K27me3-1::AACCTGCT  AACCTGCT  Gastd5-rep2-H3K27me3-1   
Gastd5-rep2-H3K27me3-1::AATGGTGG  AATGGTGG  Gastd5-rep2-H3K27me3-1   
Gastd5-rep2-H3K27me3-1::AGTACGTG  AGTACGTG  Gastd5-rep2-H3K27me3-1   
Gastd5-rep2-H3K27me3-1::ATTTAGCG  ATTTAGCG  Gastd5-rep2-H3K27me3-1   
Gastd5-rep2-H3K27me3-1::AGCAGAAC  AGCAGAAC  Gastd5-rep2-H3K27me3-1   

                                 plate_batch barcode_vasa day   rep      mark  \
Gastd5-rep2-H3K27me3-1::AACCTGCT          36     AACCTGCT   5  rep2  H3K27me3   
Gastd5-rep2-H3K27me3-1::AATGGTGG          36     AATGGTGG   5  rep2  H3K27me3   
Gastd5-rep2-H3K27me3-1::AGTACGTG          36     AGTACGTG   5  rep2  H3K27me3   
Gastd5-rep2-H3K27me3-1::ATTTAGCG          36     ATTTAGCG   5  rep2  H3K27me3   
Gastd5-rep2-H3K27me3-1::AGCAGAAC          36     AGCAGAAC   5  rep2  H3K27me3   

                                 plate_grp barcode_nla   seqrun exp_batch  \
Gastd5-rep2-H3K

In [13]:
# Preview the per-gene metadata (first five rows).
# Use the notebook's rich display rather than print(): the plain-text form of
# this wide table is wrapped into several backslash-continued chunks, which is
# hard to read.
display(adata.var.head())

                       gene_maxlen  tx_maxlen  gene_meanlen   tx_meanlen  \
ENSMUSG00000030917.14      18530.0       1484  16311.900000  1296.680672   
ENSMUSG00000044948.18     130646.0       2193  94567.158333   639.566667   
ENSMUSG00000001014.7        1222.0        970    661.589474   596.724138   
ENSMUSG00000035296.15      32560.0       2786  14760.890756  1609.142857   
ENSMUSG00000038916.8       55442.0       3775  54868.791667  3546.858333   

                       Symbol         Biotype Chromosome  sum_exon_length  \
ENSMUSG00000030917.14   Ldaf1  protein_coding       chr7           1490.0   
ENSMUSG00000044948.18  Cfap43  protein_coding      chr19           5800.0   
ENSMUSG00000001014.7    Icam4  protein_coding       chr9            971.0   
ENSMUSG00000035296.15    Sgcg  protein_coding      chr14           3515.0   
ENSMUSG00000038916.8    Mtcl3  protein_coding      chr10           3104.0   

                       sum_intron_length  sum_amb_length  total_gene_length  \
E

In [14]:
# Access the main data matrix (`adata.X`).
# NOTE(review): the recorded output is `None`, so this file appears to store no
# `X`; the count matrices are presumably the layers listed in the summary
# ('chic', 'spliced', 'total', 'unspliced') — confirm before using `adata.X`.
print(adata.X)

None


In [5]:
import pandas as pd

# Lift pandas' column cap so wide metadata tables are rendered with every column.
# This changes the option for the rest of the session.
pd.options.display.max_columns = None

# Rich-display preview: first 20 rows of the per-cell metadata.
display(adata.obs.head(n=20))

# Rich-display preview: first 20 rows of the per-gene metadata.
display(adata.var.head(n=20))

Unnamed: 0,obs_names,sample,plate_batch,barcode_vasa,day,rep,mark,plate_grp,barcode_nla,seqrun,exp_batch,day_rep,batch_day_rep,sizefactors_deconv_spliced,sizefactors_deconv_unspliced,sizefactors_deconv_chic,sizefactors_deconv_total
Gastd5-rep2-H3K27me3-1::AACCTGCT,AACCTGCT,Gastd5-rep2-H3K27me3-1,36,AACCTGCT,5,rep2,H3K27me3,1,TCATTCCT,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.742548,0.610591,1.551579,0.68107
Gastd5-rep2-H3K27me3-1::AATGGTGG,AATGGTGG,Gastd5-rep2-H3K27me3-1,36,AATGGTGG,5,rep2,H3K27me3,1,CACAATTG,OUDxxxx,2022,5_rep2,5_rep2_rep2,2.488922,1.738803,0.525895,2.038549
Gastd5-rep2-H3K27me3-1::AGTACGTG,AGTACGTG,Gastd5-rep2-H3K27me3-1,36,AGTACGTG,5,rep2,H3K27me3,1,TCGAACGG,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.875846,0.937416,0.294958,0.922063
Gastd5-rep2-H3K27me3-1::ATTTAGCG,ATTTAGCG,Gastd5-rep2-H3K27me3-1,36,ATTTAGCG,5,rep2,H3K27me3,1,ACACTGAT,OUDxxxx,2022,5_rep2,5_rep2_rep2,1.443247,0.786969,3.012706,1.15762
Gastd5-rep2-H3K27me3-1::AGCAGAAC,AGCAGAAC,Gastd5-rep2-H3K27me3-1,36,AGCAGAAC,5,rep2,H3K27me3,1,GTAAGTGT,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.326336,0.906376,0.149551,0.664496
Gastd5-rep2-H3K27me3-1::AAACAGGC,AAACAGGC,Gastd5-rep2-H3K27me3-1,36,AAACAGGC,5,rep2,H3K27me3,1,GTCTGATT,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.205714,0.833062,0.897365,0.517123
Gastd5-rep2-H3K27me3-1::ACGTATCC,ACGTATCC,Gastd5-rep2-H3K27me3-1,36,ACGTATCC,5,rep2,H3K27me3,1,ATCATCGC,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.504493,0.480826,0.861429,0.510936
Gastd5-rep2-H3K27me3-1::AATCATGC,AATCATGC,Gastd5-rep2-H3K27me3-1,36,AATCATGC,5,rep2,H3K27me3,1,CGAGTAGC,OUDxxxx,2022,5_rep2,5_rep2_rep2,1.560123,1.296598,0.192085,1.473266
Gastd5-rep2-H3K27me3-1::AACTCTGG,AACTCTGG,Gastd5-rep2-H3K27me3-1,36,AACTCTGG,5,rep2,H3K27me3,1,CGCGGTGG,OUDxxxx,2022,5_rep2,5_rep2_rep2,1.030577,0.875572,1.104958,0.944002
Gastd5-rep2-H3K27me3-1::AAAGCGGA,AAAGCGGA,Gastd5-rep2-H3K27me3-1,36,AAAGCGGA,5,rep2,H3K27me3,1,TGGCCTAT,OUDxxxx,2022,5_rep2,5_rep2_rep2,0.180242,0.689572,1.253414,0.442973


Unnamed: 0,gene_maxlen,tx_maxlen,gene_meanlen,tx_meanlen,Symbol,Biotype,Chromosome,sum_exon_length,sum_intron_length,sum_amb_length,total_gene_length,eff_gene_length,eff_tx_length
ENSMUSG00000030917.14,18530.0,1484,16311.9,1296.680672,Ldaf1,protein_coding,chr7,1490.0,16678.0,472.0,18640.0,18530.0,1484.0
ENSMUSG00000044948.18,130646.0,2193,94567.158333,639.566667,Cfap43,protein_coding,chr19,5800.0,150850.0,1095.0,157745.0,130646.0,2193.0
ENSMUSG00000001014.7,1222.0,970,661.589474,596.724138,Icam4,protein_coding,chr9,971.0,113.0,139.0,1223.0,1222.0,970.0
ENSMUSG00000035296.15,32560.0,2786,14760.890756,1609.142857,Sgcg,protein_coding,chr14,3515.0,35554.0,307.0,39376.0,32560.0,2786.0
ENSMUSG00000038916.8,55442.0,3775,54868.791667,3546.858333,Mtcl3,protein_coding,chr10,3104.0,51617.0,744.0,55465.0,55442.0,3104.0
ENSMUSG00000034216.13,9712.0,4054,8983.608333,3963.8,Vps18,protein_coding,chr2,4056.0,4928.0,730.0,9714.0,9712.0,4054.0
ENSMUSG00000049303.11,29658.0,3910,19283.933333,1885.484536,Syt12,protein_coding,chr19,3503.0,26001.0,2036.0,31540.0,29658.0,3503.0
ENSMUSG00000040648.16,64364.0,9145,63688.833333,6392.633333,Ppip5k2,protein_coding,chr1,5882.0,52259.0,6223.0,64364.0,64364.0,5882.0
ENSMUSG00000095576.6,17045.0,431,7595.318681,303.375,Fmo6,protein_coding,chr1,1886.0,19008.0,71.0,20965.0,17045.0,431.0
ENSMUSG00000037012.19,102031.0,6470,76977.925,4059.633333,Hk1,protein_coding,chr10,3673.0,102522.0,4859.0,111054.0,102031.0,3673.0


In [6]:
# Distinct values found in the 'mark' column of the per-cell metadata.
unique_marks = adata.obs["mark"].unique()
print(unique_marks)

['H3K27me3', 'H3K4me3']
Categories (2, object): ['H3K4me3', 'H3K27me3']
