This is a test notebook to check running remotely

### Global Imports

In [1]:
import torch
print("GPU available: ", torch.cuda.is_available())
import anndata as ad
import pandas as pd

GPU available:  True


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


### Setup Global settings

In [2]:
import os

if os.name == 'nt':  # Windows
    DATA_FOLDER = r"..\Data"
else:  # Linux/Unix/Mac
    DATA_FOLDER = r"/home/bar/Data"

data_file_path = os.path.join(DATA_FOLDER, "allsamples_promoterlevel_H3K4me3_counts.h5ad")

### Reading the data

In [4]:
print(f"Loading data from: {data_file_path}")
adata = ad.read_h5ad(data_file_path)
print("Data loaded successfully.")

Loading data from: ..\Data\allsamples_promoterlevel_H3K4me3_counts.h5ad
Data loaded successfully.


In [5]:
# exploring the data
# Basic info
print(adata)

AnnData object with n_obs × n_vars = 23040 × 43579
    obs: 'index', 'barcodes', 'sample', 'plate_batch', 'barcode_nla', 'barcode_vasa', 'index_nla'
    var: 'chr', 'start', 'end', 'score', 'strand', 'Symbol', 'ID', 'geneID', 'gene_strand', 'NOGS_olap'


In [6]:
# View the shape (cells x genes)
print(adata.shape)

(23040, 43579)


In [7]:
# See variable names (genes)
print(adata.var_names)

Index(['P_43894:Gm47631:1', 'P_20460:Tmtc3:1', 'P_4314:Cep290:1',
       'P_4315:Cep290:2', 'P_4316:Cep290:3', 'P_4317:Cep290:4',
       'P_4318:Cep290:5', 'P_4319:Cep290:6', 'P_43693:Gm47956:1',
       'P_26188:Rlig1:1',
       ...
       'P_29569:Kdm5d:1', 'P_40305:Gm21996:1', 'P_40909:Gm28930:1',
       'P_40659:Gm29363:1', 'P_38753:Gm20837:1', 'P_39759:Gm28301:1',
       'P_37746:Gm21292:1', 'P_29570:Kdm5d:2', 'P_38404:Gm21721:1',
       'P_38372:Gm20821:1'],
      dtype='object', name='name', length=43579)


In [8]:
# See observation names (cells)
print(adata.obs_names)

Index(['Gastd3-rep3-H3K4me3-1::CGTCTAAT', 'Gastd3-rep3-H3K4me3-1::AGACTCGT',
       'Gastd3-rep3-H3K4me3-1::GCACGTCA', 'Gastd3-rep3-H3K4me3-1::TCAACGAC',
       'Gastd3-rep3-H3K4me3-1::ATTTAGCG', 'Gastd3-rep3-H3K4me3-1::ATACAGAC',
       'Gastd3-rep3-H3K4me3-1::TGCGTAGG', 'Gastd3-rep3-H3K4me3-1::TGGAGCTC',
       'Gastd3-rep3-H3K4me3-1::TGAATACC', 'Gastd3-rep3-H3K4me3-1::TCTCACAC',
       ...
       'mESCs-rep4-H3K4me3-6::AACTCTGG', 'mESCs-rep4-H3K4me3-6::TTAGGCAG',
       'mESCs-rep4-H3K4me3-6::CGTCGTTT', 'mESCs-rep4-H3K4me3-6::CCTGCTAT',
       'mESCs-rep4-H3K4me3-6::CAGAGAAG', 'mESCs-rep4-H3K4me3-6::GATTGTCC',
       'mESCs-rep4-H3K4me3-6::CACCTCTA', 'mESCs-rep4-H3K4me3-6::AGTTGGAC',
       'mESCs-rep4-H3K4me3-6::AGTGCAGA', 'mESCs-rep4-H3K4me3-6::CTCTGTCT'],
      dtype='object', name='index_vasa', length=23040)


In [9]:
# View metadata for cells
print(adata.obs.head())

                                                           index  barcodes  \
index_vasa                                                                   
Gastd3-rep3-H3K4me3-1::CGTCTAAT  Gastd3-rep3-H3K4me3-1::ACACACTA  ACACACTA   
Gastd3-rep3-H3K4me3-1::AGACTCGT  Gastd3-rep3-H3K4me3-1::ACACATAG  ACACATAG   
Gastd3-rep3-H3K4me3-1::GCACGTCA  Gastd3-rep3-H3K4me3-1::ACACGAGA  ACACGAGA   
Gastd3-rep3-H3K4me3-1::TCAACGAC  Gastd3-rep3-H3K4me3-1::ACACTATC  ACACTATC   
Gastd3-rep3-H3K4me3-1::ATTTAGCG  Gastd3-rep3-H3K4me3-1::ACACTGAT  ACACTGAT   

                                                sample plate_batch  \
index_vasa                                                           
Gastd3-rep3-H3K4me3-1::CGTCTAAT  Gastd3-rep3-H3K4me3-1           0   
Gastd3-rep3-H3K4me3-1::AGACTCGT  Gastd3-rep3-H3K4me3-1           0   
Gastd3-rep3-H3K4me3-1::GCACGTCA  Gastd3-rep3-H3K4me3-1           0   
Gastd3-rep3-H3K4me3-1::TCAACGAC  Gastd3-rep3-H3K4me3-1           0   
Gastd3-rep3-H3K4me3-1::ATTTAGCG  

In [10]:
# View metadata for genes
print(adata.var.head())

                     chr      start        end score strand   Symbol       ID  \
name                                                                            
P_43894:Gm47631:1  chr10  100216693  100221693     0      .  Gm47631  P_43894   
P_20460:Tmtc3:1    chr10  100320170  100325170     0      .    Tmtc3  P_20460   
P_4314:Cep290:1    chr10  100321784  100326784     0      .   Cep290   P_4314   
P_4315:Cep290:2    chr10  100332192  100337192     0      .   Cep290   P_4315   
P_4316:Cep290:3    chr10  100342587  100347587     0      .   Cep290   P_4316   

                                  geneID gene_strand  \
name                                                   
P_43894:Gm47631:1   ENSMUSG00000112485.2           -   
P_20460:Tmtc3:1    ENSMUSG00000036676.15           -   
P_4314:Cep290:1    ENSMUSG00000019971.11           +   
P_4315:Cep290:2    ENSMUSG00000019971.11           +   
P_4316:Cep290:3    ENSMUSG00000019971.11           +   

                                       

In [11]:
# Access the main data matrix
print(adata.X)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 22810754 stored elements and shape (23040, 43579)>
  Coords	Values
  (0, 10474)	1.0
  (0, 17943)	1.0
  (0, 36356)	1.0
  (0, 36355)	1.0
  (0, 2526)	1.0
  (0, 39004)	3.0
  (0, 20691)	1.0
  (0, 24542)	1.0
  (0, 24056)	16.0
  (0, 24085)	1.0
  (0, 12643)	1.0
  (0, 12642)	1.0
  (0, 4311)	1.0
  (0, 4447)	1.0
  (0, 23909)	2.0
  (0, 14401)	1.0
  (0, 20656)	1.0
  (0, 1093)	1.0
  (0, 37590)	1.0
  (0, 18793)	1.0
  (0, 12166)	1.0
  (0, 10344)	1.0
  (0, 3956)	1.0
  (0, 38949)	1.0
  (0, 11603)	1.0
  :	:
  (23039, 7573)	1.0
  (23039, 254)	1.0
  (23039, 253)	1.0
  (23039, 13806)	1.0
  (23039, 1013)	1.0
  (23039, 14977)	1.0
  (23039, 14464)	1.0
  (23039, 40857)	1.0
  (23039, 18320)	1.0
  (23039, 391)	1.0
  (23039, 32738)	2.0
  (23039, 32737)	2.0
  (23039, 20834)	1.0
  (23039, 18917)	1.0
  (23039, 23585)	1.0
  (23039, 31609)	1.0
  (23039, 30397)	1.0
  (23039, 11124)	2.0
  (23039, 21869)	1.0
  (23039, 4251)	1.0
  (23039, 10531)	2.0
  (23039, 28

In [12]:
import pandas as pd

# Display first 20 rows of cell metadata as a scrollable table
pd.set_option('display.max_columns', None)
display(adata.obs.head(20))

# Display first 20 rows of gene metadata
display(adata.var.head(20))

Unnamed: 0_level_0,index,barcodes,sample,plate_batch,barcode_nla,barcode_vasa,index_nla
index_vasa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Gastd3-rep3-H3K4me3-1::CGTCTAAT,Gastd3-rep3-H3K4me3-1::ACACACTA,ACACACTA,Gastd3-rep3-H3K4me3-1,0,ACACACTA,CGTCTAAT,Gastd3-rep3-H3K4me3-1::ACACACTA
Gastd3-rep3-H3K4me3-1::AGACTCGT,Gastd3-rep3-H3K4me3-1::ACACATAG,ACACATAG,Gastd3-rep3-H3K4me3-1,0,ACACATAG,AGACTCGT,Gastd3-rep3-H3K4me3-1::ACACATAG
Gastd3-rep3-H3K4me3-1::GCACGTCA,Gastd3-rep3-H3K4me3-1::ACACGAGA,ACACGAGA,Gastd3-rep3-H3K4me3-1,0,ACACGAGA,GCACGTCA,Gastd3-rep3-H3K4me3-1::ACACGAGA
Gastd3-rep3-H3K4me3-1::TCAACGAC,Gastd3-rep3-H3K4me3-1::ACACTATC,ACACTATC,Gastd3-rep3-H3K4me3-1,0,ACACTATC,TCAACGAC,Gastd3-rep3-H3K4me3-1::ACACTATC
Gastd3-rep3-H3K4me3-1::ATTTAGCG,Gastd3-rep3-H3K4me3-1::ACACTGAT,ACACTGAT,Gastd3-rep3-H3K4me3-1,0,ACACTGAT,ATTTAGCG,Gastd3-rep3-H3K4me3-1::ACACTGAT
Gastd3-rep3-H3K4me3-1::ATACAGAC,Gastd3-rep3-H3K4me3-1::ACAGTACG,ACAGTACG,Gastd3-rep3-H3K4me3-1,0,ACAGTACG,ATACAGAC,Gastd3-rep3-H3K4me3-1::ACAGTACG
Gastd3-rep3-H3K4me3-1::TGCGTAGG,Gastd3-rep3-H3K4me3-1::ACATACGT,ACATACGT,Gastd3-rep3-H3K4me3-1,0,ACATACGT,TGCGTAGG,Gastd3-rep3-H3K4me3-1::ACATACGT
Gastd3-rep3-H3K4me3-1::TGGAGCTC,Gastd3-rep3-H3K4me3-1::ACATCACA,ACATCACA,Gastd3-rep3-H3K4me3-1,0,ACATCACA,TGGAGCTC,Gastd3-rep3-H3K4me3-1::ACATCACA
Gastd3-rep3-H3K4me3-1::TGAATACC,Gastd3-rep3-H3K4me3-1::ACATCTAT,ACATCTAT,Gastd3-rep3-H3K4me3-1,0,ACATCTAT,TGAATACC,Gastd3-rep3-H3K4me3-1::ACATCTAT
Gastd3-rep3-H3K4me3-1::TCTCACAC,Gastd3-rep3-H3K4me3-1::ACGACGCG,ACGACGCG,Gastd3-rep3-H3K4me3-1,0,ACGACGCG,TCTCACAC,Gastd3-rep3-H3K4me3-1::ACGACGCG


Unnamed: 0_level_0,chr,start,end,score,strand,Symbol,ID,geneID,gene_strand,NOGS_olap
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P_43894:Gm47631:1,chr10,100216693,100221693,0,.,Gm47631,P_43894,ENSMUSG00000112485.2,-,I:133955;E:174843
P_20460:Tmtc3:1,chr10,100320170,100325170,0,.,Tmtc3,P_20460,ENSMUSG00000036676.15,-,I:133968;E:174859;E:174860;E:174861;E:174862;E...
P_4314:Cep290:1,chr10,100321784,100326784,0,.,Cep290,P_4314,ENSMUSG00000019971.11,+,I:128362;I:128363;I:128364;I:128365;I:128366;E...
P_4315:Cep290:2,chr10,100332192,100337192,0,.,Cep290,P_4315,ENSMUSG00000019971.11,+,I:128371;I:128372;I:128373;I:128374;E:167603;E...
P_4316:Cep290:3,chr10,100342587,100347587,0,.,Cep290,P_4316,ENSMUSG00000019971.11,+,I:128375;I:128376;E:167608;E:167609;E:167610;E...
P_4317:Cep290:4,chr10,100376300,100381300,0,.,Cep290,P_4317,ENSMUSG00000019971.11,+,I:128392;I:128393;I:128394;I:128395;I:128396;I...
P_4318:Cep290:5,chr10,100382725,100387725,0,.,Cep290,P_4318,ENSMUSG00000019971.11,+,I:128399;I:128400;I:128401;E:167637;E:167638;E...
P_4319:Cep290:6,chr10,100405795,100410795,0,.,Cep290,P_4319,ENSMUSG00000019971.11,+,I:128412;I:128413;E:167651;E:167652;E:167653;E...
P_43693:Gm47956:1,chr10,100411287,100416287,0,.,Gm47956,P_43693,ENSMUSG00000112000.2,+,I:128414;E:167655
P_26188:Rlig1:1,chr10,100421395,100426395,0,.,Rlig1,P_26188,ENSMUSG00000046567.11,-,I:133974;I:133975;E:174871;E:174872;A:69864;A:...


In [24]:
# Show all unique values in the 'geneID' column of cell metadata

print("Columns in cell metadata:")
print(adata.var.columns)

if 'geneID' in adata.var.columns:
    unique_genes = adata.var['geneID'].unique()
    print(f"Unique genes in 'geneID' column: {len(unique_genes)}")
    # print(unique_genes)
else:
    print("'geneID' column not found in cell metadata.")

print(adata.shape)
if len(unique_genes) == adata.shape[1]:
    print("All genes are unique.")

Columns in cell metadata:
Index(['chr', 'start', 'end', 'score', 'strand', 'Symbol', 'ID', 'geneID',
       'gene_strand', 'NOGS_olap'],
      dtype='object')
Unique genes in 'geneID' column: 26934
(23040, 43579)
