In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import pyensembl
from tqdm import tqdm


In [None]:

# Ensembl annotation release 109; `data` is the lookup handle used by later
# cells to resolve gene IDs to genomic coordinates.
data = pyensembl.EnsemblRelease(release=109)
# Make sure the release's files are available locally, then build/load the
# local index that the gene lookups query.
data.download()
data.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/steveyin/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [3]:
# Load the AnnData object stored in 'Norman_2019.h5ad'. The path is relative,
# so the file must sit in the notebook's working directory.
adata = sc.read_h5ad('Norman_2019.h5ad')

In [11]:
# Sanity check: number of entries in the var *column* named 'index'
# (a regular column of adata.var, not the frame's own index).
adata.var['index'].shape

(19018,)

In [None]:
# Identifiers to look up, one per gene in the dataset. Despite the variable
# name these are passed to `gene_by_id`, i.e. they are treated as gene IDs.
gene_names = adata.var['index']

# Build one record per gene, in the same order as `gene_names`, so the
# resulting table stays row-aligned with adata.var.
gene_data = []
for gene_name in tqdm(gene_names):
    try:
        gene = data.gene_by_id(gene_name)
        coords = {'contig': gene.contig, 'start': gene.start, 'end': gene.end}
    except ValueError:
        # Lookup failed: the identifier is not present in the database.
        # Keep a placeholder row (all coordinates None) to preserve alignment.
        coords = dict.fromkeys(('contig', 'start', 'end'))
    gene_data.append({'gene_name': gene_name, **coords})

# Columns: gene_name, contig, start, end
gene_info_df = pd.DataFrame(gene_data)

100%|██████████| 19018/19018 [00:00<00:00, 1633738.32it/s]

            gene_name contig        start          end
8000  ENSG00000101843      X  108084207.0  108091549.0
8001  ENSG00000101844      X  108091668.0  108154671.0
8002  ENSG00000197565      X  108155607.0  108439497.0
8003  ENSG00000188153      X  108439838.0  108697545.0
8004  ENSG00000101888      X  109535781.0  109544698.0
8005  ENSG00000176076      X  109623700.0  109625172.0
8006  ENSG00000068366      X  109624244.0  109733403.0
8007  ENSG00000157600      X  110002631.0  110182734.0
8008  ENSG00000101935      X  110194186.0  110440318.0
8009  ENSG00000077264      X  110944285.0  111227361.0





In [24]:
# Spot-check ten consecutive rows of the coordinate table, starting at
# row position `ind`.
ind = 8000
print(gene_info_df.iloc[ind:ind + 10])


            gene_name contig        start          end
8000  ENSG00000101843      X  108084207.0  108091549.0
8001  ENSG00000101844      X  108091668.0  108154671.0
8002  ENSG00000197565      X  108155607.0  108439497.0
8003  ENSG00000188153      X  108439838.0  108697545.0
8004  ENSG00000101888      X  109535781.0  109544698.0
8005  ENSG00000176076      X  109623700.0  109625172.0
8006  ENSG00000068366      X  109624244.0  109733403.0
8007  ENSG00000157600      X  110002631.0  110182734.0
8008  ENSG00000101935      X  110194186.0  110440318.0
8009  ENSG00000077264      X  110944285.0  111227361.0


In [28]:
# Chromosome lengths in base pairs, keyed by contig name
# (values correspond to the GRCh38 primary assembly).
chromosome_lengths = {
    # Autosomes
    '1': 248_956_422,
    '2': 242_193_529,
    '3': 198_295_559,
    '4': 190_214_555,
    '5': 181_538_259,
    '6': 170_805_979,
    '7': 159_345_973,
    '8': 145_138_636,
    '9': 138_394_717,
    '10': 133_797_422,
    '11': 135_086_622,
    '12': 133_275_309,
    '13': 114_364_328,
    '14': 107_043_718,
    '15': 101_991_189,
    '16': 90_338_345,
    '17': 83_257_441,
    '18': 80_373_285,
    '19': 58_617_616,
    '20': 64_444_167,
    '21': 46_709_983,
    '22': 50_818_468,
    # Sex chromosomes
    'X': 156_040_895,
    'Y': 57_227_415,
}

# Canonical contig order: '1'..'22', then 'X', 'Y'.
chr_names = [*map(str, range(1, 23)), 'X', 'Y']
# Contig name -> slot index (0-based) in the positional-encoding vector.
chromosome_map = dict(zip(chr_names, range(len(chr_names))))

In [None]:
def create_positional_encoding(row, chrom_index=None, chrom_lengths=None):
    """Encode a gene's genomic location as a per-chromosome vector.

    The vector has one slot per chromosome. The slot belonging to the gene's
    chromosome holds ``start / chromosome_length``; every other slot is 0.
    A gene gets an all-zero vector when its contig is missing, its contig is
    not a key of ``chrom_index``, or its start position is missing.

    Parameters
    ----------
    row : mapping-like
        A row of ``gene_info_df`` (or any object supporting ``row['contig']``
        and ``row['start']``).
    chrom_index : dict, optional
        Contig name -> slot index; indices are expected to be ``0..len-1``.
        Defaults to the notebook-level ``chromosome_map``.
    chrom_lengths : dict, optional
        Contig name -> chromosome length. Defaults to the notebook-level
        ``chromosome_lengths``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(chrom_index)``.
    """
    # Fall back to the notebook globals so existing calls such as
    # `df.apply(create_positional_encoding, axis=1)` keep working unchanged.
    if chrom_index is None:
        chrom_index = chromosome_map
    if chrom_lengths is None:
        chrom_lengths = chromosome_lengths

    # Size the vector from the mapping rather than a hard-coded 24, so the
    # encoding stays consistent if the chromosome list is ever changed.
    encoding = np.zeros(len(chrom_index))

    contig = row['contig']
    start = row['start']

    # Require a usable start as well as a known contig: without the start
    # check, a missing position would write NaN into the encoding.
    if pd.notna(contig) and pd.notna(start) and contig in chrom_index:
        # Normalised position (the magnitude) within the chromosome.
        encoding[chrom_index[contig]] = start / chrom_lengths[contig]

    return encoding

# Encode each gene row, then stack the per-gene vectors into one 2-D matrix.
per_gene_vectors = gene_info_df.apply(create_positional_encoding, axis=1)
positional_encodings = np.array(per_gene_vectors.tolist())

print("Shape of positional encoding matrix:", positional_encodings.shape)

# Store the matrix on the AnnData object; rows follow the order of
# gene_info_df, which was built by iterating adata.var['index'].
adata.varm['positional_encoding'] = positional_encodings

Shape of positional encoding matrix: (19018, 24)


['guide_identity',
 'read_count',
 'UMI_count',
 'coverage',
 'gemgroup',
 'good_coverage',
 'number_of_cells',
 'guide_AHR',
 'guide_ARID1A',
 'guide_ARRDC3',
 'guide_ATL1',
 'guide_BAK1',
 'guide_BCL2L11',
 'guide_BCORL1',
 'guide_BPGM',
 'guide_C19orf26',
 'guide_C3orf72',
 'guide_CBFA2T3',
 'guide_CBL',
 'guide_CDKN1A',
 'guide_CDKN1B',
 'guide_CDKN1C',
 'guide_CEBPA',
 'guide_CEBPB',
 'guide_CEBPE',
 'guide_CELF2',
 'guide_CITED1',
 'guide_CKS1B',
 'guide_CLDN6',
 'guide_CNN1',
 'guide_CNNM4',
 'guide_COL1A1',
 'guide_COL2A1',
 'guide_CSRNP1',
 'guide_DLX2',
 'guide_DUSP9',
 'guide_EGR1',
 'guide_ELMSAN1',
 'guide_ETS2',
 'guide_FEV',
 'guide_FOSB',
 'guide_FOXA1',
 'guide_FOXA3',
 'guide_FOXF1',
 'guide_FOXL2',
 'guide_FOXO4',
 'guide_GLB1L2',
 'guide_HES7',
 'guide_HK2',
 'guide_HNF4A',
 'guide_HOXA13',
 'guide_HOXB9',
 'guide_HOXC13',
 'guide_IER5L',
 'guide_IGDCC3',
 'guide_IKZF3',
 'guide_IRF1',
 'guide_ISL2',
 'guide_JUN',
 'guide_KIAA1804',
 'guide_KIF18B',
 'guide_KIF2C',
