### Database creation of Telomere-to-telomere consortium CHM13 Annotations

##### Before you begin, please make sure you have the proper files downloaded

In [73]:
#Download link for CHM13 Annotations: https://github.com/marbl/CHM13?tab=readme-ov-file
##Scroll Down to find the UCSC GENCODEv35 CAT/Liftoff v2 Annotation file, which will be used for this database

In [74]:
import pandas as pd
import numpy as np
import gffutils
import pysam

##### If this is your first time importing the gff3 database, run this command

In [75]:
#This can take a long time, so only run it once (uncomment the two lines below that start with "#db =" and "#merge_strategy")

#db = gffutils.create_db('chm13.draft_v2.0.gene_annotation.gff3', dbfn='chm13db.db', force=True, keep_order=True, 
#merge_strategy='create_unique', sort_attribute_values=True) 

#Replace the .gff3 path with your own gff3 file; dbfn can be whichever name you want for the database file that gets created


##### Load the selected database

In [76]:
# Open the annotation database file 'chm13db.db' (the dbfn used by the
# commented-out create_db call earlier in this notebook).
db = gffutils.FeatureDB(
    'chm13db.db',
    keep_order=True,
)


##### Select the desired range

In [77]:
# Region of interest used by the queries below: sequence name plus start/stop positions.
# NOTE: `chr` shadows Python's built-in chr(); the name is kept because later
# cells in this notebook refer to it.
chr, start, stop = 'chr1', 206586163, 206586193

In [78]:
# Query the database for features in the selected window.
# completely_within=False: per the gffutils docs, features that only partially
# overlap the window are returned as well.
# NOTE(review): region() is documented as yielding features lazily, so `slct`
# is presumably exhausted after one pass -- re-run this cell before iterating
# over it again.
slct = db.region(
    region=(chr, start, stop),
    completely_within=False,
)

##### Iterate through the region and convert each feature to a dictionary

In [79]:
# Build one plain dict per feature returned by the region query. The
# "attributes" entry is kept as a single dict-like value here and is expanded
# into separate columns in a later cell.
records = [
    {
        "seqid": feature.seqid,
        "source": feature.source,
        "featuretype": feature.featuretype,
        "start": feature.start,
        "end": feature.end,
        "strand": feature.strand,
        "score": feature.score,
        "attributes": feature.attributes,  # expanded into columns later
    }
    for feature in slct
]

##### Next, let's integrate the hg38 regulatory elements into this database:

In [80]:
# --- hg38 regulatory-element (RE) input ---
input_file = 'hg38.archetype_motifs.v1.0.bed.gz'  # Replace with the RE file
source_name = 'atlas'  # replace with reference type

# hg38 window corresponding to the CHM13 selection
# (determined from Liftover coordinate mapping).
hg38_start, hg38_stop = 207320867, 207320897

# Offset added to an hg38 position to express it in CHM13 coordinates.
# NOTE(review): a single constant offset presumably only holds while the window
# lies inside one contiguous liftover block -- confirm before changing regions.
hg38_to_chm13 = start - hg38_start

# Opening the file may report that the index file is older than the data file;
# this can be ignored for now.
tabix = pysam.TabixFile(input_file)

In [81]:
# Append one record per regulatory element in the hg38 window to `records`,
# shifting start/end by hg38_to_chm13.
#
# Columns are read by fixed position rather than through chained pop() calls,
# whose indices shifted after every removal and only worked because dict
# entries are evaluated left to right.
# Positions consumed here: 0=seqid, 1=start, 2=end, 3=cluster, 4=match score,
# 5=strand, 6=ID. Any further columns are ignored.
#
# NOTE: this cell appends to the existing `records` list, so running it twice
# adds every row twice; rebuild `records` first when re-running.
for row in tabix.fetch(chr, hg38_start, hg38_stop):
    fields = row.split()
    records.append({
        "seqid": fields[0],
        "source": source_name,
        "start": int(fields[1]) + hg38_to_chm13,
        "end": int(fields[2]) + hg38_to_chm13,
        "featuretype": 'RE',
        "strand": fields[5],
        # Attributes as a dictionary (will be expanded later)
        "attributes": {
            "ID": fields[6],
            "matchscore": float(fields[4]),
            "cluster": fields[3],
        },
    })

##### Expand out the attributes and format the records as a DataFrame, dropping unnecessary columns

In [82]:
# One row per collected record; the "attributes" column still holds a
# dict-like value per row at this point.
df = pd.DataFrame(records)

# Turn each row's attributes into its own set of columns (one column per key).
attributes_df = df["attributes"].apply(pd.Series)

# Swap the packed "attributes" column for the expanded columns, side by side.
df_without_attributes = df.drop(columns=["attributes"])
df = pd.concat([df_without_attributes, attributes_df], axis=1)


In [83]:
def _unwrap_singleton(value):
    """Return the only element of a one-item list; return anything else unchanged."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


# Replace one-element lists with their bare value, column by column.
for column_name in df.columns:
    df[column_name] = df[column_name].apply(_unwrap_singleton)

# Discard columns in which every value is missing, then display the result.
df = df.dropna(axis=1, how='all')
df

Unnamed: 0,seqid,source,start,end,featuretype,strand,ID,matchscore,cluster
0,chr1,atlas,206586147,206586164,RE,-,SALL4_HUMAN.H11MO.0.B,9.9542,KLF/SP/2
1,chr1,atlas,206586148,206586165,RE,-,EGR2_HUMAN.H11MO.0.A,11.5015,KLF/SP/2
2,chr1,atlas,206586149,206586169,RE,-,KLF15_HUMAN.H11MO.0.A,11.5516,GC-tract
3,chr1,atlas,206586150,206586168,RE,+,TBX3_MOUSE.H11MO.0.B,7.7390,TBX/3
4,chr1,atlas,206586151,206586168,RE,-,KLF1_MOUSE.H11MO.0.A,10.1096,KLF/SP/2
...,...,...,...,...,...,...,...,...,...
62,chr1,atlas,206586189,206586206,RE,-,EGR2_HUMAN.H11MO.0.A,7.8496,KLF/SP/2
63,chr1,atlas,206586190,206586203,RE,+,ZSCAN4_C2H2_1,3.8977,ZSCAN4
64,chr1,atlas,206586191,206586202,RE,+,KLF9_MA1107.1,8.4085,KLF/SP/1
65,chr1,atlas,206586191,206586208,RE,-,EGR2_HUMAN.H11MO.0.A,7.8496,KLF/SP/2
