#### Annotation Selection of the hg38 genome

In [155]:
import pandas as pd
import numpy as np
import gffutils
import pysam

##### If this is your first time importing the gff3 database, run this command

In [120]:
#This can take a long time, so only run this once (remove the #)

#db = gffutils.create_db('gencode.v38.annotation.gff3', dbfn='test2.db', force=True, keep_order=True, 
#merge_strategy='keep_unique', sort_attribute_values=True) 

#Replace 'gencode.v38.annotation.gff3' with your own GFF3 file; dbfn is the name of the database file that gets created (any name works, but the loading cell below opens 'test2.db')


##### Load the feature database created above

In [156]:
# Open the existing gffutils database file 'test2.db' (the dbfn used in the creation command above).
db = gffutils.FeatureDB('test2.db', keep_order=True)

##### Select the desired range

In [157]:
# Genomic window to query; edit these three values to inspect a different locus.
# NOTE(review): `chr` shadows Python's built-in chr(). The regulatory-element cell
# further down reads chr/start/stop, so any rename has to be applied there as well.
chr = 'chr1' #Replace with the wanted chromosome
start = 207317678 #Replace with the start coordinate
stop = 207325678 #Replace with the end coordinate
# Query the annotation database for features in the window.
# NOTE(review): completely_within=False presumably also keeps features that only
# partially overlap the window, and the result looks like a one-shot iterator
# (re-run this cell before re-running the next one) — confirm in the gffutils docs.
slct = db.region(region=(chr, start, stop), completely_within=False)


##### Iterate through the region and convert each feature into a dictionary record

In [158]:
# One plain dict per feature returned by the region query; these become the rows of the table.
records = [
    {
        "seqid": feature.seqid,
        "source": feature.source,
        "featuretype": feature.featuretype,
        "start": feature.start,
        "end": feature.end,
        "strand": feature.strand,
        "score": feature.score,
        # Stored untouched for now; a later cell expands these into separate columns.
        "attributes": feature.attributes,
    }
    for feature in slct
]

##### Next, let's integrate the cell-specific regulatory elements (REs) into the same list of records:

In [159]:
input_file = 'hg38.all_motifs.v1.0.bed.gz' #Replace with the RE file
# Open the RE file with pysam so it can be queried by region in the next cell.
tabix = pysam.TabixFile(input_file)
cell_type = 'K562' #Replace with the reference cell type; it is written into the "source" field of every RE record
#htslib may warn that the index file (.tbi) is older than the data file; this can be ignored for now

[W::hts_idx_load3] The index file is older than the data file: hg38.all_motifs.v1.0.bed.gz.tbi


In [None]:
#Append one record per regulatory-element row found in the selected window
for row in tabix.fetch(chr,start,stop):
    fields = row.split()
    # Columns as consumed here: 0 seqid, 1 start, 2 end, 3 ID, 4 match score, 5 strand, 6 sequence
    records.append({
        "seqid": fields[0],
        "source": cell_type,
        "start": int(fields[1]),
        "end": int(fields[2]),
        "featuretype": 'RE',
        "strand": fields[5],
        # Attributes as a dictionary (expanded into columns later)
        "attributes": {
            "ID": fields[3],
            "matchscore": float(fields[4]),
            "seq": fields[6],
        },
    })

##### Expand out the attributes and convert the list of records into a pandas DataFrame

In [161]:
# Build the table from the collected records.
df = pd.DataFrame(records)
# Detach the "attributes" column and fan each row's mapping out into its own columns.
attributes_df = df.pop("attributes").apply(pd.Series)
# Glue the expanded attribute columns onto the right-hand side of the core columns.
df = pd.concat([df, attributes_df], axis=1)

display(df)

Unnamed: 0,seqid,source,featuretype,start,end,strand,score,ID,Parent,gene_id,...,level,protein_id,transcript_support_level,hgnc_id,tag,ccdsid,havana_gene,havana_transcript,matchscore,seq
0,chr1,HAVANA,five_prime_UTR,207321532,207321765,+,.,[UTR5:ENST00000367063.6],[ENST00000367063.6],[ENSG00000196352.16],...,[2],[ENSP00000356030.2],[1],[HGNC:2665],"[basic, CCDS]",[CCDS73022.1],[OTTHUMG00000036255.7],[OTTHUMT00000088210.2],,
1,chr1,HAVANA,exon,207321532,207321865,+,.,[exon:ENST00000367063.6:1],[ENST00000367063.6],[ENSG00000196352.16],...,[2],[ENSP00000356030.2],[1],[HGNC:2665],"[basic, CCDS]",[CCDS73022.1],[OTTHUMG00000036255.7],[OTTHUMT00000088210.2],,
2,chr1,HAVANA,transcript,207321532,207340766,+,.,[ENST00000367063.6],[ENSG00000196352.16],[ENSG00000196352.16],...,[2],[ENSP00000356030.2],[1],[HGNC:2665],"[basic, CCDS]",[CCDS73022.1],[OTTHUMG00000036255.7],[OTTHUMT00000088210.2],,
3,chr1,HAVANA,gene,207321532,207386804,+,.,[ENSG00000196352.16],,[ENSG00000196352.16],...,[1],,,[HGNC:2665],,,[OTTHUMG00000036255.7],,,
4,chr1,HAVANA,five_prime_UTR,207321643,207321765,+,.,[UTR5:ENST00000391921.9],[ENST00000391921.9],[ENSG00000196352.16],...,[1],[ENSP00000375788.4],[1],[HGNC:2665],"[basic, exp_conf]",,[OTTHUMG00000036255.7],[OTTHUMT00000088209.3],,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7242,chr1,K562,RE,207325670,207325680,+,,SNAI2_HUMAN.H11MO.0.A,,,...,,,,,,,,,5.703112,ACCAGGTGGC
7243,chr1,K562,RE,207325671,207325679,+,,SNAI1_HUMAN.H11MO.0.C,,,...,,,,,,,,,10.495118,CCAGGTGG
7244,chr1,K562,RE,207325671,207325679,+,,SNAI1_MOUSE.H11MO.0.C,,,...,,,,,,,,,10.495118,CCAGGTGG
7245,chr1,K562,RE,207325673,207325682,-,,Hic1_MA0739.1,,,...,,,,,,,,,8.212763,AGGTGGCAT


##### Unwrap single-element list attributes into plain values

In [162]:
def _unwrap_singleton(value):
    """Return the only element of a one-item list; hand any other value back unchanged."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


# Apply the unwrapping cell by cell, one column at a time.
for col in df.columns:
    df[col] = df[col].apply(_unwrap_singleton)

display(df)

Unnamed: 0,seqid,source,featuretype,start,end,strand,score,ID,Parent,gene_id,...,level,protein_id,transcript_support_level,hgnc_id,tag,ccdsid,havana_gene,havana_transcript,matchscore,seq
0,chr1,HAVANA,five_prime_UTR,207321532,207321765,+,.,UTR5:ENST00000367063.6,ENST00000367063.6,ENSG00000196352.16,...,2,ENSP00000356030.2,1,HGNC:2665,"[basic, CCDS]",CCDS73022.1,OTTHUMG00000036255.7,OTTHUMT00000088210.2,,
1,chr1,HAVANA,exon,207321532,207321865,+,.,exon:ENST00000367063.6:1,ENST00000367063.6,ENSG00000196352.16,...,2,ENSP00000356030.2,1,HGNC:2665,"[basic, CCDS]",CCDS73022.1,OTTHUMG00000036255.7,OTTHUMT00000088210.2,,
2,chr1,HAVANA,transcript,207321532,207340766,+,.,ENST00000367063.6,ENSG00000196352.16,ENSG00000196352.16,...,2,ENSP00000356030.2,1,HGNC:2665,"[basic, CCDS]",CCDS73022.1,OTTHUMG00000036255.7,OTTHUMT00000088210.2,,
3,chr1,HAVANA,gene,207321532,207386804,+,.,ENSG00000196352.16,,ENSG00000196352.16,...,1,,,HGNC:2665,,,OTTHUMG00000036255.7,,,
4,chr1,HAVANA,five_prime_UTR,207321643,207321765,+,.,UTR5:ENST00000391921.9,ENST00000391921.9,ENSG00000196352.16,...,1,ENSP00000375788.4,1,HGNC:2665,"[basic, exp_conf]",,OTTHUMG00000036255.7,OTTHUMT00000088209.3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7242,chr1,K562,RE,207325670,207325680,+,,SNAI2_HUMAN.H11MO.0.A,,,...,,,,,,,,,5.703112,ACCAGGTGGC
7243,chr1,K562,RE,207325671,207325679,+,,SNAI1_HUMAN.H11MO.0.C,,,...,,,,,,,,,10.495118,CCAGGTGG
7244,chr1,K562,RE,207325671,207325679,+,,SNAI1_MOUSE.H11MO.0.C,,,...,,,,,,,,,10.495118,CCAGGTGG
7245,chr1,K562,RE,207325673,207325682,-,,Hic1_MA0739.1,,,...,,,,,,,,,8.212763,AGGTGGCAT


##### Use the following code to find an RE from the human genome browser

In [None]:
# Select every row whose ID matches the regulatory element of interest, then
# drop the columns that are entirely empty for that selection.
# dropna is chained so it returns a new frame; running it with inplace=True on
# the filtered slice is what triggered pandas' SettingWithCopyWarning.
lst_of_id = df[df["ID"] == 'Tcfl5_MA0632.1'].dropna(how='all', axis=1) ##Replace with the RE ID
lst_of_id

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lst_of_id.dropna(how='all', axis=1, inplace=True)


Unnamed: 0,seqid,source,featuretype,start,end,strand,ID,matchscore,seq
2498,chr1,K562,RE,207320875,207320885,-,Tcfl5_MA0632.1,8.485293,GGCGCGCGCG
2500,chr1,K562,RE,207320875,207320885,+,Tcfl5_MA0632.1,8.485293,GGCGCGCGCG
2514,chr1,K562,RE,207320877,207320887,-,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2516,chr1,K562,RE,207320877,207320887,+,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2532,chr1,K562,RE,207320879,207320889,-,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2534,chr1,K562,RE,207320879,207320889,+,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2548,chr1,K562,RE,207320881,207320891,-,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2550,chr1,K562,RE,207320881,207320891,+,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2560,chr1,K562,RE,207320883,207320893,-,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG
2562,chr1,K562,RE,207320883,207320893,+,Tcfl5_MA0632.1,8.071769,CGCGCGCGCG


In [None]:
# Keep only the hits whose match score equals the target once rounded to 4 decimal places.
lst_of_id.loc[lst_of_id['matchscore'].round(4) == 8.4853] ###Replace with matchscore

Unnamed: 0,seqid,source,featuretype,start,end,strand,ID,matchscore,seq
2498,chr1,K562,RE,207320875,207320885,-,Tcfl5_MA0632.1,8.485293,GGCGCGCGCG
2500,chr1,K562,RE,207320875,207320885,+,Tcfl5_MA0632.1,8.485293,GGCGCGCGCG


In [129]:
## TODO: enhancers/promoters are specific to cell type; work out how to merge that data type into this table.
## Candidate sources:
##   http://www.enhanceratlas.org/data/download/enhancer/hs/K562.bed
##   http://www.enhanceratlas.org/data/AllEPs/hs/K562_EP.txt  <- USE THIS
##   http://www.enhanceratlas.org/Data_format_EP_v2.txt  (explanation of the format)
## Print the column names currently in the table.
print(df.columns)

Index(['seqid', 'source', 'featuretype', 'start', 'end', 'strand', 'score',
       'ID', 'Parent', 'gene_id', 'transcript_id', 'gene_type', 'gene_name',
       'transcript_type', 'transcript_name', 'exon_number', 'exon_id', 'level',
       'protein_id', 'transcript_support_level', 'hgnc_id', 'tag', 'ccdsid',
       'havana_gene', 'havana_transcript', 'matchscore', 'seq'],
      dtype='object')
