### Import libraries

In [None]:
import os
import re
import csv
import shutil
import gmql as gl
import pandas as pd
import numpy as np

In [None]:
# paths

root_path = './../'
data_base_path = root_path + 'data/'
raw_data_path = data_base_path + 'raw/'
extracted_path = raw_data_path + 'extracted/'
# Create the deepest folder unconditionally: makedirs also builds any missing
# parent (data/, raw/) and exist_ok makes re-running this cell harmless.
# Guarding the call on raw_data_path would skip the creation of extracted/
# whenever raw/ is already present.
os.makedirs(extracted_path, exist_ok=True)

### Generate dataframe from Encode for selected Cell lines and Histone Marks

In [None]:
# estimated running time: ~150 minutes


def _first_csv_row(csv_path):
    """Return the first row of the CSV file at csv_path as a list of strings."""
    with open(csv_path, newline='') as handle:
        return list(csv.reader(handle))[0]


# selected cell lines and histone marks (first row of each CSV file)
cells = _first_csv_row(data_base_path+'cells.csv')
marks = _first_csv_row(data_base_path+'names.csv')

# switch to remote execution and log in to the GMQL service
# (no credentials are passed to login())
gl.set_mode("remote")
gl.set_remote_address("http://gmql.eu/gmql-rest/")
gl.login()

# load the three public HG19_ROADMAP_EPIGENOMICS datasets from the GMQL repository
narrow = gl.load_from_remote("HG19_ROADMAP_EPIGENOMICS_NARROW", owner="public")
broad = gl.load_from_remote("HG19_ROADMAP_EPIGENOMICS_BROAD", owner="public")
gapped = gl.load_from_remote("HG19_ROADMAP_EPIGENOMICS_GAPPED", owner="public")

# union of the three datasets
narrow_broad = narrow.union(broad)
total = narrow_broad.union(gapped)

# keep only the samples of the selected cell lines ...
selected_cells = total['epi__epigenome_id'].isin(cells)
cell_lines = total[selected_cells]

# ... and, among those, only the selected histone marks
selected_marks = cell_lines['exp__mark'].isin(marks)
histone_marks = cell_lines[selected_marks]

# materialize the filtered dataset under the raw data folder
hm = histone_marks.materialize(raw_data_path+"total_filtered", all_load=False)

### Convert the promoter file (GeneFile.txt) to a GMQL-readable GTF file

In [None]:
# read the Roadmap promoter table (tab-separated, no header row)
genefile = pd.read_csv(
    data_base_path+"GeneFile.txt",
    sep="\t",
    names=["seqname", "start", "end", "attribute"],
)

# promoters in "GeneFile.txt" are ± 10k: trim 5000 from each side
genefile["start"] += 5000
genefile["end"] -= 5000

# constant columns needed to complete the nine GTF fields
genefile = genefile.assign(
    source="Roadmap",
    feature="Gene",
    score=".",
    strand="+",
    frame=".",
)

# put the columns in GTF order
gtf_columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
genefile = genefile[gtf_columns]

# write the GTF file (tab-separated, no header, no index column)
genefile.to_csv(data_base_path+"GeneFile.gtf", sep="\t", header=False, index=False)

### Intersect promoter regions with Encode data

In [None]:
# the following GMQL operations run in local mode
gl.set_mode("local")

# non-coordinate GTF columns as (column index, name, type)
gtf_extra_fields = [
    (1, "source", "string"),
    (2, "feature", "string"),
    (5, "score", "float"),
    (7, "frame", "string"),
    (8, "attribute", "string"),
]

# custom parser for the promoter GTF file written to GeneFile.gtf
custom_parser = gl.parsers.RegionParser(
    chrPos=0,
    startPos=3,
    stopPos=4,
    strandPos=6,
    otherPos=gtf_extra_fields,
    coordinate_system='1-based',
)

# load the promoter dataset in pyGMQL
promoter = gl.load_from_file(path=data_base_path+"GeneFile.gtf", parser=custom_parser)

# join promoters with the filtered experiments using the DL(0) predicate,
# asking for the RIGHT (experiment) regions in the output
chip_promoter_0 = promoter.join(
    experiment=hm,
    genometric_predicate=[gl.DL(0)],
    output='RIGHT',
)

# metadata and region attributes to keep
kept_meta = ["EXP.epi__epigenome_id", "EXP.exp__mark", "EXP.manually_curated__format"]
kept_regs = ["EXP.signal", "REF.attribute"]
chip_promoter = chip_promoter_0.project(
    projected_meta=kept_meta,
    new_attr_dict=None,
    projected_regs=kept_regs,
    new_field_dict=None,
)

# materialize the results
results = chip_promoter.materialize(raw_data_path+"HG19_REMC_join_core_marks_all_tissues_dl0_1based")

### Re-organize information and extract the highest signal per gene

In [None]:
# folder holding the sample files of the promoter/experiment join; the name
# must match the output path given to materialize() for that join
gmql_path = raw_data_path+"HG19_REMC_join_core_marks_all_tissues_dl0_1based" + "/files/"

# sample names are the file names up to the first ".gdm" (the loop below reads
# both "<sample>.gdm" and "<sample>.gdm.meta"); the set removes the duplicates
# and sorting gives a reproducible processing order
gdm_list = sorted({i.split('.gdm', 1)[0] for i in os.listdir(gmql_path)})

# keep only samples whose name really starts with "S_"
# (a ".*S_" regex would also accept names that merely contain "S_")
gdm_list_filtered = [name for name in gdm_list if name.startswith("S_")]

In [None]:
for sample in gdm_list_filtered:

    # The .meta file is read as a header-less table indexed by its first
    # column; after transposing, every metadata key is a column and the
    # single remaining row carries the label 1.
    meta_file = pd.read_csv(gmql_path + sample + ".gdm.meta", sep="\t", index_col=0, header=None).T

    cell_line = meta_file['EXP.epi__epigenome_id'][1]
    data_format = meta_file['EXP.manually_curated__format'][1]
    mark = meta_file['EXP.exp__mark'][1]

    # one output folder per cell line and peak format; makedirs creates the
    # cell-line level too and exist_ok avoids a separate existence check
    out_dir = extracted_path + cell_line + "/" + data_format
    os.makedirs(out_dir, exist_ok=True)

    # open .gdm file (header-less, tab-separated region data)
    gdm_file = pd.read_csv(gmql_path + sample + ".gdm", sep = "\t", names = ["chr", "start", "stop", "strand","score", "gene_name"])

    # extract highest signal per gene; the index is written as the first
    # column, so readers must take gene_name and score from columns 1 and 2
    max_score = gdm_file.groupby(['gene_name'], as_index=False)['score'].max()
    max_score.to_csv(out_dir + "/" + mark + ".txt", sep="\t")

### Prepare data matrices for training

In [None]:
# gene names: fourth column of GeneFile.txt, kept as a one-column DataFrame
genes = pd.read_csv(
    data_base_path+"GeneFile.txt",
    sep="\t",
    names=["seqname", "start", "end", "gene_name"],
    usecols=[3],
)
df_genes = genes['gene_name'].to_frame()

In [None]:
def generate_design_matrix(root_folder, cell_line, formats, histone_marks):
    """Build the feature matrix of one cell line from its per-mark score files.

    The gene list is read from "GeneFile.txt" under the module-level
    ``data_base_path``. For every mark, the file
    ``root_folder + cell_line + "/" + formats[i] + "/" + mark + ".txt"`` is
    read and left-joined onto the gene list by ``gene_name``; genes without a
    score in that file get 0.0.

    Parameters
    ----------
    root_folder : str
        Folder (with trailing separator) containing one sub-folder per cell line.
    cell_line : str
        Name of the cell-line sub-folder.
    formats : sequence of str
        Peak-format sub-folder to use for each mark; ``formats[i]`` is paired
        with ``histone_marks[i]``.
    histone_marks : sequence of str
        Mark names, i.e. the base names of the score files.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per mark, in the order of ``histone_marks``,
        and one row per gene of "GeneFile.txt" (assuming gene names are unique
        within each score file).
    """
    gene_table = pd.read_csv(
        data_base_path+"GeneFile.txt",
        sep="\t",
        names=["seqname", "start", "end", "gene_name"],
        usecols=[3],
    )
    gene_frame = gene_table['gene_name'].to_frame()

    def _scores_for(position, mark):
        # score file of this mark in the format chosen for its position
        score_file = root_folder + cell_line + "/" + formats[position] + "/" + mark + ".txt"
        # column 0 of the file is a saved index; gene_name and score follow
        scores = pd.read_csv(
            score_file,
            sep="\t",
            header=0,
            dtype={'gene_name': str, 'score': np.float64},
            usecols=[1, 2],
        )
        aligned = gene_frame.merge(scores, how='left', on='gene_name')
        aligned = aligned.fillna(value=0.0)
        return aligned['score'].to_numpy()

    per_mark = [_scores_for(position, mark) for position, mark in enumerate(histone_marks)]
    return np.column_stack(per_mark)

In [None]:
# peak format to use for each histone mark, per input configuration; the
# lists are passed as `formats` to generate_design_matrix together with `marks`
configurations = {
    '- broad': ['broadPeak', 'broadPeak', 'broadPeak', 'broadPeak', 'broadPeak'],
    '- narrow': ['narrowPeak', 'narrowPeak', 'narrowPeak', 'narrowPeak', 'narrowPeak'],
    '- gapped': ['gappedPeak', 'gappedPeak', 'gappedPeak', 'gappedPeak', 'gappedPeak'],
    '- broad narrow': ['broadPeak', 'broadPeak', 'narrowPeak', 'narrowPeak', 'broadPeak'],
    '- gapped broad': ['gappedPeak', 'gappedPeak', 'broadPeak', 'broadPeak', 'gappedPeak'],
    '- gapped narrow': ['gappedPeak', 'gappedPeak', 'narrowPeak', 'narrowPeak', 'gappedPeak']}

# for every configuration and cell line: write the design matrix X.csv and
# copy the matching target file T.csv next to it
for conf, formats in configurations.items():
    print('\n>> input configuration: ', conf)
    for cell in cells:
        print('\r\tcell: {}   '.format(cell), end='')
        result_folder = '{}/{}/{}/'.format(data_base_path, conf, cell)
        # exist_ok tolerates re-runs while still surfacing any other OS error
        # (e.g. permissions) instead of silently ignoring it
        os.makedirs(result_folder, exist_ok=True)
        x = generate_design_matrix(extracted_path, cell, formats, marks)
        np.savetxt(result_folder+'X.csv', x, delimiter=',')
        shutil.copy(data_base_path+'- targets/{}/T.csv'.format(cell), result_folder+'T.csv')