In [1]:
import os


import pandas as pd


def _format_feature_dataset(filepath, feature):
    file_df = pd.read_table(filepath, header=None, usecols=[0, 1, 2, 3])
    n_rows, _ = file_df.shape
    file_df = file_df.assign(feature=pd.Series([feature] * n_rows))
    file_df = file_df.assign(metadata_index=pd.Series([index] * n_rows))
    return file_df


def metadata_to_dict(filepath):
    """Parse a metadata listing into a nested dictionary.

    Every non-blank line is expected to hold a dataset name, a TAB, and then
    a ``;``-separated list of ``key=value`` items.

    Parameters
    ----------
    filepath : str
        Path to the listing file.

    Returns
    -------
    dict
        Maps each dataset name to a dict of its ``key -> value`` strings.
        If a dataset name appears more than once, the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line has no TAB, or a non-empty item has no ``=``.
    """
    metadata = {}
    with open(filepath, "r") as fh:
        for line in fh:
            # Blank lines (e.g. a trailing newline at end of file) carry no data.
            if not line.strip():
                continue
            # Split on the first TAB only, so later TABs stay in the info part.
            dataset, ds_info = line.rstrip("\r\n").split("\t", 1)
            info_dict = {}
            for kv in ds_info.split(";"):
                kv = kv.strip()
                # A trailing ';' yields an empty item; there is nothing to parse.
                if not kv:
                    continue
                # Split on the first '=' only, so values may contain '='.
                key, val = kv.split("=", 1)
                info_dict[key] = val
            metadata[dataset] = info_dict
    return metadata




In [2]:
# NOTE(review): all locations below are absolute, machine-specific paths;
# adjust them (or derive them from a configurable base directory) before
# running this notebook elsewhere.
features_file = "/home/tt419/Projects/DeepLearning/PhDeep/data/ENCODE_ftp/label_names.txt"
# http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeOpenChromDnase/
ENCODE_DNase = "/home/tt419/Projects/DeepLearning/PhDeep/data/ENCODE_ftp/DNAse/"
# http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/
ENCODE_TF = "/home/tt419/Projects/DeepLearning/PhDeep/data/ENCODE_ftp/TFChip/"
# https://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/
Roadmap_Epi = "/home/tt419/Projects/DeepLearning/PhDeep/data/Selene/chromatin_profiles/Roadmap_Epigenomics/"

# Metadata listings that describe the files in each data directory.
ENC_DNase_file = os.path.join(
        ENCODE_DNase, "files_DNAse.txt")
ENC_TF_file = os.path.join(
        ENCODE_TF, "files_TFChip.txt")
Roadmap_file = os.path.join(
        Roadmap_Epi,
        "jul2013.roadmapData.qc - "
        "Consolidated_EpigenomeIDs_summary_Table.tsv")


output_dir = "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Selene_data/selene_ftp_output/"
# makedirs(..., exist_ok=True) creates missing parent directories as well and
# does not fail when the directory already exists, so no separate existence
# check (which could race with another process) is needed.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# ENCODE listings: file name -> {key: value} annotations for that file.
DNase_metadata = metadata_to_dict(ENC_DNase_file)
TFs_metadata = metadata_to_dict(ENC_TF_file)
# Roadmap summary table, keyed by epigenome ID so that a row can be fetched
# with Roadmap_metadata.loc[EID].
Roadmap_metadata = pd.read_table(Roadmap_file).set_index("Epigenome ID (EID)")


In [11]:
# unique features mapped to number of duplicates
deepsea_features = {}
# NOTE(review): the two ENCODE counting loops below are commented out, so
# only Roadmap features are counted in this cell. The ENCODE DNase/TF cells
# skip every feature that is missing from `deepsea_features`; re-enable these
# loops before running those cells.
# for index, (filename, info) in enumerate(DNase_metadata.items()):
#     # just to handle some edge cases
#     filename_split = filename.split(".")
#     if len(filename_split) > 3:
#         filename = "".join(filename_split[:-2]) + ".narrowPeak.gz"
#     filepath = os.path.join(ENCODE_DNase, filename)
#     feature = "{0}|DNase|{1}".format(info["cell"], info["treatment"])
#     if feature not in deepsea_features:
#         deepsea_features[feature] = 0
#     deepsea_features[feature] += 1

# for index, (filename, info) in enumerate(TFs_metadata.items()):
#     # just to handle some edge cases
#     filename_split = filename.split(".")
#     if len(filename_split) > 3:
#         filename = "".join(filename_split[:-2]) + ".narrowPeak.gz"
#     filepath = os.path.join(ENCODE_DNase, filename)
#     feature = "{0}|{1}|{2}".format(
#         info["cell"], info["antibody"].split('_')[0], info["treatment"])
#     if feature not in deepsea_features:
#         deepsea_features[feature] = 0
#     deepsea_features[feature] += 1

# Count how many Roadmap peak files map to each feature name.
for index, filename in enumerate(os.listdir(Roadmap_Epi)):
    # The suffix is sliced off the END of the name below, so require it to be
    # at the end; a substring test would also accept names such as
    # "x.narrowPeak.gz.tbi" and then strip the wrong characters.
    if not filename.endswith(".narrowPeak.gz"):
        continue
    filepath = os.path.join(Roadmap_Epi, filename)
    filename = filename[:-len(".narrowPeak.gz")]
    # What remains is "<EID>-<info>".
    EID, info = filename.split("-")

    row = Roadmap_metadata.loc[EID]
    # handling the edge cases
    cell_type = row.get("DONOR / SAMPLE ALIAS")
    if cell_type == "RO01746":
        cell_type = "Monocytes-CD14+RO01746 "
    if cell_type == "Osteobl":
        cell_type = "Osteoblasts"
    if "hESC-01" in cell_type:
        cell_type = "H1-hESC"

    if info == "H2A.Z":
        info = "H2AZ"

    feature = "{0}|{1}|None".format(cell_type, info)
    deepsea_features[feature] = deepsea_features.get(feature, 0) + 1

# Accumulators filled by the later per-source cells.
all_features_with_dups = []
all_features = []




In [None]:
# One tagged DataFrame per ENCODE DNase peak file that is kept.
dfs_to_concat = []

for index, (filename, info) in enumerate(DNase_metadata.items()):
    # Edge case: a name with more than three dot-separated parts is rebuilt
    # by dropping its last two parts, gluing the rest together without dots,
    # and appending ".narrowPeak.gz".
    filename_split = filename.split(".")
    if len(filename_split) > 3:
        filename = "{0}.narrowPeak.gz".format("".join(filename_split[:-2]))
    filepath = os.path.join(ENCODE_DNase, filename)
    feature = "|".join([info["cell"], "DNase", info["treatment"]])
    if feature not in deepsea_features:
        continue
    all_features_with_dups.append(feature)

    # deepsea_features[feature] holds how many datasets with this name may
    # still be emitted; once it is used up, further duplicates are dropped.
    if deepsea_features[feature] <= 0:
        continue
    deepsea_features[feature] -= 1
    if deepsea_features[feature] >= 1:
        # More duplicates remain: disambiguate with the remaining count.
        # NOTE(review): the Roadmap cells append (remaining count + 1)
        # instead; confirm which numbering is intended.
        feature = "{0}|{1}".format(feature, deepsea_features[feature])

    all_features.append(feature)
    dfs_to_concat.append(_format_feature_dataset(filepath, feature))

# Stack all kept files and order the rows by columns 0, 1, 2.
DNase_agg = pd.concat(dfs_to_concat, ignore_index=True).sort_values(
    [0, 1, 2], ascending=True)

print(DNase_agg.head())

In [5]:
# One tagged DataFrame per ENCODE TF ChIP peak file that is kept.
dfs_to_concat = []

for index, (filename, info) in enumerate(TFs_metadata.items()):
    filepath = os.path.join(ENCODE_TF, filename)
    # Listed files that are not present on disk are skipped silently.
    if not os.path.isfile(filepath):
        continue
    # Only the part of the antibody name before the first "_" is used.
    feature = "|".join(
        [info["cell"], info["antibody"].split('_')[0], info["treatment"]])
    if feature not in deepsea_features:
        continue
    all_features_with_dups.append(feature)

    # deepsea_features[feature] holds how many datasets with this name may
    # still be emitted; once it is used up, further duplicates are dropped.
    if deepsea_features[feature] <= 0:
        continue
    deepsea_features[feature] -= 1
    if deepsea_features[feature] >= 1:
        # More duplicates remain: disambiguate with the remaining count.
        # NOTE(review): the Roadmap cells append (remaining count + 1)
        # instead; confirm which numbering is intended.
        feature = "{0}|{1}".format(feature, deepsea_features[feature])

    all_features.append(feature)
    dfs_to_concat.append(_format_feature_dataset(filepath, feature))

# Stack all kept files and order the rows by columns 0, 1, 2.
ChIP_agg = pd.concat(dfs_to_concat, ignore_index=True).sort_values(
    [0, 1, 2], ascending=True)
print(ChIP_agg.head())

             0      1      2  3             feature  metadata_index
4261851   chr1  10073  10329  .    K562|ZBTB33|None             246
3548416   chr1  10144  10263  .   HepG2|ZBTB33|None             204
5683966   chr1  10149  10413  .  H1-hESC|CEBPB|None             343
11985525  chr1  16110  16390  .   HCPEpiC|CTCF|None             662
3721671   chr1  16118  16362  .    K562|CTCF|None|3             213


In [6]:
# Merge the ENCODE DNase and TF intervals. The Roadmap frames (EID_agg) are
# deliberately left out of this concat because they are not available at this
# point in the notebook.
full_aggregate_file = pd.concat([DNase_agg, ChIP_agg], ignore_index=True)
full_aggregate_file.columns = ["chrom", "start", "end", "strand", "feature", "metadata_index"]
output_file = os.path.join(output_dir, "selene_fullFeatures_unsorted.bed")
# Append mode lets the output be built up in several steps. Side effect:
# re-running this cell appends the same rows again.
with open(output_file, 'a+') as file_handle:
    file_handle.writelines(
        "{0}\t{1}\t{2}\t{3}\n".format(row.chrom, row.start, row.end, row.feature)
        for row in full_aggregate_file.itertuples())

print("Total number of features: {0}".format(len(all_features)))
output_features = os.path.join(output_dir, "distinct_features.txt")
# Also appended step by step; each step contributes its own sorted chunk.
with open(output_features, 'a+') as file_handle:
    features = sorted(all_features)
    file_handle.writelines("{0}\n".format(f) for f in features)

Total number of features: 815


In [7]:
# Drop the names of the two per-source ENCODE aggregates; they are not used
# again below.
del DNase_agg, ChIP_agg

In [12]:

dfs_to_concat = []
EID_features = []
# Roadmap Epigenomic features (DNase, histone marks)
# NOTE: this loop consumes the counts in `deepsea_features`. Once they reach
# zero every file is skipped, so the counting cell has to be re-run before
# this loop is executed a second time.
for index, filename in enumerate(os.listdir(Roadmap_Epi)):
    # The suffix is sliced off the END of the name below, so require it to be
    # at the end; a substring test would also accept names such as
    # "x.narrowPeak.gz.tbi" and then strip the wrong characters.
    if not filename.endswith(".narrowPeak.gz"):
        continue
    filepath = os.path.join(Roadmap_Epi, filename)
    filename = filename[:-len(".narrowPeak.gz")]
    # What remains is "<EID>-<info>".
    EID, info = filename.split("-")

    row = Roadmap_metadata.loc[EID]
    # handling the edge cases
    cell_type = row.get("DONOR / SAMPLE ALIAS")
    if cell_type == "RO01746":
        cell_type = "Monocytes-CD14+RO01746 "
    if cell_type == "Osteobl":
        cell_type = "Osteoblasts"
    if "hESC-01" in cell_type:
        cell_type = "H1-hESC"

    if info == "H2A.Z":
        info = "H2AZ"

    feature = "{0}|{1}|None".format(cell_type, info)

    if feature not in deepsea_features:
        continue
    all_features_with_dups.append(feature)

    # Budget for this feature name is used up: drop further duplicates.
    if deepsea_features[feature] <= 0:
        continue
    deepsea_features[feature] -= 1
    if deepsea_features[feature] >= 1:
        # More duplicates remain: disambiguate with (remaining count + 1).
        # NOTE(review): the ENCODE cells append the remaining count without
        # the +1; confirm which numbering is intended.
        feature = "{0}|{1}".format(feature, deepsea_features[feature] + 1)

    EID_features.append(feature)
    all_features.append(feature)
    # Each file is sorted on its own; the frames are written one by one in a
    # later cell rather than being concatenated.
    dfs_to_concat.append(_format_feature_dataset(
        filepath, feature).sort_values([0, 1, 2], ascending=True))

# EID_agg = pd.concat(dfs_to_concat, ignore_index=True)
# EID_agg.sort_values([0, 1, 2], ascending=True, inplace=True)
# print(EID_agg.head())

In [None]:
# Write the Roadmap frames one at a time instead of concatenating them first.
for EID_agg in dfs_to_concat:
    full_aggregate_file = EID_agg
    full_aggregate_file.columns = ["chrom", "start", "end", "strand", "feature", "metadata_index"]
    output_file = os.path.join(output_dir, "selene_fullFeatures_unsorted.bed")
    # Append mode lets the output be built up in several steps. Side effect:
    # re-running this cell appends the same rows again.
    with open(output_file, 'a+') as file_handle:
        file_handle.writelines(
            "{0}\t{1}\t{2}\t{3}\n".format(row.chrom, row.start, row.end, row.feature)
            for row in full_aggregate_file.itertuples())

# The total covers every feature collected so far; only the Roadmap feature
# names are appended to the list file in this step.
print("Total number of features: {0}".format(len(all_features)))
output_features = os.path.join(output_dir, "distinct_features.txt")
with open(output_features, 'a+') as file_handle:
    features = sorted(EID_features)
    file_handle.writelines("{0}\n".format(f) for f in features)

In [None]:

dfs_to_concat = []
EID_features = []
# Roadmap Epigenomic features (DNase, histone marks)
# Streaming variant: every file is appended to the output as soon as it has
# been read, so the frames are never all held in memory at once.
# NOTE: this loop consumes the counts in `deepsea_features`. Once they reach
# zero every file is skipped, so the counting cell has to be re-run before
# this loop is executed again.
for index, filename in enumerate(os.listdir(Roadmap_Epi)):
    # The suffix is sliced off the END of the name below, so require it to be
    # at the end; a substring test would also accept names such as
    # "x.narrowPeak.gz.tbi" and then strip the wrong characters.
    if not filename.endswith(".narrowPeak.gz"):
        continue
    filepath = os.path.join(Roadmap_Epi, filename)
    filename = filename[:-len(".narrowPeak.gz")]
    # What remains is "<EID>-<info>".
    EID, info = filename.split("-")

    row = Roadmap_metadata.loc[EID]
    # handling the edge cases
    cell_type = row.get("DONOR / SAMPLE ALIAS")
    if cell_type == "RO01746":
        cell_type = "Monocytes-CD14+RO01746 "
    if cell_type == "Osteobl":
        cell_type = "Osteoblasts"
    if "hESC-01" in cell_type:
        cell_type = "H1-hESC"

    if info == "H2A.Z":
        info = "H2AZ"

    feature = "{0}|{1}|None".format(cell_type, info)

    if feature not in deepsea_features:
        continue
    all_features_with_dups.append(feature)

    # Budget for this feature name is used up: drop further duplicates.
    if deepsea_features[feature] <= 0:
        continue
    deepsea_features[feature] -= 1
    if deepsea_features[feature] >= 1:
        # More duplicates remain: disambiguate with (remaining count + 1).
        # NOTE(review): the ENCODE cells append the remaining count without
        # the +1; confirm which numbering is intended.
        feature = "{0}|{1}".format(feature, deepsea_features[feature] + 1)

    EID_features.append(feature)
    all_features.append(feature)

    full_aggregate_file = _format_feature_dataset(
        filepath, feature).sort_values([0, 1, 2], ascending=True)
    full_aggregate_file.columns = ["chrom", "start", "end", "strand", "feature", "metadata_index"]
    output_file = os.path.join(output_dir, "selene_fullFeatures_unsorted.bed")
    # Append mode lets the output be built up file by file. Side effect:
    # re-running the loop appends the same rows again.
    with open(output_file, 'a+') as file_handle:
        for row in full_aggregate_file.itertuples():
            file_handle.write("{0}\t{1}\t{2}\t{3}\n".format(row.chrom, row.start, row.end, row.feature))

# EID_agg = pd.concat(dfs_to_concat, ignore_index=True)
# EID_agg.sort_values([0, 1, 2], ascending=True, inplace=True)
# print(EID_agg.head())