In [1]:
import os
import re
import sys
import glob
import time
import numpy as np
import pandas as pd

In [2]:
data_dir = "/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result"
data_list = [x for x in glob.glob("%s/*.txt" % data_dir) if "QTL.txt" not in x] ## modified ##
result_dir = "/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/formatted_result"

In [3]:
print(data_list)

['/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/clinvar.txt', '/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/COSMIC.txt', '/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/GWAS.txt']


In [4]:
def merge_multi_into_oneline(df):
#     print(df.head())
    merged_array, term_list = [], []
    columns = df.columns
    for i_col in columns:
        if ("m6A_Peak#" in i_col) or ("PseudoU_Peak#" in i_col) or ("m5C_Peak#" in i_col):
            term_list.append("?".join(list(set(df[i_col].tolist()))))
        else:
            term_list.append("@".join(list(set(df[i_col].tolist()))))
    merged_array.append(term_list)
    df_result = pd.DataFrame(merged_array, columns=columns)
    return df_result

In [5]:
def keep_the_col_same(df):
    order_col = ['ID',
     'm6A_Peak#ID#Conservation score#Spectrum width score#Gene_Name|Ensembl_Gene_ID|Gene_Type|Strand|Supported_sample(s)|PubmedID|source|Tissue',
     'PseudoU_Peak#ID#Conservation score#Spectrum width score#Gene_Name|Ensembl_Gene_ID|Gene_Type|Strand|Supported_sample(s)|PubmedID|source|Tissue',
     'm5C_Peak#ID#Conservation score#Spectrum width score#Gene_Name|Ensembl_Gene_ID|Gene_Type|Strand|Supported_sample(s)|PubmedID|source|Tissue',
     'eQTL_Position|eQTL_Gene_Name|eQTL_RS_ID|eQTL_eQTL_type|eQTL_Pvalue|eQTL_Cell_or_tissue|eQTL_Condition|eQTL_PubMed_ID',
     'meQTL_Position|meQTL_Gene_Name|meQTL_ID|meQTL_Probe_Name|meQTL_RS_ID|meQTL_meQTL_type|meQTL_Pvalue|meQTL_Cell_or_tissue|meQTL_Condition|meQTL_PubMed_ID',
     'pQTL_Position|pQTL_Gene_Name|pQTL_Uniprot_ID|pQTL_RS_ID|pQTL_pQTL_type|pQTL_Pvalue|pQTL_Cell_or_tissue|pQTL_Condition|pQTL_PubMed_ID',
     'sQTL_Position|sQTL_Gene_Name|sQTL_Splicing_sites_Position|sQTL_Splicing_ID|sQTL_Event_type|sQTL_RS_ID|sQTL_sQTL_type|sQTL_Pvalue|sQTL_Cell_or_tissue|sQTL_Condition|sQTL_PubMed_ID',
     'circQTL_Position|circQTL_Gene_Name|circQTL_circRNA_ID|circQTL_RS_ID|circQTL_circQTL_type|circQTL_Pvalue|circQTL_Cell_or_tissue|circQTL_Condition|circQTL_PubMed_ID',
     'GWAS_Position|GWAS_Rs_ID|GWAS_Pubmed_ID|GWAS_Disease/Trait|GWAS_P_value|GWAS_Database|GWAS_Type|GWAS_TagSNP',
     'clinvar_Position|clinvar_rs_ID|clinvar_Significant|clinvar_Disease(s)|clinvar_Accession|clinvar_Study_name|clinvar_Review_status',
     'COSMIC_Position|COSMIC_RS_ID|COSMIC_Gene_name|COSMIC_Primary_site|COSMIC_Mutation_Description|COSMIC_Mutation_zygosity|COSMIC_FATHMM_prediction|COSMIC_Pubmed_ID',
     'RBP_Position|RBP_RBP|RBP_Species|RBP_Database|RBP_Study',
     'splicing_sites_Gene_ID|splicing_sites_Gene_Name|splicing_sites_Gene_Type|splicing_sites_Genomic_location|splicing_sites_Splicing_Site|splicing_sites_Relative_Position',
     'miRNA_Position|miRNA_Strand|miRNA_miRNA-RNA|miRNA_Target_RNA|miRNA_Target_RNA_Type|miRNA_Method|miRNA_Source']
    col_list = df.columns
#     five_qlt_cols = [x for x in col_list if "QTL" not in x]  #### modify #### 
#     print(len(five_qlt_cols))
    first_col, second_col = col_list[0], col_list[1]
    new_first_col = "|".join(["_".join(x.split("_")[1:]) for x in first_col.split("|")])
    df = df.rename(index=str, columns={first_col: new_first_col})
#     remain_cols = col_list[2:]
#     lost_col = [x for x in five_qlt_cols if x not in remain_cols][0]
    lost_col = col_list[0]
    print(lost_col)
    df[lost_col] = "NA"
    df_new = pd.DataFrame()
    for x in ([new_first_col]+order_col):
        df_new[x] = df[x]
    return df_new

In [6]:
for x in data_list:
    print(x)
    result_file = os.path.join(result_dir, os.path.basename(x))
    df = pd.read_table(x, sep="\t")
    df = df.fillna("NA").replace("clinvar_", "ClinVar_")
    col_name = df.columns[0]
    prefix = col_name.split("_")[0]
    # GWAS, ClinVar and COSMIC
    if ("TagSNP" in col_name) or ("Disease(s)" in col_name) or ("Primary_site" in col_name):
        df.insert(loc=0, column="Disease", value=[x.split("|")[3] for x in df[col_name]])
        col_name = "Disease"
    df_final = df.groupby([col_name]).apply(merge_multi_into_oneline)
    if "Disease" in df_final.columns:
        del df_final["Disease"]
        df_final["ID"] = ["%s_ID_%s" % (prefix, str(num).zfill(len(str(len(df_final))))) for num in range(1, len(df_final)+1)]
    df_final = keep_the_col_same(df_final)
    df_final.to_csv(result_file, sep="\t", index=False)

/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/clinvar.txt


  interactivity=interactivity, compiler=compiler, result=result)


clinvar_Position|clinvar_rs_ID|clinvar_Significant|clinvar_Disease(s)|clinvar_Accession|clinvar_Study_name|clinvar_Review_status
/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/COSMIC.txt


  interactivity=interactivity, compiler=compiler, result=result)


COSMIC_Position|COSMIC_RS_ID|COSMIC_Gene_name|COSMIC_Primary_site|COSMIC_Mutation_Description|COSMIC_Mutation_zygosity|COSMIC_FATHMM_prediction|COSMIC_Pubmed_ID
/home/galaxy/project/m6AQTL/data/data_for_website/final_data/search_by_QTL/raw_result/GWAS.txt


  interactivity=interactivity, compiler=compiler, result=result)


GWAS_Position|GWAS_Rs_ID|GWAS_Pubmed_ID|GWAS_Disease/Trait|GWAS_P_value|GWAS_Database|GWAS_Type|GWAS_TagSNP
