In [146]:
import pandas as pd

In [174]:
# Input locations: participant metadata, RNA-seq count table, gene annotation.
metadataindir = "/Shares/down/public/HTP/RNAseq/inputdata/"
participanttable = "include-participant-table-2023-08-22.tsv"

count_dir = "/Shares/down/public/HTP/RNAseq/outputdata/T21vsD21_noDNAdosagecorrection/"
count_file = f"{count_dir}normcounts.csv"

geneindir = "/Shares/down/public/HTP/RNAseq/selfannotated/"
genefile = f"{geneindir}genes.csv"

# Destination directory for the CSV files written by the cells below.
outdir = "/Shares/down/public/HTP/RNAseq/selfannotated/AREA/"

In [175]:
# Participant table (tab-separated); createbinary reads its annotation columns.
metadatadf = pd.read_csv(f"{metadataindir}{participanttable}", sep="\t")


In [176]:
def createbinary(metadatadf, importantcol, spliton, findcode, appendname):
    """Turn a free-text ontology annotation column into 0/1 indicator columns.

    Each cell of ``metadatadf[importantcol]`` is expected to hold zero or more
    terms written as ``label (PREFIX:code)`` and separated by ``", "``.
    Missing cells (NaN) are treated as carrying no terms.

    Parameters
    ----------
    metadatadf : pandas.DataFrame
        Participant table; must contain ``"Participant ID"`` and ``importantcol``.
    importantcol : str
        Name of the column holding the annotation strings.
    spliton : str
        Regular expression that separates a term's label from its code
        (for example the pattern matching the literal text `` (HP:``).
    findcode : str
        Literal substring that marks a fragment as a term, e.g. ``" (HP:"``.
    appendname : str
        Ontology prefix (``"HP"``, ``"MONDO"``, ...). It names the code column
        of the summary, prefixes every indicator column name, and is the
        prefix searched for as ``(<appendname>:<code>)``.

    Returns
    -------
    df : pandas.DataFrame
        One row per distinct term with columns ``ori_name``, ``phenotype``,
        ``<appendname>``, ``colname`` and ``counts`` (number of rows flagged).
    bianary_table : pandas.DataFrame
        Indexed by ``"Participant ID"``, one 0/1 column per term.
    """
    # NaN cells would break str.split / substring tests; "" means "no terms".
    annotations = ["" if pd.isna(value) else str(value) for value in metadatadf[importantcol]]
    participant_index = pd.Index(metadatadf["Participant ID"])

    # Every term ends with ")", so splitting there yields "label (PREFIX:code"
    # fragments once the ", " separators are stripped.
    # NOTE(review): a label that itself contains ")" is cut mid-label; when
    # findcode starts with a space the leftover "(PREFIX:code" fragment no
    # longer matches and that term is dropped. Confirm this is acceptable.
    terms = sorted({
        fragment.strip(", ")
        for text in annotations
        for fragment in text.split(")")
        if findcode in fragment.strip(", ")
    })

    if not terms:
        # Nothing to encode: return empty results with the usual layout
        # instead of failing on the column assignments below.
        empty_summary = pd.DataFrame(
            columns=["ori_name", "phenotype", appendname, "colname", "counts"]
        )
        return empty_summary, pd.DataFrame(index=participant_index)

    df = pd.DataFrame({"ori_name": terms})
    df[["phenotype", appendname]] = df["ori_name"].str.split(spliton, expand=True)
    df["colname"] = appendname + "_" + df["phenotype"].str.replace(" ", "_")

    # The searched token is built from appendname, so any ontology prefix
    # works, not only HP and MONDO.
    new_columns = {}
    for code, colname in zip(df[appendname], df["colname"]):
        token = f"({appendname}:{code})"
        new_columns[colname] = [1 if token in text else 0 for text in annotations]

    # Built as a fresh frame (not a slice of another frame) so callers can add
    # columns to it without a SettingWithCopyWarning.
    bianary_table = pd.DataFrame(new_columns, index=participant_index)

    # Attach, for every term, how many rows carry it.
    counteach = bianary_table.sum().to_frame(name="counts")
    counteach["colname"] = counteach.index
    df = df.merge(counteach, on="colname")
    return df, bianary_table

In [177]:
# Arguments for the MONDO diagnosis pass of createbinary.
importantcol = "Diagnosis (Mondo)"
# Raw string: "\(" is a regex escape, not a Python string escape. The non-raw
# spelling is an invalid escape sequence (a warning today, an error in future
# Python versions); the resulting pattern text is unchanged.
spliton = r" \(MONDO:"
findcode = " (MONDO:"
appendname = "MONDO"
mondodf, mondobianary_table = createbinary(metadatadf, importantcol, spliton, findcode, appendname)

In [178]:
# Arguments for the HPO phenotype pass of createbinary.
importantcol = "Phenotype (HPO)"
# Raw string: "\(" is a regex escape, not a Python string escape. The non-raw
# spelling is an invalid escape sequence (a warning today, an error in future
# Python versions); the resulting pattern text is unchanged.
spliton = r" \(HP:"
findcode = " (HP:"
appendname = "HP"
HPdf, HPbianary_table = createbinary(metadatadf, importantcol, spliton, findcode, appendname)

In [179]:
# Copy the participant identifiers from the index into a "Patient" column.
# assign() returns a new frame, so nothing is written into a frame that pandas
# may have flagged as a slice (avoids SettingWithCopyWarning).
HPbianary_table = HPbianary_table.assign(Patient=HPbianary_table.index)
mondobianary_table = mondobianary_table.assign(Patient=mondobianary_table.index)

In [180]:
# Replace the participant index with a plain 0..n-1 index and make sure no
# "Participant ID" column is left behind.
HPbianary_table = (
    HPbianary_table
    .reset_index(drop=True)
    .drop(columns="Participant ID", errors="ignore")
)
mondobianary_table = (
    mondobianary_table
    .reset_index(drop=True)
    .drop(columns="Participant ID", errors="ignore")
)


In [181]:
# Term summaries and full indicator tables, one CSV each.
HPdf.to_csv(f"{outdir}full_HP_counts_colnames.csv")
HPbianary_table.to_csv(f"{outdir}full_HP_binary_attribute.csv")

mondodf.to_csv(f"{outdir}full_MONDO_counts_colnames.csv")
mondobianary_table.to_csv(f"{outdir}full_MONDO_binary_attribute.csv")

In [182]:
# Count table as stored on disk; its first column becomes the row index.
countdf = pd.read_csv(count_file, index_col=0)

# Swap rows and columns so the file's column labels become row labels.
value_df = countdf.transpose()



In [183]:
# Move the row labels into a regular "Patient" column, switch to a 0..n-1
# index, and make sure no "Participant ID" column is present.
value_df = (
    value_df
    .assign(Patient=value_df.index)
    .reset_index(drop=True)
    .drop(columns="Participant ID", errors="ignore")
)


In [184]:
# Persist the transposed count table (one row per Patient).
value_df.to_csv(f"{outdir}RNAvalues_normcounts.csv")

In [185]:
# Participants present in the RNA value table and in the MONDO indicator table.
participantswithRNA = value_df["Patient"].tolist()
participantswithconditions = mondobianary_table["Patient"].tolist()

# Set lookup gives constant-time membership tests; the comprehensions below
# still walk participantswithRNA in order, so the resulting lists are the same
# as with list membership.
participantswithconditions_set = set(participantswithconditions)

participantswithRNA_notconditionstable = [
    participant for participant in participantswithRNA
    if participant not in participantswithconditions_set
]

participantswithRNA_conditions = [
    participant for participant in participantswithRNA
    if participant in participantswithconditions_set
]
 

In [186]:
# How many RNA participants have no row in the MONDO indicator table.
len(participantswithRNA_notconditionstable)

0

In [187]:
# How many RNA participants also appear in the MONDO indicator table.
len(participantswithRNA_conditions)

400

In [188]:
# Total number of entries in the RNA value table's Patient column.
len(participantswithRNA)

400

In [189]:
# Total number of entries in the MONDO indicator table's Patient column.
len(participantswithconditions)

700

In [190]:
# Keep only the indicator rows whose Patient also has RNA values.
HPbianary_table_withRNA = HPbianary_table.loc[
    HPbianary_table["Patient"].isin(participantswithRNA)
]
mondobianary_table_withRNA = mondobianary_table.loc[
    mondobianary_table["Patient"].isin(participantswithRNA)
]


In [191]:
# (rows, columns) of the full HP indicator table; the columns include "Patient".
HPbianary_table.shape

(700, 152)

In [192]:
# (rows, columns) of the HP indicator table restricted to RNA participants.
HPbianary_table_withRNA.shape

(400, 152)

In [193]:
# (rows, columns) of the full MONDO indicator table; the columns include "Patient".
mondobianary_table.shape

(700, 147)

In [194]:
# (rows, columns) of the MONDO indicator table restricted to RNA participants.
mondobianary_table_withRNA.shape

(400, 147)

In [195]:

# Indicator tables limited to participants that have RNA values.
HPbianary_table_withRNA.to_csv(f"{outdir}HP_binary_attribute_withRNA.csv")
mondobianary_table_withRNA.to_csv(f"{outdir}MONDO_binary_attribute_withRNA.csv")

In [196]:
# MONDO indicator columns used below to decide whether a participant is
# flagged with any Down syndrome / trisomy 21 diagnosis.
DS_conditions_cols = [
    "MONDO_Down_syndrome",
    "MONDO_mosaic_translocation_Down_syndrome",
    "MONDO_translocation_Down_syndrome",
    "MONDO_complete_trisomy_21",
    "MONDO_mosaic_trisomy_21",
]



In [197]:
# Down syndrome indicator columns for RNA participants, indexed by Patient.
whichT21_withRNA = (
    mondobianary_table_withRNA
    .set_index("Patient")[DS_conditions_cols]
    .copy()
)

In [198]:
# Down syndrome indicator columns for every participant, indexed by Patient.
whichT21 = (
    mondobianary_table
    .set_index("Patient")[DS_conditions_cols]
    .copy()
)

In [199]:
# "Disomic" is True when none of the Down syndrome indicator columns is set
# (their row sum is zero).
whichT21_withRNA["Disomic"] = whichT21_withRNA[DS_conditions_cols].sum(axis=1).eq(0)
whichT21["Disomic"] = whichT21[DS_conditions_cols].sum(axis=1).eq(0)


In [200]:
# Karyotype indicator tables (index = Patient), one CSV each.
whichT21_withRNA.to_csv(f"{outdir}whichT21_withRNA.csv")
whichT21.to_csv(f"{outdir}whichT21.csv")

In [201]:
# RNA participants flagged with the MONDO "complete trisomy 21" term.
completeT21 = whichT21_withRNA.loc[whichT21_withRNA["MONDO_complete_trisomy_21"].eq(1)]

In [202]:
# Indicator rows for the complete-trisomy-21 participants only
# (completeT21 is indexed by Patient).
completeT21HPbianary_table_withRNA = HPbianary_table_withRNA.loc[
    HPbianary_table_withRNA["Patient"].isin(completeT21.index)
]
completeT21mondobianary_table_withRNA = mondobianary_table_withRNA.loc[
    mondobianary_table_withRNA["Patient"].isin(completeT21.index)
]


In [203]:
# (rows, columns) of the MONDO indicator table for complete-trisomy-21 RNA participants.
completeT21mondobianary_table_withRNA.shape

(254, 147)

In [204]:
# RNA value rows for the complete-trisomy-21 participants only.
completeT21value_df = value_df.loc[value_df["Patient"].isin(completeT21.index)]

In [205]:
# (rows, columns) of the RNA values for complete-trisomy-21 participants, before any column filtering.
completeT21value_df.shape

(254, 58090)

In [206]:
# Gene annotation table; the "seqnames" and "gene_id" columns are used below
# to pick out chromosome 21 genes.
genesdf = pd.read_csv(genefile)
# Bare expression: display the table as the cell output.
genesdf

Unnamed: 0.1,Unnamed: 0,seqnames,start,end,width,strand,source,type,gene_id,gene_version,gene_name,gene_source,gene_biotype
0,1,1,11869,14409,2541,+,havana,gene,ENSG00000223972,5,DDX11L1,havana,transcribed_unprocessed_pseudogene
1,2,1,14404,29570,15167,-,havana,gene,ENSG00000227232,5,WASH7P,havana,unprocessed_pseudogene
2,3,1,17369,17436,68,-,mirbase,gene,ENSG00000278267,1,MIR6859-1,mirbase,miRNA
3,4,1,29554,31109,1556,+,havana,gene,ENSG00000243485,5,MIR1302-2HG,havana,lincRNA
4,5,1,30366,30503,138,+,mirbase,gene,ENSG00000284332,1,MIR1302-2,mirbase,miRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58730,58731,KI270711.1,4612,29626,25015,-,ensembl,gene,ENSG00000271254,6,AC240274.1,ensembl,protein_coding
58731,58732,KI270713.1,21861,22024,164,-,ensembl,gene,ENSG00000275405,1,RF00003,ensembl,snRNA
58732,58733,KI270713.1,30437,30580,144,-,ensembl,gene,ENSG00000275987,1,RF00003,ensembl,snRNA
58733,58734,KI270713.1,31698,32528,831,-,ensembl,gene,ENSG00000277475,1,AC213203.2,ensembl,protein_coding


In [207]:
# Annotation rows whose sequence name is exactly the string "21".
genesdfchr21 = genesdf.loc[genesdf["seqnames"].eq("21")]
genesdfchr21.shape

(833, 13)

In [208]:
# Build the chromosome 21 gene-id lookup once. The previous form rebuilt the
# id list and scanned it linearly for every column of the value table.
chr21_gene_ids = set(genesdfchr21["gene_id"])

# Keep "Patient" plus every value column whose name is a chromosome 21 gene id,
# preserving the existing column order.
keepcols = ["Patient"] + [
    genename for genename in completeT21value_df.columns
    if genename in chr21_gene_ids
]

completeT21value_df = completeT21value_df[keepcols]

In [209]:
# (rows, columns) after keeping only "Patient" and the chromosome 21 gene columns.
completeT21value_df.shape

(254, 832)

In [210]:
# Chromosome 21 values for complete-trisomy-21 participants.
completeT21value_df.to_csv(f"{outdir}values_chr21_onlycompleteT21.csv")

In [211]:
# HP and MONDO indicator tables for complete-trisomy-21 participants.
completeT21HPbianary_table_withRNA.to_csv(f"{outdir}completeT21HPbianary_table_withRNA.csv")
completeT21mondobianary_table_withRNA.to_csv(f"{outdir}completeT21mondobianary_table_withRNA.csv")
