In [310]:
import pandas as pd
from collections import Counter
import plotly.express as px

# Set file paths

In [311]:
# Count tables produced by the kallisto -> DESeq step (all under count_dir).
count_dir = "/Shares/down/public/INLCUDE_2024/kallisto_20241030/kallisto_to_deseq_20241230/"
# Prefix used when naming the count tables this notebook writes.
countsetname = "kallisto_200401lines"
count_file = f"{count_dir}kallisto_to_DESeq_normcounts_200401.csv"
# NOTE(review): the name says "tpm" but the path is the raw-count table — confirm intent.
tpm_file = f"{count_dir}kallisto_to_DESeq_rawcounts_200401.csv"


In [312]:
# Metadata inputs; the three file names below are resolved against metadataindir.
metadataindir = "/Shares/down/public/INLCUDE_2024/kallisto_20241030/metadata/"
familymanifest = "include_familyManifest_20241029T155031Z.tsv"
clinicalfile = "include_familyClinicalData_20241028T223209Z.xlsx"
biospeciman = "include_biospecimenData_20241028T223221Z.xlsx"

# Output directory; the gene annotation table is read from the same place.
outdir = "/Shares/down/public/INLCUDE_2024/kallisto_20241030/selfannoated/"
geneindir = outdir
genefile = f"{geneindir}genes.csv"

# Read in files

In [313]:
# Read the three clinical sheets from one workbook. The context manager closes
# the workbook's file handle once the sheets are loaded instead of leaving it
# open for the rest of the kernel session.
with pd.ExcelFile(metadataindir+clinicalfile) as clinicalxls:
    Phenotypesdf = pd.read_excel(clinicalxls, 'Phenotypes')
    Diagnosesdf = pd.read_excel(clinicalxls, 'Diagnoses')
    Participantsdf = pd.read_excel(clinicalxls, 'Participants')

In [314]:
# Read the 'Biospecimens' sheet; the context manager closes the workbook's
# file handle as soon as the sheet is loaded.
with pd.ExcelFile(metadataindir+biospeciman) as biospecimanxls:
    Biospecimensdf = pd.read_excel(biospecimanxls, 'Biospecimens')

In [315]:
# The family manifest is a tab-separated table.
familymanifestdf = pd.read_csv(f"{metadataindir}{familymanifest}", sep="\t")

# Show which columns the manifest provides.
familymanifestdf.columns

Index(['Access URL', 'File ID', 'File Name', 'File Size', 'Data Category',
       'Data Type', 'File Format', 'Experimental Strategy', 'Hash',
       'Study Name', 'Participant ID', 'Sample ID', 'INCLUDE Container ID',
       'Down Syndrome Status', 'Family ID', 'External Participant ID',
       'External Sample ID', 'External Collection ID'],
      dtype='object')

In [316]:
# Lookup table between file names and participants: one row per distinct
# (file, participant) pair, with space-free column names.
switchdf = (
    familymanifestdf[["File Name", "Participant ID"]]
    .drop_duplicates()
    .rename(columns={"File Name": "FileName", "Participant ID": "Participant"})
)

In [317]:
def createbinary(metadatadf, importantcol, spliton, findcode, appendname):
    """Build a per-participant presence/absence table for coded terms.

    Each string cell of ``metadatadf[importantcol]`` is cut after every ")"
    and the pieces are stripped of leading/trailing ", " characters, so a cell
    such as ``"A (MONDO:1), B (MONDO:2)"`` yields the terms ``"A (MONDO:1"``
    and ``"B (MONDO:2"``. Only terms containing ``findcode`` are kept.

    Parameters
    ----------
    metadatadf : pandas.DataFrame
        Must contain ``importantcol`` and a "Participant ID" column.
    importantcol : str
        Column holding the coded terms.
    spliton : str
        Pattern passed to ``Series.str.split`` to separate a term's name from
        its code (callers in this notebook pass a regular expression).
    findcode : str
        Literal substring a term must contain to be kept.
    appendname : str
        Name of the code column in the summary and prefix of the generated
        column names (``<appendname>_<name with spaces as underscores>``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * summary with columns ``ori_name``, ``phenotype``, ``appendname``,
          ``colname`` and ``counts`` (number of participants per term);
        * boolean table indexed by participant with one column per term. It is
          returned as a consolidated copy so that adding columns to it later
          does not trigger pandas' "highly fragmented" PerformanceWarning.
          When no term matches, the summary is empty and the table has the
          participant index but no columns.
    """
    # Missing cells show up as float NaN; only strings can be parsed.
    cell_values = [v for v in metadatadf[importantcol].unique() if isinstance(v, str)]
    # Cut after every ")" so multi-term cells contribute one entry per term.
    terms = {piece.strip(", ") for cell in cell_values for piece in cell.split(")")}
    allphenotypes = sorted(term for term in terms if findcode in term)

    # One row per distinct participant, indexed by the participant id itself.
    participants = pd.DataFrame({"Participant": metadatadf["Participant ID"].unique()})
    participants.index = participants["Participant"]

    if not allphenotypes:
        # Nothing to encode: return empty results instead of failing further down.
        empty_summary = pd.DataFrame(
            columns=["ori_name", "phenotype", appendname, "colname", "counts"]
        )
        return empty_summary, pd.DataFrame(index=participants.index)

    df = pd.DataFrame({"ori_name": allphenotypes})
    # n=1 caps the split at two pieces (name, code) per term.
    df[["phenotype", appendname]] = df["ori_name"].str.split(spliton, n=1, expand=True)
    df["colname"] = appendname+"_"+df["phenotype"].str.replace(" ", "_")

    presence_columns = []
    for code, colname in zip(df[appendname], df["colname"]):
        # Rows whose cell mentions this code (missing cells never match).
        has_code = metadatadf[importantcol].str.contains(str(code), na=False)
        ids_with_code = metadatadf.loc[has_code, "Participant ID"]
        presence_columns.append(
            participants["Participant"].isin(ids_with_code).to_frame(name=colname)
        )
    # Concatenating many one-column frames leaves one internal block per
    # column; .copy() consolidates them into a de-fragmented frame.
    bianary_table = pd.concat(presence_columns, axis=1, join="inner").copy()

    # Participants per term, attached to the summary by generated column name.
    counteach = bianary_table.sum().rename("counts").rename_axis("colname").reset_index()
    df = df.merge(counteach, on="colname")
    return df, bianary_table

def participants_to_filenames(familymanifestdf, bianary_table):
    """Re-key a participant-level table by manifest file name.

    The file/participant lookup is derived from ``familymanifestdf`` (its
    "File Name" and "Participant ID" columns) rather than from a notebook
    global, so the function works on whatever manifest it is given.

    Parameters
    ----------
    familymanifestdf : pandas.DataFrame
        Manifest with "File Name" and "Participant ID" columns.
    bianary_table : pandas.DataFrame
        Table with a "Participant" column.

    Returns
    -------
    pandas.DataFrame
        ``bianary_table`` outer-merged with the lookup, with "Participant"
        dropped and a "FileName" column added. A participant with several
        files yields one row per file; rows without a partner on either side
        are kept with missing values.
    """
    file_lookup = (
        familymanifestdf[["File Name", "Participant ID"]]
        .drop_duplicates()
        .rename(columns={"File Name": "FileName", "Participant ID": "Participant"})
    )
    newnames = bianary_table.merge(file_lookup, on="Participant", how="outer")
    return newnames.drop(columns=['Participant'])
    
def filenames_to_participants(familymanifestdf, value_table):
    """Re-key a file-level table by participant.

    The file/participant lookup is derived from ``familymanifestdf`` (its
    "File Name" and "Participant ID" columns) rather than from a notebook
    global, so the function works on whatever manifest it is given.

    Parameters
    ----------
    familymanifestdf : pandas.DataFrame
        Manifest with "File Name" and "Participant ID" columns.
    value_table : pandas.DataFrame
        Table with a "FileName" column.

    Returns
    -------
    pandas.DataFrame
        ``value_table`` inner-merged with the lookup, with "FileName" dropped
        and a "Participant" column added. Rows whose file name is absent from
        the manifest are discarded.
    """
    file_lookup = (
        familymanifestdf[["File Name", "Participant ID"]]
        .drop_duplicates()
        .rename(columns={"File Name": "FileName", "Participant ID": "Participant"})
    )
    newnames = value_table.merge(file_lookup, on="FileName", how="inner")
    return newnames.drop(columns=['FileName'])
    

In [318]:
# Binary diagnosis (MONDO) table built from the Diagnoses sheet.
# NOTE(review): the leading space in the column name is kept as-is; presumably
# it matches the sheet header — confirm.
importantcol = " Diagnosis (MONDO)"
# Raw string: "\(" is not a valid escape sequence in a normal string literal.
spliton = r" \(MONDO:"
findcode = " (MONDO:"
appendname = "MONDO"
mondodf, mondobianary_table = createbinary(Diagnosesdf, importantcol, spliton, findcode, appendname)

In [319]:
# Binary phenotype (HPO) table built from the Phenotypes sheet.
importantcol = "Phenotype (HPO)"
# Raw string: "\(" is not a valid escape sequence in a normal string literal.
spliton = r" \(HP:"
findcode = " (HP:"
appendname = "HP"
# Pass the Phenotypes sheet explicitly; `metadatadf` is only a parameter name
# inside createbinary and is not defined at notebook level on a fresh kernel.
HPdf, HPbianary_table = createbinary(Phenotypesdf, importantcol, spliton, findcode, appendname)

In [320]:
# Expose the participant index as an ordinary "Participant" column.
HPbianary_table = HPbianary_table.assign(Participant=HPbianary_table.index)
mondobianary_table = mondobianary_table.assign(Participant=mondobianary_table.index)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [321]:
# Replace the participant index with a plain positional one and keep every
# column except "Participant ID".
HPbianary_table = (
    HPbianary_table
    .reset_index(drop=True)
    .pipe(lambda frame: frame[[name for name in frame.columns if name != "Participant ID"]])
)
mondobianary_table = (
    mondobianary_table
    .reset_index(drop=True)
    .pipe(lambda frame: frame[[name for name in frame.columns if name != "Participant ID"]])
)


In [322]:
# Participant-keyed outputs: the term summaries and the binary tables.
HPdf.to_csv(f"{outdir}full_HP_counts_colnames.csv")
HPbianary_table.to_csv(f"{outdir}full_HP_binary_attribute.csv")
mondodf.to_csv(f"{outdir}full_MONDO_counts_colnames.csv")
mondobianary_table.to_csv(f"{outdir}full_MONDO_binary_attribute.csv")

In [323]:
# Caution: participants_to_filenames puts each participant into the boolean
# table once per file we hold for them, and for most participants we have at
# least 3 files.
# It is therefore your job to make sure you only use one file per sample in AREA!
# Here that was done by keeping only one RNA-seq counts entry per person in the
# counts files that were built.

HPbianary_table_filename = participants_to_filenames(familymanifestdf, HPbianary_table)
HPbianary_table_filename.to_csv(f"{outdir}filename_HP_binary_attribute.csv")
MONDObianary_table_filename = participants_to_filenames(familymanifestdf, mondobianary_table)
MONDObianary_table_filename.to_csv(f"{outdir}filename_MONDO_binary_attribute.csv")

In [324]:
# Small three-phenotype subsets, keyed by participant and by file name.
mini_HP = HPbianary_table.loc[
    :, ["HP_Obstructive_sleep_apnea", "HP_Obesity", "HP_Depression", "Participant"]
]
mini_HP.to_csv(f"{outdir}mini_HP_binary_attribute.csv")
mini_filenameHP = HPbianary_table_filename.loc[
    :, ["HP_Obstructive_sleep_apnea", "HP_Obesity", "HP_Depression", "FileName"]
]
mini_filenameHP.to_csv(f"{outdir}minifilename_HP_binary_attribute.csv")

# Now make the rank files

In [325]:
# Disabled cell, kept for reference: it would read tpm_file, transpose it so
# that samples become rows, add a FileName column and write
# <countsetname>_tpm.csv. Nothing below is executed.
#tpmdf = pd.read_csv(tpm_file, index_col=0)
#value_df = tpmdf.T
#value_df["FileName"] = value_df.index
#value_df = value_df.reset_index(drop=True)
#value_df = value_df[[colname for colname in value_df.columns if colname!="sample"]]
#value_df.to_csv(outdir+countsetname+"_tpm.csv")


In [326]:
# Read the count table, flip it so each input column becomes a row, and carry
# the former column labels along in a trailing FileName column.
countdf = pd.read_csv(count_file, index_col=0)
value_df = (
    countdf.T
    .assign(FileName=lambda wide: wide.index)
    .reset_index(drop=True)
)
#value_df = value_df[[colname for colname in value_df.columns if colname!="sample"]]
value_df.to_csv(f"{outdir}{countsetname}_filename_normcounts.csv")


In [327]:
# Disabled check, kept for reference: it would list the value_df columns whose
# name contains "ENSG00000001167" and count the distinct ones.
#vcolnames = [colname for colname in value_df.columns if colname.find("ENSG00000001167")>-1]
#
#len(list(set(vcolnames)))
#vcolnames

# How many files are there per participant?

In [328]:
# Distinct manifest files per participant.
unique_file_counts_per_Participant = (
    switchdf.groupby("Participant")["FileName"]
    .nunique()
    .rename("UniqueFileCount")
    .reset_index()
)

# Same count, restricted to files that also appear in the RNA count table.
switchdf_withRNA = pd.merge(switchdf, value_df, on="FileName", how="inner")

unique_file_counts_per_Participant_withRNA = (
    switchdf_withRNA.groupby("Participant")["FileName"]
    .nunique()
    .rename("UniqueFileCount")
    .reset_index()
)


In [329]:
# Histogram of files per participant, counting only files with RNA counts.
fig = px.histogram(
    data_frame=unique_file_counts_per_Participant_withRNA,
    x="UniqueFileCount",
)
fig.show()

In [330]:
# Histogram of files per participant across the whole manifest.
fig = px.histogram(
    data_frame=unique_file_counts_per_Participant,
    x="UniqueFileCount",
)
fig.show()

In [331]:
# Re-key a copy of the count table by participant and save it.
value_df_participants = filenames_to_participants(familymanifestdf, value_df.copy())
value_df_participants.to_csv(f"{outdir}{countsetname}_participants_normcounts.csv")

In [332]:
# Display the participant-keyed count table (pandas truncates the rendering).
value_df_participants

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000284740.1,ENSG00000284741.1,ENSG00000284742.1,ENSG00000284743.1,ENSG00000284744.1,ENSG00000284745.1,ENSG00000284746.1,ENSG00000284747.1,ENSG00000284748.1,Participant
0,29.516858,0.0,677.204855,874.335011,150.102000,26338.795021,95.040010,578.754270,867.085877,1364.030438,...,0.857093,0.0,0.0,0.0,0.000000,0.0,0.0,63.438383,15.894703,pt-mv6fmz83
1,47.883863,0.0,878.701066,930.485510,183.874029,15675.943089,78.431266,537.627779,961.319230,1498.566947,...,0.796807,0.0,0.0,0.0,5.550311,0.0,0.0,54.183413,5.539728,pt-pb67ypn9
2,38.107012,0.0,822.215303,899.600053,188.989659,20846.982160,251.590331,643.061258,685.564326,1517.044532,...,1.640181,0.0,0.0,0.0,7.626614,0.0,0.0,51.770695,2.855719,pt-ekjjfcr8
3,7.904249,0.0,488.443725,818.246942,73.996124,33611.415617,109.850041,851.848248,508.127917,1302.990608,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,48.390434,0.000000,pt-yky3kj4e
4,17.681451,0.0,929.728834,918.153598,192.989845,24482.400404,72.551148,650.752961,1109.044383,1457.659293,...,0.000000,0.0,0.0,0.0,5.468876,0.0,0.0,35.101428,1.565382,pt-bkgw6a6n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,17.242527,0.0,1031.678033,883.659222,167.326793,33234.243367,128.467550,997.792260,483.279285,1318.284839,...,13.107993,0.0,0.0,0.0,18.076245,0.0,0.0,52.627810,4.519997,pt-e3jrm51z
396,41.884893,0.0,963.949006,935.839347,217.069664,23896.117663,113.580935,1096.749424,875.008737,1561.746532,...,2.525113,0.0,0.0,0.0,8.818780,0.0,0.0,45.538950,8.077822,pt-be1stwmv
397,36.321957,0.0,729.360879,954.755252,190.266974,28283.049573,70.944267,864.300538,721.002235,1390.776898,...,20.806912,0.0,0.0,0.0,1.051644,0.0,0.0,72.997090,4.220290,pt-jqfvet2k
398,30.222321,0.0,831.888806,939.305255,125.391507,27279.069415,131.780073,509.917600,775.287235,1353.484688,...,0.000000,0.0,0.0,0.0,2.059527,0.0,0.0,54.945480,4.101312,pt-gx8psrwq


In [333]:

# Three-gene subsets of the count table: first keyed by file name ...
mini_value_df = value_df.loc[
    :, ["FileName", "ENSG00000124813.20", "ENSG00000159216.18", "ENSG00000001167.14"]
]
mini_value_df.to_csv(f"{outdir}miniRNAvalues_filename_normcounts.csv")
# ... then keyed by participant (the same variable name is reused).
mini_value_df = value_df_participants.loc[
    :, ["Participant", "ENSG00000124813.20", "ENSG00000159216.18", "ENSG00000001167.14"]
]
mini_value_df.to_csv(f"{outdir}miniRNAvalues_participants_normcounts.csv")

In [334]:
# Compare the file names that have RNA counts with the file names present in
# the filename-keyed MONDO table.
fnwithRNA = list(value_df["FileName"])
fnwithconditions = list(MONDObianary_table_filename["FileName"])

# A set makes each membership test constant-time; testing against the list
# rescans it for every file name.
fnwithconditions_lookup = set(fnwithconditions)

fnwithRNA_notconditionstable = [fn for fn in fnwithRNA
                                    if fn not in fnwithconditions_lookup]

fnwithRNA_conditions = [fn for fn in fnwithRNA
                                    if fn in fnwithconditions_lookup]

# From here on fnwithRNA is a one-column DataFrame (column 0), not a list.
fnwithRNA = pd.DataFrame(fnwithRNA)
fnwithRNA.to_csv(outdir+"include_filename_with_RNA.csv", header=False, index=False)

print("fnwithRNA_notconditionstable", len(fnwithRNA_notconditionstable))
print("fnwithRNA_conditions", len(fnwithRNA_conditions))
print("fnwithRNA", len(fnwithRNA))
print("fnwithconditions", len(fnwithconditions))


fnwithRNA_notconditionstable 0
fnwithRNA_conditions 400
fnwithRNA 400
fnwithconditions 3797


In [335]:
# Compare the participants that have RNA counts with the participants present
# in the MONDO binary table.
participantswithRNA = list(value_df_participants["Participant"])
participantswithconditions = list(mondobianary_table["Participant"])

# A set makes each membership test constant-time; testing against the list
# rescans it for every participant.
participantswithconditions_lookup = set(participantswithconditions)

participantswithRNA_notconditionstable = [participant for participant in participantswithRNA
                                    if participant not in participantswithconditions_lookup]

participantswithRNA_conditions = [participant for participant in participantswithRNA
                                    if participant in participantswithconditions_lookup]

# From here on participantswithRNA is a one-column DataFrame (column 0), not a list.
participantswithRNA = pd.DataFrame(participantswithRNA)
participantswithRNA.to_csv(outdir+"include_participant_with_RNA.csv", header=False, index=False)

print("participantswithRNA_notconditionstable", len(participantswithRNA_notconditionstable))
print("participantswithRNA_conditions", len(participantswithRNA_conditions))
print("participantswithRNA", len(participantswithRNA))
print("participantswithconditions", len(participantswithconditions))


participantswithRNA_notconditionstable 0
participantswithRNA_conditions 400
participantswithRNA 400
participantswithconditions 996


# Which files and participants are T21

In [336]:
# Down-syndrome-related MONDO columns; a file whose values across these
# columns sum to zero is flagged "Disomic".
# NOTE(review): rows that are entirely missing in these columns (possible after
# the outer merge that built the filename table) would presumably also sum to
# zero and be flagged Disomic — confirm.
DS_conditions_cols = [
    "MONDO_Down_syndrome",
    "MONDO_mosaic_translocation_Down_syndrome",
    "MONDO_translocation_Down_syndrome",
    "MONDO_complete_trisomy_21",
    "MONDO_mosaic_trisomy_21",
]
whichT21fn = MONDObianary_table_filename.set_index("FileName")[DS_conditions_cols].copy()
whichT21fn["Disomic"] = whichT21fn[DS_conditions_cols].sum(axis=1).eq(0)
whichT21fn.to_csv(f"{outdir}whichT21_filename.csv")


In [337]:
# Participant-keyed version: a participant whose values across the
# Down-syndrome-related MONDO columns sum to zero is flagged "Disomic".
DS_conditions_cols = [
    "MONDO_Down_syndrome",
    "MONDO_mosaic_translocation_Down_syndrome",
    "MONDO_translocation_Down_syndrome",
    "MONDO_complete_trisomy_21",
    "MONDO_mosaic_trisomy_21",
]
whichT21p = mondobianary_table.set_index("Participant")[DS_conditions_cols].copy()
whichT21p["Disomic"] = whichT21p[DS_conditions_cols].sum(axis=1).eq(0)
whichT21p.to_csv(f"{outdir}whichT21_participant.csv")


# file for complete T21 participants and filenames

In [338]:
# Rows flagged as complete trisomy 21, by file name and by participant.
completeT21fn = whichT21fn.loc[whichT21fn["MONDO_complete_trisomy_21"].eq(1)]
completeT21p = whichT21p.loc[whichT21p["MONDO_complete_trisomy_21"].eq(1)]

# Participants with RNA counts that are also flagged complete T21.
participantswithRNAandcompleteT21 = participantswithRNA.loc[
    participantswithRNA[0].isin(completeT21p.index)
]
participantswithRNAandcompleteT21.to_csv(
    f"{outdir}include_participants_with_RNA_and_completeT21.csv", header=False, index=False
)

# File names with RNA counts that are also flagged complete T21.
fnwithRNAandcompleteT21 = fnwithRNA.loc[fnwithRNA[0].isin(completeT21fn.index)]
fnwithRNAandcompleteT21.to_csv(
    f"{outdir}include_filenames_with_RNA_and_completeT21.csv", header=False, index=False
)


In [339]:
# Chr21 genes

In [340]:
# Load the gene annotation table from genefile and display it.
# NOTE(review): the rendered frame shows "Unnamed: 0.1" / "Unnamed: 0" columns,
# which look like saved index columns being read back as data — confirm
# whether they are needed.
genesdf = pd.read_csv(genefile)
genesdf

Unnamed: 0.1,Unnamed: 0,seqnames,start,end,width,strand,source,type,gene_id,gene_version,gene_name,gene_source,gene_biotype
0,1,1,11869,14409,2541,+,havana,gene,ENSG00000223972,5,DDX11L1,havana,transcribed_unprocessed_pseudogene
1,2,1,14404,29570,15167,-,havana,gene,ENSG00000227232,5,WASH7P,havana,unprocessed_pseudogene
2,3,1,17369,17436,68,-,mirbase,gene,ENSG00000278267,1,MIR6859-1,mirbase,miRNA
3,4,1,29554,31109,1556,+,havana,gene,ENSG00000243485,5,MIR1302-2HG,havana,lincRNA
4,5,1,30366,30503,138,+,mirbase,gene,ENSG00000284332,1,MIR1302-2,mirbase,miRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58730,58731,KI270711.1,4612,29626,25015,-,ensembl,gene,ENSG00000271254,6,AC240274.1,ensembl,protein_coding
58731,58732,KI270713.1,21861,22024,164,-,ensembl,gene,ENSG00000275405,1,RF00003,ensembl,snRNA
58732,58733,KI270713.1,30437,30580,144,-,ensembl,gene,ENSG00000275987,1,RF00003,ensembl,snRNA
58733,58734,KI270713.1,31698,32528,831,-,ensembl,gene,ENSG00000277475,1,AC213203.2,ensembl,protein_coding


In [341]:
# Keep only the annotation rows whose seqnames value is the string "21".
genesdfchr21 = genesdf.loc[genesdf["seqnames"].eq("21")]
genesdfchr21.shape

(833, 13)

In [342]:
# Write the chromosome-21 gene ids, one per line, with no header or index.
genenamechr21 = genesdfchr21.loc[:, ["gene_id"]].copy()
genenamechr21.to_csv(f"{outdir}include_rank_file_chr21only.csv", header=False, index=False)

In [343]:
# Every count-table column except FileName is a versioned gene id ("<id>.<version>").
geneids_RNAdf = pd.DataFrame(
    {"gene_id_version": [gid for gid in value_df.columns if gid != "FileName"]}
)
# Split at the first "." into the bare gene id and its version.
geneids_RNAdf[['gene_id', 'RNA_gene_version']] = geneids_RNAdf['gene_id_version'].str.split('.', n=1, expand=True)
# Keep only genes that are in the chromosome-21 annotation subset.
geneids_RNAdf = pd.merge(geneids_RNAdf, genesdfchr21, how="inner", on="gene_id")

In [344]:
# Display the versioned ids of the count-table genes matched to chromosome 21.
geneids_RNAdf["gene_id_version"]

0      ENSG00000141956.13
1      ENSG00000141959.16
2       ENSG00000142149.8
3      ENSG00000142156.14
4      ENSG00000142166.12
              ...        
826     ENSG00000283051.1
827     ENSG00000283300.1
828     ENSG00000283904.1
829     ENSG00000284448.1
830     ENSG00000284550.1
Name: gene_id_version, Length: 831, dtype: object

In [345]:
# Write the versioned chromosome-21 gene ids, one per line, with no header or index.
geneids_RNAdf.loc[:, ["gene_id_version"]].to_csv(
    f"{outdir}include_rank_cols_chr21.csv", header=False, index=False
)
