In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from IPython.display import display, HTML
import json

## Reading Clinical Annotation

In [21]:
df = pd.read_csv('04.14.csv', low_memory=False)
df = df.loc[~df.isnull().all(axis=1)]

gene_cols = range(30,len(df.columns))
gene_names = df.columns[gene_cols].tolist()
gene_names.extend(["DNMT3A", "CEBPA", "FLT3", "NPM1"])

In [20]:
gene_names

['ABL1',
 'ASXL1',
 'ATRX',
 'BCORL1',
 'BCOR',
 'BRAF',
 'CALR',
 'CBLB',
 'CBLC',
 'CBL',
 'CDKN2A',
 'CEBPAResult',
 'CSF3R',
 'CUX1',
 'CXCR4',
 'DDX41',
 'DNMT3AResult',
 'ETNK1',
 'EZH2',
 'FBXW7',
 'FGFR1',
 'FLT3-ITD',
 'FLT3TKD',
 'GATA1',
 'GATA2',
 'HRAS',
 'IDH1',
 'IDH2',
 'JAK',
 'KDM6A',
 'KIT',
 'KMT2A',
 'KRAS',
 'MAP2K1',
 'MPL',
 'MYD88',
 'NF1',
 'NOTCH1',
 'NPM1Result',
 'NRAS',
 'PDGFRA',
 'PDGFRB',
 'PHF6',
 'PPM1D',
 'PTEN',
 'PTPN11',
 'RAD21',
 'RB1',
 'RUNX1',
 'SAMD9L',
 'SAMD9',
 'SETBP1',
 'SF3B1',
 'SH2B3',
 'SMC1A',
 'SMC3',
 'SRSF2',
 'STAG2',
 'STAT3',
 'STAT5B',
 'TET2',
 'TP53',
 'U2AF1',
 'U2AF2',
 'WT1',
 'ZRSR2',
 'DNMT3A',
 'CEBPA',
 'FLT3',
 'NPM1']

## Reading Annovar Output Files


In [24]:
samples = os.listdir('output')

combined_df = pd.DataFrame()

for sample in samples:
    df = pd.read_csv(f'output/{sample}/{sample}.combined_variants.hg38_multianno.txt', sep='\t')
    df['sampleID'] = sample
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# filter based on interested genes
combined_df = combined_df.loc[combined_df["Gene.refGene"].isin(gene_names)]


In [26]:
combined_df.head()

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,knownGene,cosmic70,esp6500siv2_all,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,AF,AF_popmax,AF_male,AF_female,AF_raw,AF_afr,AF_sas,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13,sampleID
0,chr1,36471458,36471458,A,G,exonic,CSF3R,.,synonymous SNV,"CSF3R:NM_000760:exon10:c.T1260C:p.T420T,CSF3R:...",.,.,0.6086,249885,not_specified,MedGen:CN169374,"criteria_provided,_multiple_submitters,_no_con...",Benign,0.5536,0.7262,0.5647,0.5405,0.5536,0.7262,0.6844,0.3967,0.4081,0.5551,0.6063,0.5446,0.5659,0.7286,0.726,0.7273,0.7318,1.0,.,57,chr1,36471458,.,A,G,.,PASS,ADP=57;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:57:57:0:57:100%:6.4572E-34:0:36:0:0:29...,RO10001
1,chr1,36471464,36471464,A,G,exonic,CSF3R,.,synonymous SNV,"CSF3R:NM_000760:exon10:c.T1254C:p.R418R,CSF3R:...",.,.,0.2997,249886,not_specified,MedGen:CN169374,"criteria_provided,_multiple_submitters,_no_con...",Benign,0.3126,0.4064,0.3242,0.2988,0.3127,0.1963,0.4064,0.2555,0.1011,0.3453,0.3427,0.3554,0.3251,0.4064,0.4064,0.4062,0.4103,0.5,.,57,chr1,36471464,.,A,G,.,PASS,ADP=57;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:108:58:57:28:29:50.88%:1.4722E-11:35:34:12...,RO10001
2,chr2,25234374,25234374,G,T,exonic,DNMT3A,.,nonsynonymous SNV,"DNMT3A:NM_001320893:exon18:c.C2188A:p.R730S,DN...",.,ID=COSM87001;OCCURENCE=18(haematopoietic_and_l...,.,362763,Acute_myeloid_leukemia,"Human_Phenotype_Ontology:HP:0004808,MeSH:D0154...",no_assertion_criteria_provided,Pathogenic,7.964e-06,8.801e-06,7.366e-06,8.667e-06,1.193e-05,0.0,0.0,0.0,0.0,8.801e-06,0.0,9.927e-05,0.0,8.966e-06,1.118e-05,9.742e-06,.,0.5,.,51,chr2,25234374,.,G,T,.,PASS,ADP=51;WT=0;HET=1;HOM=0;NC=0;MDS_AC=11;TCGA_AC=7,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:101:51:51:24:27:52.94%:6.451E-11:34:35:10:...,RO10001
3,chr2,25246633,25246633,C,T,exonic,DNMT3A,.,synonymous SNV,"DNMT3A:NM_001320893:exon5:c.G810A:p.L270L,DNMT...",.,ID=COSM4001621;OCCURENCE=1(thyroid),0.2049,.,.,.,.,.,0.19,0.3155,0.1828,0.1985,0.1896,0.2952,0.1172,0.2067,0.3155,0.1642,0.2159,0.1961,0.1859,0.3155,0.3147,0.3149,0.3171,0.5,.,66,chr2,25246633,.,C,T,.,PASS,ADP=66;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:142:66:66:29:37:56.06%:5.6872E-15:33:33:14...,RO10001
4,chr2,197393071,197393071,T,C,exonic,SF3B1,.,synonymous SNV,SF3B1:NM_012433:exon24:c.A3657G:p.V1219V,.,ID=COSM3757859;OCCURENCE=1(large_intestine),0.7360,.,.,.,.,.,0.6584,0.8405,0.6697,0.6451,0.6583,0.8405,0.7555,0.4958,0.5326,0.681,0.5766,0.7534,0.6982,0.8415,0.8405,0.8417,0.8436,0.5,.,75,chr2,197393071,.,T,C,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:75:75:39:36:48%:5.4147E-14:33:31:15:24...,RO10001


In [30]:
combined_df.to_csv('Combined_Annovar_Chromoseq_annotation.csv', header=True, index=False)

## Make Chromoseq input Json file

In [31]:
import json as js

path = "/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/"
table = pd.DataFrame(columns=["Cram", "CramIndex", "Name"])

files = os.listdir("X:"+path)
files = [i.split('.')[0] for i in files]
files = list(set(files))

for sample in files:
    SpecimenId = sample.split('_')[0]
    table.loc[len(table)] = [str("/fh"+path+sample+".unmapped.recal.bam"), str("/fh"+path+sample+".unmapped.recal.bai"), str(sample) ]

table.head()

Unnamed: 0,Cram,CramIndex,Name
0,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5
1,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5
2,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5
3,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20030_JR-WGS_230626_A00613_0563_BHNJWHDSX5
4,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20125_JR-WGS_230626_A00613_0562_AHNJWFDSX5


In [66]:
json_string = js.dumps(table.to_dict(orient='records'), indent=4)
print(json_string)

[
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bai",
        "Name": "RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5"
    },
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bai",
        "Name": "RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5"
    },
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5.bai",
        "

## Make VCReady WDL JSON

In [95]:
os.listdir('CramToFastqToVCReady')

['CRAMToVCReady.wdl', 'cram_files.txt', 'inputs_small.json', 'options.json']

In [31]:
with open('s3_cram_files_path.txt', 'r') as f:
    data = f.read().splitlines()
data =[i.strip(',').strip('\"') for i in data]

In [97]:
df = pd.DataFrame(data)
split_columns = df[0].str.split('/', expand=True)
for i in range(len(split_columns.columns)):
    df[f'split_{i}'] = split_columns[i]

In [98]:
result_dict = df.groupby("split_7")[0].agg(list).to_dict()
json_result = json.dumps(result_dict, indent=4)

In [94]:
print(json_result)

{
    "MJW922010": [
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230626_A00613_0562_AHNJWFDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230626_A00613_0563_BHNJWHDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230629_A00613_0565_AHNJY5DSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230705_A00613_0568_BHW3JTDSX5.unmapped.cram"
    ],
    "RO10001": [
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/RO10001/RO10001_JR-WGS_230626_A00613_0562_AHNJWFDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b

In [75]:
for k, val in gp:
    val = val[8]
    break

In [78]:
result_dict

{('MJW922010',): 0    MJW_9_2_2010_JR-WGS_230626_A00613_0562_AHNJWFD...
 1    MJW_9_2_2010_JR-WGS_230626_A00613_0563_BHNJWHD...
 2    MJW_9_2_2010_JR-WGS_230629_A00613_0565_AHNJY5D...
 3    MJW_9_2_2010_JR-WGS_230705_A00613_0568_BHW3JTD...
 Name: 8, dtype: object,
 ('RO10001',): 4    RO10001_JR-WGS_230626_A00613_0562_AHNJWFDSX5.u...
 5    RO10001_JR-WGS_230626_A00613_0563_BHNJWHDSX5.u...
 6    RO10001_JR-WGS_230629_A00613_0565_AHNJY5DSX5.u...
 7    RO10001_JR-WGS_230705_A00613_0568_BHW3JTDSX5.u...
 Name: 8, dtype: object,
 ('RO20030',): 8     RO20030_JR-WGS_230626_A00613_0562_AHNJWFDSX5.u...
 9     RO20030_JR-WGS_230626_A00613_0563_BHNJWHDSX5.u...
 10    RO20030_JR-WGS_230629_A00613_0565_AHNJY5DSX5.u...
 11    RO20030_JR-WGS_230705_A00613_0568_BHW3JTDSX5.u...
 Name: 8, dtype: object,
 ('RO20033',): 12    RO20033_JR-WGS_230626_A00613_0562_AHNJWFDSX5.u...
 13    RO20033_JR-WGS_230626_A00613_0563_BHNJWHDSX5.u...
 14    RO20033_JR-WGS_230629_A00613_0565_AHNJY5DSX5.u...
 Name: 8, dtype: obj