In [2]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from IPython.display import display, HTML
import json

## Reading Clinical Annotation

In [3]:
df = pd.read_csv('04.14.csv', low_memory=False)
df = df.loc[~df.isnull().all(axis=1)]

gene_cols = range(30,len(df.columns))
gene_names = df.columns[gene_cols].tolist()
gene_names.extend(["DNMT3A", "CEBPA", "FLT3", "NPM1"])

In [4]:
gene_names

['ABL1',
 'ASXL1',
 'ATRX',
 'BCORL1',
 'BCOR',
 'BRAF',
 'CALR',
 'CBLB',
 'CBLC',
 'CBL',
 'CDKN2A',
 'CEBPAResult',
 'CSF3R',
 'CUX1',
 'CXCR4',
 'DDX41',
 'DNMT3AResult',
 'ETNK1',
 'EZH2',
 'FBXW7',
 'FGFR1',
 'FLT3-ITD',
 'FLT3TKD',
 'GATA1',
 'GATA2',
 'HRAS',
 'IDH1',
 'IDH2',
 'JAK',
 'KDM6A',
 'KIT',
 'KMT2A',
 'KRAS',
 'MAP2K1',
 'MPL',
 'MYD88',
 'NF1',
 'NOTCH1',
 'NPM1Result',
 'NRAS',
 'PDGFRA',
 'PDGFRB',
 'PHF6',
 'PPM1D',
 'PTEN',
 'PTPN11',
 'RAD21',
 'RB1',
 'RUNX1',
 'SAMD9L',
 'SAMD9',
 'SETBP1',
 'SF3B1',
 'SH2B3',
 'SMC1A',
 'SMC3',
 'SRSF2',
 'STAG2',
 'STAT3',
 'STAT5B',
 'TET2',
 'TP53',
 'U2AF1',
 'U2AF2',
 'WT1',
 'ZRSR2',
 'DNMT3A',
 'CEBPA',
 'FLT3',
 'NPM1']

## Reading Annovar Output Files


In [10]:
samples = os.listdir('output_chromoseq')

combined_df = pd.DataFrame()

for sample in samples:
    df = pd.read_csv(f'output_chromoseq/{sample}/{sample}.combined_variants.hg38_multianno.txt', sep='\t')
    df['sampleID'] = sample
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# filter based on interested genes
combined_df = combined_df.loc[combined_df["Gene.refGene"].isin(gene_names)]

# put sampleID as first column
cols = combined_df.columns.tolist()
combined_df = combined_df[[cols[-1]] + cols[:-1]]


In [11]:
combined_df['sampleID'].value_counts()

sampleID
RO20033    28
RO10001    25
RO51053    24
RO51061    24
RO50990    17
Name: count, dtype: int64

In [13]:
combined_df.to_csv('Combined_Annovar_Chromoseq_annotation.csv', header=True, index=False)

## Make Chromoseq input Json file

In [31]:
import json as js

path = "/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/"
table = pd.DataFrame(columns=["Cram", "CramIndex", "Name"])

files = os.listdir("X:"+path)
files = [i.split('.')[0] for i in files]
files = list(set(files))

for sample in files:
    SpecimenId = sample.split('_')[0]
    table.loc[len(table)] = [str("/fh"+path+sample+".unmapped.recal.bam"), str("/fh"+path+sample+".unmapped.recal.bai"), str(sample) ]

table.head()

Unnamed: 0,Cram,CramIndex,Name
0,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5
1,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5
2,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5
3,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20030_JR-WGS_230626_A00613_0563_BHNJWHDSX5
4,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,/fh/scratch/delete90/paguirigan_a/sanaz/varian...,RO20125_JR-WGS_230626_A00613_0562_AHNJWFDSX5


In [66]:
json_string = js.dumps(table.to_dict(orient='records'), indent=4)
print(json_string)

[
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bai",
        "Name": "RO20038_JR-WGS_230626_A00613_0563_BHNJWHDSX5"
    },
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5.bai",
        "Name": "RO50884_JR-WGS_230626_A00613_0563_BHNJWHDSX5"
    },
    {
        "Cram": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5.bam",
        "CramIndex": "/fh/scratch/delete90/paguirigan_a/sanaz/variant_calling/aligned_bam/RO50870_JR-WGS_230626_A00613_0562_AHNJWFDSX5.bai",
        "

## Make prepareBAM WDL JSON

In [95]:
os.listdir('CramToFastqToVCReady')

['CRAMToVCReady.wdl', 'cram_files.txt', 'inputs_small.json', 'options.json']

In [31]:
with open('s3_cram_files_path.txt', 'r') as f:
    data = f.read().splitlines()
data =[i.strip(',').strip('\"') for i in data]

In [97]:
df = pd.DataFrame(data)
split_columns = df[0].str.split('/', expand=True)
for i in range(len(split_columns.columns)):
    df[f'split_{i}'] = split_columns[i]

In [98]:
result_dict = df.groupby("split_7")[0].agg(list).to_dict()
json_result = json.dumps(result_dict, indent=4)

In [94]:
print(json_result)

{
    "MJW922010": [
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230626_A00613_0562_AHNJWFDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230626_A00613_0563_BHNJWHDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230629_A00613_0565_AHNJY5DSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/MJW922010/MJW_9_2_2010_JR-WGS_230705_A00613_0568_BHW3JTDSX5.unmapped.cram"
    ],
    "RO10001": [
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b47c763-0623-4cf4-bc85-1ee6625e097f/individualRGCrams/RO10001/RO10001_JR-WGS_230626_A00613_0562_AHNJWFDSX5.unmapped.cram",
        "s3://fh-pi-paguirigan-a-eco/preconcWGS/JR/6b