In [5]:
import pysam
import pandas as pd
from collections import OrderedDict
from celescope.tools import utils

In [10]:
AA_DICT = {
    'Gly' : 'G',
    'Ala' : 'A',
    'Val' : 'V',
    'Leu' : 'L',
    'Ile' : 'I',
    'Phe' : 'F',
    'Trp' : 'W',
    'Tyr' : 'Y',
    'Asp' : 'D',
    'Asn' : 'N',
    'Glu' : 'E',
    'Lys' : 'K',
    'Gln' : 'Q',
    'Met' : 'M',
    'Ser' : 'S',
    'Thr' : 'T',
    'Cys' : 'C',
    'Pro' : 'P',
    'His' : 'H',
    'Arg' : 'R',
}

In [6]:
def parse_vcf_to_df(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID', 'CID')):
    """
    Read cols and infos into pandas df
    """
    vcf = pysam.VariantFile(vcf_file)
    df = pd.DataFrame(columns=[col.capitalize() for col in cols] + infos)
    rec_dict = {}
    for rec in vcf.fetch():

        for col in cols:
            rec_dict[col.capitalize()] = getattr(rec, col)
            if col == 'alleles':
                rec_dict['Alleles'] = '-'.join(rec_dict['Alleles'])

        for info in infos:
            rec_dict[info] = rec.info[info]

        '''
        rec_dict['GT'] = [s['GT'] for s in rec.samples.values()][0]
        rec_dict['GT'] = [str(item) for item in rec_dict['GT']]
        rec_dict['GT'] = '/'.join(rec_dict['GT'])
        '''
        df_new = pd.DataFrame(rec_dict, index=[0])
        df = pd.concat([df, df_new])

    vcf.close()
    df.reset_index(drop=True, inplace=True)
    return df

In [16]:
def parse_variant_ann(variant_ann_file):
    """
    Args:
        variant_ann_file: variant annotation file from snpEff.
        n_entry: number of entries to read. If None, read all. Avoid extra lines like "NOTICE: Among 555 different variants..."
    """
    gene_list = []
    mRNA_list = []
    protein_list = []
    with open(variant_ann_file) as f:
        for line in f.readlines():
            if not line.startswith("#"):
                info = line.split('\t')[7]
                anns = info.split("|")
                gene = anns[3]
                gene_list.append(gene)
            
                tmp1, tmp2 = [], []
                for ann in anns:
                    if ann.startswith("c."):
                        exon_loc = anns[anns.index(ann) - 1].split('/')[0]
                        exon = ann.strip("c.")
                        exon = f"exon{exon_loc}:{exon}"
                        if exon not in tmp1:
                            tmp1.append(exon)

                    if ann.startswith("p."):
                        protein = ann.strip("p.")
                        for i in AA_DICT:
                            protein = protein.replace(i, AA_DICT[i])
                        if protein not in tmp2:
                            tmp2.append(protein)
                        
                mRNA_list.append(','.join(tmp1))
                protein_list.append(','.join(tmp2))

    return (gene_list, mRNA_list, protein_list)

In [4]:
df_vcf = parse_vcf_to_df("/SGRNJ06/randd/USER/cjj/celedev/snp/20221230/CS_1/08.filter_snp/CS_1_filtered.vcf", infos=[])

In [8]:
df_vcf

Unnamed: 0,Chrom,Pos,Alleles
0,11,32388003,AACACAC-A
1,11,32388811,T-A
2,11,32388834,T-C
3,11,32388970,T-C
4,11,32396284,A-G
...,...,...,...
1327,9,130874972,C-A
1328,9,130874973,T-C
1329,9,130875244,C-T
1330,9,130884037,G-A


In [17]:
df_anno = parse_variant_ann("/SGRNJ06/randd/USER/cjj/celedev/snp/20221230/CS_1/09.analysis_snp/snpEff/variants_ann.vcf")
df_anno

In [21]:
    with open("/SGRNJ06/randd/USER/cjj/celedev/snp/20221230/CS_1/09.analysis_snp/snpEff/variants_ann.vcf") as f:
        for line in f.readlines():
            print(line)
            #if not line.startswith("#"):

In [27]:
f = open("/SGRNJ06/randd/USER/cjj/celedev/snp/20221230/CS_1/09.analysis_snp/snpEff/variants_ann.vcf",'r')
while True:
    line = f.readline().strip()
    print(line)

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##bcftoolsVersion=1.12+htslib-1.12
##bcftoolsCommand=mpileup -f /SGRNJ03/randd/public/genome/rna/hs_ensembl_99/Homo_sapiens.GRCh38.dna.primary_assembly.fa --threads 4 --annotate DP,AD -d 100000000 -o .//test1/07.variant_calling/test1_raw.bcf --regions-file /SGRNJ06/randd/USER/cjj/Celescope/refactor_trust/CeleScope/celescope/data/snp/panel/lung_1.bed .//test1/07.variant_calling/test1_splitN.bam
##reference=file:///SGRNJ03/randd/public/genome/rna/hs_ensembl_99/Homo_sapiens.GRCh38.dna.primary_assembly.fa
##contig=<ID=1,length=248956422>
##contig=<ID=10,length=133797422>
##contig=<ID=11,length=135086622>
##contig=<ID=12,length=133275309>
##contig=<ID=13,length=114364328>
##contig=<ID=14,length=107043718>
##contig=<ID=15,length=101991189>
##contig=<ID=16,length=90338345>
##contig=<ID=17,length=83257441>
##contig=<ID=18,length=80373285>
##contig=<ID=19,length=58617616>
##contig=<ID=2,length=242193529>
##contig=<ID=20,le

KeyboardInterrupt: 

In [12]:
df_anno = parse_variant_ann("/SGRNJ06/randd/USER/cjj/snpeff/20221205/test1/09.analysis_snp/snpEff/variants_ann.vcf")

In [13]:
df_anno

(['PIK3CA',
  'EGFR',
  'EGFR',
  'EGFR-AS1',
  'EGFR-AS1',
  'EGFR',
  'EGFR',
  'EGFR',
  'BRAF',
  'BRAF',
  'BRAF',
  'BRAF'],
 ['exon10:1564G>A',
  'exon18:2134T>C,exon17:1999T>C',
  'exon18:2141dupA,exon17:2006dupA',
  '',
  '',
  'exon20:2361G>A,exon19:2226G>A',
  'exon20:2369C>T,exon19:2234C>T',
  'exon21:2573T>G,exon20:2438T>G',
  'exon16:1939T>G,exon15:1819T>G',
  'exon16:1929A>T,exon15:1809A>T',
  'exon16:1922A>T,exon15:1802A>T',
  'exon16:1914T>G,exon15:1794T>G'],
 ['E522K',
  'F712L,F667L',
  'I715fs,I670fs',
  '',
  '',
  'Q787Q,Q742Q',
  'T790M,T745M',
  'L858R,L813R',
  'S647A,S607A',
  'R643R,R603R',
  'K641I,K601I',
  'A638A,A598A'])

In [250]:
df_vcf["Gene"], df_vcf["mRNA"], df_vcf["Protein"] = df_anno[0], df_anno[1], df_anno[2]

In [251]:
df_vcf

Unnamed: 0,Chrom,Pos,Alleles,Gene,mRNA,Protein
0,3,179218234,G-A,PIK3CA,exon10:1564G>A,E522K
1,7,55173993,T-C,EGFR,"exon18:2134T>C,exon17:1999T>C","F712L,F667L"
2,7,55173995,C-CA,EGFR,"exon18:2141dupA,exon17:2006dupA","I715fs,I670fs"
3,7,55181290,C-G,EGFR-AS1,,
4,7,55181292,G-T,EGFR-AS1,,
5,7,55181370,G-A,EGFR,"exon20:2361G>A,exon19:2226G>A","Q787Q,Q742Q"
6,7,55181378,C-T,EGFR,"exon20:2369C>T,exon19:2234C>T","T790M,T745M"
7,7,55191822,T-G,EGFR,"exon21:2573T>G,exon20:2438T>G","L858R,L813R"
8,7,140753316,A-C,BRAF,"exon16:1939T>G,exon15:1819T>G","S647A,S607A"
9,7,140753326,T-A,BRAF,"exon16:1929A>T,exon15:1809A>T","R643R,R603R"


In [252]:
df_ncell = pd.read_csv("/SGRNJ06/randd/USER/cjj/snpeff/20221205/test1/09.analysis_snp/test1_variant_ncell.csv")
df_vcf = pd.concat([df_vcf, df_ncell], axis=1)

In [253]:
df_vcf

Unnamed: 0.1,Chrom,Pos,Alleles,Gene,mRNA,Protein,Unnamed: 0,0/0,0/1,1/1
0,3,179218234,G-A,PIK3CA,exon10:1564G>A,E522K,3_179218234,1,0,1
1,7,55173993,T-C,EGFR,"exon18:2134T>C,exon17:1999T>C","F712L,F667L",7_55173993,3,0,1
2,7,55173995,C-CA,EGFR,"exon18:2141dupA,exon17:2006dupA","I715fs,I670fs",7_55173995,2,0,1
3,7,55181290,C-G,EGFR-AS1,,,7_55181290,0,0,2
4,7,55181292,G-T,EGFR-AS1,,,7_55181292,0,0,1
5,7,55181370,G-A,EGFR,"exon20:2361G>A,exon19:2226G>A","Q787Q,Q742Q",7_55181370,0,1,4
6,7,55181378,C-T,EGFR,"exon20:2369C>T,exon19:2234C>T","T790M,T745M",7_55181378,0,1,4
7,7,55191822,T-G,EGFR,"exon21:2573T>G,exon20:2438T>G","L858R,L813R",7_55191822,1,0,4
8,7,140753316,A-C,BRAF,"exon16:1939T>G,exon15:1819T>G","S647A,S607A",7_140753316,1,0,1
9,7,140753326,T-A,BRAF,"exon16:1929A>T,exon15:1809A>T","R643R,R603R",7_140753326,1,0,1


In [254]:
gene_list = utils.get_gene_region_from_bed("lung_1")[0]

In [255]:
gene_list

{'BRAF', 'EGFR', 'HARS', 'KRAS', 'NRAS', 'PIK3CA'}

In [241]:
        cols = ["Chrom", "Pos", "Alleles", "Gene", "0/0", "0/1", "1/1", "mRNA", "Protein"]
        df_vcf = df_vcf[cols]
        df_vcf = df_vcf[df_vcf.Gene.isin(gene_list)]
        variant_table = df_vcf

In [256]:
variant_table

Unnamed: 0,Chrom,Pos,Alleles,Gene,0/0,0/1,1/1,mRNA,Protein
0,3,179218234,G-A,PIK3CA,1,0,1,exon10:1564G>A,E522K
1,7,55173993,T-C,EGFR,3,0,1,"exon18:2134T>C,exon17:1999T>C","F712L,F667L"
2,7,55173995,C-CA,EGFR,2,0,1,"exon18:2141dupA,exon17:2006dupA","I715fs,I670fs"
5,7,55181370,G-A,EGFR,0,1,4,"exon20:2361G>A,exon19:2226G>A","Q787Q,Q742Q"
6,7,55181378,C-T,EGFR,0,1,4,"exon20:2369C>T,exon19:2234C>T","T790M,T745M"
7,7,55191822,T-G,EGFR,1,0,4,"exon21:2573T>G,exon20:2438T>G","L858R,L813R"
8,7,140753316,A-C,BRAF,1,0,1,"exon16:1939T>G,exon15:1819T>G","S647A,S607A"
9,7,140753326,T-A,BRAF,1,0,1,"exon16:1929A>T,exon15:1809A>T","R643R,R603R"
10,7,140753333,T-A,BRAF,1,0,1,"exon16:1922A>T,exon15:1802A>T","K641I,K601I"
11,7,140753341,A-C,BRAF,1,0,1,"exon16:1914T>G,exon15:1794T>G","A638A,A598A"


In [259]:
variant_table.reset_index(drop=True, inplace=True)

In [218]:
variant_table["detail"] = "None"

In [263]:
variant_table

Unnamed: 0,Chrom,Pos,Alleles,Gene,0/0,0/1,1/1,mRNA,Protein
0,3,179218234,G-A,PIK3CA,1,0,1,exon10:1564G>A,E522K
1,7,55173993,T-C,EGFR,3,0,1,"exon18:2134T>C,exon17:1999T>C","F712L,F667L"
2,7,55173995,C-CA,EGFR,2,0,1,"exon18:2141dupA,exon17:2006dupA","I715fs,I670fs"
3,7,55181370,G-A,EGFR,0,1,4,"exon20:2361G>A,exon19:2226G>A","Q787Q,Q742Q"
4,7,55181378,C-T,EGFR,0,1,4,"exon20:2369C>T,exon19:2234C>T","T790M,T745M"
5,7,55191822,T-G,EGFR,1,0,4,"exon21:2573T>G,exon20:2438T>G","L858R,L813R"
6,7,140753316,A-C,BRAF,1,0,1,"exon16:1939T>G,exon15:1819T>G","S647A,S607A"
7,7,140753326,T-A,BRAF,1,0,1,"exon16:1929A>T,exon15:1809A>T","R643R,R603R"
8,7,140753333,T-A,BRAF,1,0,1,"exon16:1922A>T,exon15:1802A>T","K641I,K601I"
9,7,140753341,A-C,BRAF,1,0,1,"exon16:1914T>G,exon15:1794T>G","A638A,A598A"


In [None]:
def parse_variant_ann(variant_ann_file, n_entry=None):
    """
    Args:
        variant_ann_file: variant annotation file from snpEff.
        n_entry: number of entries to read. If None, read all. Avoid extra lines like "NOTICE: Among 555 different variants..."
    """
    gene_list = []
    mRNA_list = []
    protein_list = []
    df = pd.DataFrame(columns=['Gene', 'mRNA', 'Protein', 'COSMIC'])
    with open(variant_ann_file) as f:
        for line in f.readlines():
            if not line.startswith("#"):
                info = line.split('\t')[7]
                anns = info.split("|")
                gene = anns[3]
                gene_list.append(gene)
            
            l1, l2 = [], []
            for ann in anns:
                if ann.startswith("c."):
                    exon_loc = anns[anns.index(ann) - 1].split('/')[0]
                    exon = ann.strip("c.")
                    exon = f"exon{exon_loc}:{exon}"
                    l1.append(exon)

                if ann.startswith("p."):
                    protein = ann.strip("p.")
                    l2.append(protein)

            mRNA_list.append(','.join(l1))
            protein_list.append(','.join(l2))

    return (gene_list, mRNA_list, protein_list)

In [None]:
def parse_annovar(annovar_file, n_entry=None):
    """
    Args:
        n_entry: number of entries to read. If None, read all. Avoid extra lines like "NOTICE: Among 555 different variants..."
    """
    df = pd.DataFrame(columns=['Gene', 'mRNA', 'Protein', 'COSMIC'])
    with open(annovar_file, 'rt') as f:
        index = 0
        for line in f:
            index += 1
            if index == 1:
                continue
            attrs = line.split('\t')
            gene = attrs[6]
            func = attrs[5]
            if func == 'exonic':
                changes = attrs[9]
                cosmic = attrs[10]
                split_char = ','
            else:
                changes = attrs[7]
                cosmic = attrs[8]
                split_char = ';'
            change_list = list()
            for change in changes.split(split_char):
                change_attrs = change.split(':')
                mRNA = ''
                protein = ''
                for change_index in range(len(change_attrs)):
                    change_attr = change_attrs[change_index]
                    if change_attr.startswith('c.'):
                        base = change_attr.strip('c.')
                        exon = change_attrs[change_index - 1]
                        mRNA = f'{exon}:{base}'
                    if change_attr.startswith('p.'):
                        protein = change_attr.strip('p.')
                if not (mRNA, protein) in change_list:
                    change_list.append((mRNA, protein))
            combine = [','.join(item) for item in list(zip(*change_list))]
            mRNA = combine[0]
            protein = combine[1]
            df_new = pd.DataFrame([[gene, mRNA, protein, cosmic]], columns=['Gene', 'mRNA', 'Protein', 'COSMIC'], index=[0])
            df = pd.concat([df, df_new])
    df.reset_index(drop=True, inplace=True)
    if n_entry:
        df = df.head(n_entry)

    return df

In [176]:
AA_DICT = {
    'Gly' : 'G',
    'Ala' : 'A',
    'Val' : 'V',
    'Leu' : 'L',
    'Ile' : 'I',
    'Phe' : 'F',
    'Trp' : 'W',
    'Tyr' : 'Y',
    'Asp' : 'D',
    'Asn' : 'N',
    'Glu' : 'E',
    'Lys' : 'K',
    'Gln' : 'Q',
    'Met' : 'M',
    'Ser' : 'S',
    'Thr' : 'T',
    'Cys' : 'C',
    'Pro' : 'P',
    'His' : 'H',
    'Arg' : 'R',
}

In [223]:
    gene_list = []
    mRNA_list = []
    protein_list = []
    with open("/SGRNJ06/randd/USER/cjj/snpeff/20221205/test1/08.filter_snp/test2.vcf") as f:
        for line in f.readlines():
            if not line.startswith("#"):
                info = line.split('\t')[7]
                anns = info.split("|")
                gene = anns[3]
                gene_list.append(gene)
            
                print(info)


DP=144;VDB=0;SGB=99.7196;RPB=0.998513;MQB=1;BQB=0.994064;MQ0F=0;AC=2;AN=4;DP4=5,0,139,0;MQ=60;REF_T=10;ALT_T=46;ANN=A|missense_variant|MODERATE|PIK3CA|ENSG00000121879|transcript|ENST00000263967.4|protein_coding|10/21|c.1564G>A|p.Glu522Lys|1887/9259|1564/3207|522/1068||,A|missense_variant|MODERATE|PIK3CA|ENSG00000121879|transcript|ENST00000643187.1|protein_coding|10/22|c.1564G>A|p.Glu522Lys|1670/4130|1564/3003|522/1000||
DP=443;VDB=0.000411984;SGB=1.31889;RPB=0.999946;MQB=1;BQB=0.886634;MQ0F=0;AC=1;AN=10;DP4=403,0,6,0;MQ=60;REF_T=55;ALT_T=10;ANN=C|missense_variant|MODERATE|EGFR|ENSG00000146648|transcript|ENST00000275493.7|protein_coding|18/28|c.2134T>C|p.Phe712Leu|2395/9905|2134/3633|712/1210||,C|missense_variant|MODERATE|EGFR|ENSG00000146648|transcript|ENST00000455089.5|protein_coding|17/26|c.1999T>C|p.Phe667Leu|2256/3844|1999/3276|667/1091||,C|missense_variant|MODERATE|EGFR|ENSG00000146648|transcript|ENST00000454757.6|protein_coding|17/27|c.1999T>C|p.Phe667Leu|2245/5464|1999/3498|667/

In [None]:
df_vcf = parse_vcf_to_df("/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/08.filter_snp/test1_filtered.vcf", infos=[])

In [None]:
df_vcf

In [None]:
df_annovar = parse_annovar("/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/09.analysis_snp/annovar/test1.hg38_multianno.txt", n_entry=df_vcf.shape[0])

In [None]:
df_annovar

In [None]:
df_vcf = pd.concat((df_vcf, df_annovar), axis=1)

In [None]:
df_vcf

In [None]:
df_ncell = pd.read_csv("/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/09.analysis_snp/test1_variant_ncell.csv")
df_vcf = pd.concat([df_vcf, df_ncell], axis=1)

In [None]:
df_vcf.Protein

In [4]:
    with open("/SGRNJ06/randd/USER/cjj/snpeff/20221205/test1/09.analysis_snp/snpEff/variants_ann.vcf") as f:
        for line in f.readlines():
            if not line.startswith("#"):
                info = line.split('\t')[7]
                anns = info.split("|")
                gene = anns[3]
                print(anns)
                break

['DP=144;VDB=0;SGB=99.7196;RPB=0.998513;MQB=1;BQB=0.994064;MQ0F=0;AC=2;AN=4;DP4=5,0,139,0;MQ=60;REF_T=10;ALT_T=46;ANN=A', 'missense_variant', 'MODERATE', 'PIK3CA', 'ENSG00000121879', 'transcript', 'ENST00000263967.4', 'protein_coding', '10/21', 'c.1564G>A', 'p.Glu522Lys', '1887/9259', '1564/3207', '522/1068', '', ',A', 'missense_variant', 'MODERATE', 'PIK3CA', 'ENSG00000121879', 'transcript', 'ENST00000643187.1', 'protein_coding', '10/22', 'c.1564G>A', 'p.Glu522Lys', '1670/4130', '1564/3003', '522/1000', '', '']
