In [5]:
import pysam
import pandas as pd
import re 

In [6]:
def parse_vcf_to_df(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID', 'CID')):
    """
    Read selected record attributes and INFO fields of a VCF into a DataFrame.

    Parameters
    ----------
    vcf_file : str
        Path handed to ``pysam.VariantFile``.
    cols : sequence of str
        Attribute names read from every record with ``getattr``. Each one
        becomes a column named ``col.capitalize()``. The ``'alleles'``
        attribute is joined with ``'-'`` into a single string (e.g. ``'G-A'``).
    infos : sequence of str
        INFO keys read from ``rec.info``; each becomes a column of the same
        name. Any sequence is accepted (tuple or list); pass an empty
        sequence to skip INFO extraction.

    Returns
    -------
    pandas.DataFrame
        One row per record, columns ordered as the capitalised ``cols``
        followed by ``infos``, with a fresh 0..n-1 index. Column dtypes are
        inferred by pandas from the collected values. A file without records
        yields an empty frame that still carries the expected columns.

    Raises
    ------
    AttributeError
        If a name in ``cols`` is not an attribute of the record.
    KeyError
        If a key in ``infos`` is absent from a record's INFO (as raised by
        ``rec.info[...]``).
    """
    # list(...) so that the default tuple (or any other sequence) can be
    # appended to the list of capitalised column names.
    columns = [col.capitalize() for col in cols] + list(infos)

    # Collect plain dicts and build the frame once; concatenating a one-row
    # frame per record copies the whole frame on every iteration.
    records = []
    vcf = pysam.VariantFile(vcf_file)
    try:
        for rec in vcf.fetch():
            row = {}
            for col in cols:
                value = getattr(rec, col)
                if col == 'alleles':
                    value = '-'.join(value)
                row[col.capitalize()] = value
            for info in infos:
                row[info] = rec.info[info]
            records.append(row)
    finally:
        # Release the file handle even when a record fails to parse.
        vcf.close()

    return pd.DataFrame(records, columns=columns)

In [7]:
# Parse the filtered SNP VCF into a frame; the empty `infos` means no INFO
# fields are extracted, only the default record columns.
filtered_vcf_path = "/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/08.filter_snp/test1_filtered.vcf"
df_vcf = parse_vcf_to_df(vcf_file=filtered_vcf_path, infos=[])

In [8]:
# Display the parsed variants.
df_vcf

Unnamed: 0,Chrom,Pos,Alleles
0,3,179218234,G-A
1,7,55173993,T-C
2,7,55173995,C-CA
3,7,55181290,C-G
4,7,55181292,G-T
5,7,55181370,G-A
6,7,55181378,C-T
7,7,55191822,T-G
8,7,140753316,A-C
9,7,140753326,T-A


In [9]:
# Load the per-variant genotype cell counts and place them next to the
# parsed variants.
# NOTE(review): the column-wise concat aligns rows by index label only, so it
# presumably relies on both tables listing the variants in the same order —
# confirm. Re-running this cell appends the count columns to df_vcf again.
variant_ncell_csv = "/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/09.analysis_snp/test1_variant_ncell.csv"
df_ncell = pd.read_csv(variant_ncell_csv)
df_vcf = pd.concat((df_vcf, df_ncell), axis="columns")

In [10]:
# Display the variants together with the columns added from df_ncell.
df_vcf

Unnamed: 0.1,Chrom,Pos,Alleles,Unnamed: 0,0/0,0/1,1/1
0,3,179218234,G-A,3_179218234,1,0,1
1,7,55173993,T-C,7_55173993,3,0,1
2,7,55173995,C-CA,7_55173995,2,0,1
3,7,55181290,C-G,7_55181290,0,0,2
4,7,55181292,G-T,7_55181292,0,0,1
5,7,55181370,G-A,7_55181370,0,1,4
6,7,55181378,C-T,7_55181378,0,1,4
7,7,55191822,T-G,7_55191822,1,0,4
8,7,140753316,A-C,7_140753316,1,0,1
9,7,140753326,T-A,7_140753326,1,0,1


In [13]:
# Walk the annotated VCF and, for every record, keep its raw INFO string and
# print the gene name followed by each coding ("c.") and protein ("p.")
# change found among the "|"-separated annotation fields.
info_list = []
with open("/SGRNJ06/randd/USER/cjj/TESTDATA/git/celescope_test_script/snp/test1/08.filter_snp/GRCh99/ann.vcf") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        if line.startswith("#"):
            # Header / meta-information line.
            continue

        # The INFO column is the 8th tab-separated field; drop the line's
        # newline first so it can never end up inside the stored string.
        info = line.rstrip("\n").split('\t')[7]

        # Store the raw INFO string: later cells index info_list and attach
        # it to df_vcf, which fails if the list stays empty.
        info_list.append(info)

        anns = info.split("|")
        # Field 3 of the split INFO holds the gene name of the first
        # annotation. A record whose INFO has fewer than four "|"-separated
        # fields raises IndexError here.
        gene = anns[3]
        print(gene)

        for ann in anns:
            # Slice the two-character prefix off. str.strip("p.") treats its
            # argument as a set of characters and would also eat a trailing
            # "p", e.g. "p.Asp816Asp" -> "Asp816As".
            if ann.startswith("p."):
                protein = ann[len("p."):]
                print(protein)
            if ann.startswith("c."):
                # Despite the name, this holds the "c." change with its
                # prefix removed.
                exon = ann[len("c."):]
                print(exon)

        print("=======================================")

['DP=144;VDB=0;SGB=99.7196;RPB=0.998513;MQB=1;BQB=0.994064;MQ0F=0;AC=2;AN=4;DP4=5,0,139,0;MQ=60;REF_T=10;ALT_T=46;ANN=A', 'missense_variant', 'MODERATE', 'PIK3CA', 'ENSG00000121879', 'transcript', 'ENST00000263967.3', 'protein_coding', '10/21', 'c.1564G>A', 'p.Glu522Lys', '1721/9093', '1564/3207', '522/1068', '', ',A', 'upstream_gene_variant', 'MODIFIER', 'PIK3CA', 'ENSG00000121879', 'transcript', 'ENST00000462255.1', 'retained_intron', '', 'n.-1317G>A', '', '', '', '', '1317', '']
PIK3CA
1564G>A
Glu522Lys
['DP=443;VDB=0.000411984;SGB=1.31889;RPB=0.999946;MQB=1;BQB=0.886634;MQ0F=0;AC=1;AN=10;DP4=403,0,6,0;MQ=60;REF_T=55;ALT_T=10;ANN=C', 'missense_variant', 'MODERATE', 'EGFR', 'ENSG00000146648', 'transcript', 'ENST00000275493.6', 'protein_coding', '18/28', 'c.2134T>C', 'p.Phe712Leu', '2311/9821', '2134/3633', '712/1210', '', ',C', 'missense_variant', 'MODERATE', 'EGFR', 'ENSG00000146648', 'transcript', 'ENST00000455089.5', 'protein_coding', '17/26', 'c.1999T>C', 'p.Phe667Leu', '2256/384

In [None]:
# Display info_list (initialised in the annotation-parsing cell).
info_list

In [None]:
# Print every comma-separated piece of each INFO string on its own line,
# followed by a blank separator after each entry.
for raw_info in info_list:
    info = raw_info.split(',')
    print("\n".join(info))
    print('\n')

In [53]:
# Attach the INFO strings to df_vcf as a new "info" column (mutates df_vcf in place).
# NOTE(review): presumably info_list must hold exactly one entry per df_vcf row — confirm.
df_vcf["info"]= info_list

In [None]:
# Display df_vcf after the "info" column has been added.
df_vcf

In [59]:
# Show the first INFO string.
# NOTE(review): the saved output below carries an "EFF=" annotation, whereas the
# output saved under the annotation-parsing cell shows "ANN=" fields; the two
# outputs look like they come from different runs or input files — confirm.
info_list[0]

'DP=144;VDB=0;SGB=99.7196;RPB=0.998513;MQB=1;BQB=0.994064;MQ0F=0;AC=2;AN=4;DP4=5,0,139,0;MQ=60;REF_T=10;ALT_T=46;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Gaa/Aaa|E522K|1068|PIK3CA|protein_coding|CODING|ENST00000263967|10|A),UPSTREAM(MODIFIER||1317|||PIK3CA|retained_intron|CODING|ENST00000462255||A)'

In [60]:
# Show the second INFO string.
info_list[1]

'DP=443;VDB=0.000411984;SGB=1.31889;RPB=0.999946;MQB=1;BQB=0.886634;MQ0F=0;AC=1;AN=10;DP4=403,0,6,0;MQ=60;REF_T=55;ALT_T=10;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Ttc/Ctc|F712L|1210|EGFR|protein_coding|CODING|ENST00000275493|18|C),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Ttc/Ctc|F667L|1091|EGFR|protein_coding|CODING|ENST00000455089|17|C),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Ttc/Ctc|F667L|1165|EGFR|protein_coding|CODING|ENST00000454757|17|C),DOWNSTREAM(MODIFIER||2948||705|EGFR|protein_coding|CODING|ENST00000344576||C)'