In [None]:
import pandas as pd 

In [None]:
# Three-letter -> one-letter amino-acid codes for the 20 standard residues.
# Insertion order is kept as-is because callers iterate over this mapping.
AA_DICT = dict(
    Gly="G", Ala="A", Val="V", Leu="L", Ile="I",
    Phe="F", Trp="W", Tyr="Y", Asp="D", Asn="N",
    Glu="E", Lys="K", Gln="Q", Met="M", Ser="S",
    Thr="T", Cys="C", Pro="P", His="H", Arg="R",
)

In [None]:
def _summarize_info(info):
    """Condense one snpEff-annotated INFO string into (gene, mRNA, protein).

    The string is split on "|" as a whole, so the pipe-separated values of
    every annotation it contains end up in one flat list.

    Args:
        info: text of the INFO column of a single VCF record.

    Returns:
        Tuple ``(gene, mrna, protein)``. ``gene`` is the fourth
        pipe-separated value ("" when there are fewer than four values).
        ``mrna`` is a comma-joined string of ``exon<rank>:<change>`` labels
        and ``protein`` a comma-joined string of protein changes with
        three-letter residue codes replaced via ``AA_DICT``. Repeated
        entries are kept once, in first-seen order.
    """
    fields = info.split("|")
    # A record without enough annotation values yields an empty gene instead
    # of an IndexError, so the three output lists stay aligned per record.
    gene = fields[3] if len(fields) > 3 else ""

    mrna_changes, protein_changes = [], []
    for idx, field in enumerate(fields):
        if field.startswith("c."):
            # The value right before the "c." entry is read as "<rank>/<total>".
            # Use this entry's own position: looking the string up with
            # list.index() would always return the first transcript's rank
            # when the same change is reported for several transcripts.
            rank = fields[idx - 1].split("/")[0] if idx > 0 else ""
            # WARNING_TRANSCRIPT_INCOMPLETE: no rank available, skip the entry.
            if not rank:
                continue
            # Slice off the "c." prefix; str.strip("c.") would also eat any
            # trailing 'c' or '.' characters of the change itself.
            mrna_changes.append(f"exon{rank}:{field[2:]}")
        elif field.startswith("p."):
            protein = field[2:]
            for three_letter, one_letter in AA_DICT.items():
                protein = protein.replace(three_letter, one_letter)
            protein_changes.append(protein)

    # dict.fromkeys drops repeats while preserving first-seen order.
    return (
        gene,
        ",".join(dict.fromkeys(mrna_changes)),
        ",".join(dict.fromkeys(protein_changes)),
    )


def parse_variant_ann(variant_ann_file):
    """
    Collect gene, mRNA-change and protein-change summaries per VCF record.

    Lines starting with "#" and blank lines are ignored; every other line
    contributes exactly one entry to each returned list, in file order.

    Args:
        variant_ann_file: variant annotation file from snpEff.

    Returns:
        gene_list, mRNA_list, protein_list

    Raises:
        IndexError: if a record line has fewer than eight tab-separated columns.
    """
    gene_list, mRNA_list, protein_list = [], [], []

    with open(variant_ann_file) as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            # Drop the line terminator so it cannot leak into the last value
            # when INFO happens to be the final column.
            info = line.rstrip("\r\n").split("\t")[7]
            gene, mrna, protein = _summarize_info(info)
            gene_list.append(gene)
            mRNA_list.append(mrna)
            protein_list.append(protein)

    return (gene_list, mRNA_list, protein_list)

In [None]:
# keep_default_na=False: strings such as "NA" in this table stay as text
# instead of being parsed as missing values.
df_gt = pd.read_csv(
    "/SGRNJ06/randd/USER/cjj/snpeff/20230608error/A124_FJ/09.analysis_snp/A124_FJ_gt.csv",
    index_col=0,
    keep_default_na=False,
)
# The first column becomes the index of both tables.
df_ncell = pd.read_csv(
    "/SGRNJ06/randd/USER/cjj/snpeff/20230608error/A124_FJ/09.analysis_snp/A124_FJ_variant_ncell.csv",
    index_col=0,
)

In [None]:
# Display the table read from A124_FJ_gt.csv (bare last expression -> rendered by the notebook).
df_gt

In [None]:
# Display the table read from A124_FJ_variant_ncell.csv.
df_ncell

In [None]:
# n_variants = element-wise sum of the '0/1' and '1/1' columns.
count_01 = df_ncell['0/1']
count_11 = df_ncell['1/1']
df_ncell['n_variants'] = count_01 + count_11

In [None]:
# Display df_ncell again, now including the n_variants column.
df_ncell

In [None]:
# Index labels of the 20 rows with the largest n_variants, then the matching rows of df_gt.
top_labels = df_ncell.nlargest(20, 'n_variants').index
df_top = df_gt.loc[top_labels, :]

In [None]:
# Show the 20 df_ncell rows with the largest n_variants.
df_ncell.nlargest(n=20, columns='n_variants')

In [None]:
# Display the df_gt rows selected for the 20 largest n_variants.
df_top

In [None]:
# Swap rows and columns so the selected index labels become column labels.
# NOTE: the result is bound to the same name, so running this cell a second
# time flips the frame back.
df_top = df_top.T

In [None]:
# Column labels of df_top; reused below for the dtype loop and the title lookups.
variants = df_top.columns

In [None]:
# Display the column labels captured in `variants`.
variants

In [None]:
# Re-type every column listed in `variants` as categorical, one column at a time.
for variant_id in variants:
    df_top[variant_id] = df_top[variant_id].astype('category')

In [None]:
# Display df_top after the categorical conversion.
df_top

In [None]:
# Parse the annotated VCF; the mRNA summary is bound to an underscore-prefixed name.
gene_list, _mRNA_list, protein_list = parse_variant_ann(
    "/SGRNJ06/randd/USER/cjj/snpeff/20230608error/A124_FJ/09.analysis_snp/A124_FJ_final.vcf"
)

In [None]:
# Display `variants` (the df_top column labels) again for reference.
variants

In [None]:
# Load the variant table with pandas' default CSV parsing options.
variant_table = pd.read_csv(
    "/SGRNJ06/randd/USER/cjj/snpeff/20230608error/A124_FJ/09.analysis_snp/A124_FJ_variant_table.csv"
)

In [None]:
# Display the variant table as loaded from A124_FJ_variant_table.csv.
variant_table

In [None]:
# Cast both key parts to str so they can be concatenated into a "<Chrom>_<Pos>" key.
variant_table['Chrom'] = variant_table['Chrom'].astype(str)
variant_table['Pos'] = variant_table['Pos'].astype(str)
# Element-wise string concatenation; avoids one Python-level '_'.join call per row.
variant_table['Chrom_Pos'] = variant_table['Chrom'] + '_' + variant_table['Pos']

In [None]:
# Display the variant table again, now with string Chrom/Pos and the added Chrom_Pos column.
variant_table

In [None]:
# Missing values become the literal string 'None' across the whole table.
variant_table = variant_table.fillna('None')
# Index by Chrom_Pos once, then convert only the needed column to a
# {Chrom_Pos: value} dict instead of building a dict of every column twice.
table_by_locus = variant_table.set_index("Chrom_Pos")
gene_dict = table_by_locus["Gene"].to_dict()
protein_dict = table_by_locus["Protein"].to_dict()

In [None]:
# Number of distinct Chrom_Pos keys in the protein lookup.
len(protein_dict)

In [None]:
        # NOTE(review): this cell is indented as though pasted from inside a
        # function body; the indentation is left untouched here.
        # Print one "top<rank>_<variant>_<gene>_<protein>" label per entry of `variants`.
        for rank, variant_id in enumerate(variants, start=1):
            title = f'top{rank}_{variant_id}_{gene_dict[variant_id]}_{protein_dict[variant_id]}'
            print(title)

In [None]:
# Display df_top once more for reference.
df_top

In [None]:
# Number of gene entries returned by parse_variant_ann (one per non-header VCF line).
len(gene_list)

In [None]:
# Number of protein entries returned by parse_variant_ann (one per non-header VCF line).
len(protein_list)