In [None]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import re
from gtfparse import read_gtf
import math
import os
import pickle
import matplotlib.pyplot as plt
import matplotlib.patches as pplt
from functools import reduce
import pysam as ps

from sdhb_functions import *

### Run SpliceAI with ALT and REF scores and an expanded distance around each variant
SpliceAI fork used: https://github.com/bw2/SpliceAI/tree/master


In [None]:
# Directory holding the synthetic-variant identifier tables (trailing slash is
# required: the path is later combined with file names by string concatenation).
syn_var_path = '01_syn_tables_out/'

# One exon table plus four intron tables (100, 200, 300, 400).
syn_var_files = [
    'synth_sdhb_exon_variants.txt',
    'synth_sdhb_intron_100_variants.txt',
    'synth_sdhb_intron_200_variants.txt',
    'synth_sdhb_intron_300_variants.txt',
    'synth_sdhb_intron_400_variants.txt',
]

# NOTE: despite the .gtf extension this file is loaded as a pickle.
# Unpickling can execute arbitrary code, so only load files you created yourself.
gtf_sdbh_all = pd.read_pickle('00_sdhb_seq_dfs/sdhb_gtf.gtf')

# Keep only the exon rows for exon numbers 2-5 (stored as strings), reduced to
# the feature / coordinate / exon-number columns.
is_exon_row = gtf_sdbh_all['feature'] == 'exon'
is_exon_2_to_5 = gtf_sdbh_all['exon_number'].isin(['2', '3', '4', '5'])
gtf_sdbh = gtf_sdbh_all.loc[
    is_exon_row & is_exon_2_to_5,
    ['feature', 'start', 'end', 'exon_number'],
].copy()

In [None]:
def _identifier_to_vcf(variants):
    """Build the eight fixed VCF body columns from a table of variant identifiers.

    The parsing assumes identifiers shaped like ``<chrom>:g.<pos><REF>><ALT>``
    (the input files are not visible here -- confirm against the data).

    Parameters
    ----------
    variants : pandas.DataFrame
        Frame with a string column named ``identifier``.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the columns
        ``#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO`` on the same index.
        ``POS`` is an integer; ``ID``, ``QUAL``, ``FILTER`` and ``INFO`` are
        filled with the placeholder ``'.'``.

    Raises
    ------
    ValueError
        From ``astype(int)`` when no digits follow ``g.`` in an identifier.
    """
    identifier = variants['identifier']

    # Text that follows the literal "g." marker. The dot is escaped because
    # pandas treats multi-character split patterns as regular expressions; an
    # unescaped "g." would also split on e.g. the "g1" inside "hg19".
    genomic_part = identifier.str.split(r'g\.', expand=True)[1]

    return pd.DataFrame({
        # Everything before the first ':'.
        '#CHROM': identifier.str.split(':', expand=True)[0],
        # First run of digits after "g." (raw string avoids the invalid '\d' escape).
        'POS': genomic_part.str.extract(r'(\d+)', expand=False).astype(int),
        'ID': '.',
        # First run of upper-case letters between the position and '>'.
        'REF': genomic_part.str.split('>', expand=True)[0]
                           .str.extract(r'([A-Z]+)', expand=False),
        # Text following the first '>'.
        'ALT': identifier.str.split('>', expand=True)[1],
        'QUAL': '.',
        'FILTER': '.',
        'INFO': '.',
    })


# Read each identifier table (single unnamed column) and convert it to VCF columns.
syn_dfs = [
    _identifier_to_vcf(pd.read_table(syn_var_path + f, names=['identifier']))
    for f in syn_var_files
]

# Outer-merge on all shared columns to combine the tables into one, then order
# the records by position with a fresh 0..n-1 index.
syn_df_vcf = reduce(lambda left, right: pd.merge(left, right, how='outer'), syn_dfs)
syn_df_vcf = syn_df_vcf.sort_values('POS', ignore_index=True)

In [None]:
# Select every variant whose position lies within 300 bp of one of four fixed
# genomic intervals (bounds inclusive on both sides).
# Presumably these are the coordinates of the exons being studied -- confirm.
flank_300 = 300
intervals_300 = [
    (17371256, 17371383),
    (17359555, 17359640),
    (17355095, 17355231),
    (17354244, 17354360),
]

pos_300 = syn_df_vcf['POS']
window_masks_300 = [
    pos_300.between(lower - flank_300, upper + flank_300)
    for lower, upper in intervals_300
]
in_any_window_300 = reduce(lambda acc, mask: acc | mask, window_masks_300)

syn_300_vcf = syn_df_vcf.loc[in_any_window_300].copy().reset_index(drop=True)

In [None]:
# Append the +/-300 bp selection to the VCF as tab-separated rows. With the
# default header setting the column-name row (starting with "#CHROM") is
# written as well; the row index is omitted.
# NOTE(review): mode='a' appends, so re-running this cell adds the same rows
# (and column line) again. Presumably the "##" meta header was written to this
# file beforehand -- confirm, and delete the file before a re-run.
syn_300_vcf.to_csv(
    '01_syn_tables_out/synth_sdhb_vars.vcf',
    sep='\t',
    index=False,
    mode='a',
)

In [None]:
# last +/-325 variants (only 25 positions necessary, because up to 300 already run)
syn_last_25_vcf = syn_df_vcf[((syn_df_vcf['POS']>=17371256-325)&(syn_df_vcf['POS']<17371256-300))|
                             ((syn_df_vcf['POS']>17359640+300)&(syn_df_vcf['POS']<=17359640+325))|
                             ((syn_df_vcf['POS']>=17359555-325)&(syn_df_vcf['POS']<17359555-300))|
                             ((syn_df_vcf['POS']>17355231+300)&(syn_df_vcf['POS']<=17355231+325))|
                             ((syn_df_vcf['POS']>=17355095-325)&(syn_df_vcf['POS']<17355095-300))|
                             ((syn_df_vcf['POS']>17354360+300)&(syn_df_vcf['POS']<=17354360+325))
                            ].copy().reset_index(drop=True)

In [None]:
# Create (or overwrite) the VCF for the +/-325 variants and write its header:
# the "##" meta-information lines followed by the tab-separated column line.
vcf_meta_lines = [
    '##fileformat=VCFv4.2',
    '##fileDate=20191004',
    '##reference=GRCh37/hg19',
]

# Contig IDs and lengths, emitted in this order as "##contig" lines.
vcf_contigs = [
    ('1', 249250621), ('2', 243199373), ('3', 198022430), ('4', 191154276),
    ('5', 180915260), ('6', 171115067), ('7', 159138663), ('8', 146364022),
    ('9', 141213431), ('10', 135534747), ('11', 135006516), ('12', 133851895),
    ('13', 115169878), ('14', 107349540), ('15', 102531392), ('16', 90354753),
    ('17', 81195210), ('18', 78077248), ('19', 59128983), ('20', 63025520),
    ('21', 48129895), ('22', 51304566), ('X', 155270560), ('Y', 59373566),
]
vcf_meta_lines += [
    '##contig=<ID={},length={}>'.format(contig_id, contig_len)
    for contig_id, contig_len in vcf_contigs
]

vcf_column_line = '\t'.join(
    ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
)

with open('01_syn_tables_out/synth_325_vars.vcf', 'w') as vcf_out:
    # Every line, including the column line, ends with a newline.
    vcf_out.write('\n'.join(vcf_meta_lines + [vcf_column_line]) + '\n')


#### Run SpliceAI on the command line
with a distance of 2,000 bp (`-D 2000`)

```bash
OMP_NUM_THREADS=10 nice -5 spliceai -I synth_sdhb_vars.vcf -O synth_sdhb_vars_spliceai_out.vcf -R RefSeq.fa -D 2000
```
