In [None]:
import csv

import numpy as np
import pandas as pd
import pysam as ps
from gtfparse import read_gtf

from sdhb_functions import *

## SDHB Reference Sequence

In [None]:
# One-off extraction step, kept commented out: it parses the full RefSeq GRCh38 GTF,
# keeps the rows whose gene_id is 'SDHB' and pickles them. The following cell reads
# that pickle back, so this only needs to be re-run to regenerate the cache.
# NOTE: the output is a pandas pickle even though its file name ends in '.gtf'.
# gtf_df = read_gtf('Hsapiens/GRCh38/RefSeq/annotation_110/GRCh38_latest_genomic.gtf')
# gtf_df[gtf_df['gene_id']=='SDHB'].to_pickle('01_New_reference_SDHB/sdhb_gtf_hg38.gtf')

In [None]:
# Inputs for the rest of the notebook:
#   * the cached SDHB annotation rows (a pandas pickle, despite the '.gtf' suffix)
#   * the GRCh38 RefSeq genome FASTA, opened for coordinate-based sequence lookups
# NOTE(review): unpickling can execute arbitrary code; only load a pickle you produced yourself.
sdhb_pickle_path = '01_New_reference_SDHB/sdhb_gtf_hg38.gtf'
genome_fasta_path = 'Hsapiens/GRCh38/RefSeq/assembly_GCF_000001405.40/GRCh38_latest_genomic.fna'

gtf_sdbh = pd.read_pickle(sdhb_pickle_path)
ref_fa = ps.FastaFile(genome_fasta_path)

### FASTA reference sequence

In [None]:
# Individual pieces of the minigene insert: two plasmid junction sequences (s1, s11)
# plus nine genomic windows (s2..s10) fetched from NC_000001.11.
# pysam's FastaFile.fetch uses 0-based, half-open coordinates, hence the "-1" on
# every window start that is anchored on a 1-based position.
sdhb_chrom = 'NC_000001.11'

# plasmid sequence at intron 5
s1 = 'TAAACTTAAGCTTGGTACCGAGCTCG'

# (start, end) of each genomic window, in the order s2 .. s10
segment_bounds = [
    (17027749 - 202 - 1, 17027749 - 1),   # s2:  intron 5: 202 bp
    (17027749 - 1, 17028736),             # s3:  exon 4 complete, intron 4 complete, exon 5 complete
    (17028736, 17028736 + 325),           # s4:  intron 3: - 325 bp
    (17033060 - 324 - 1, 17033060 - 1),   # s5:  intron 3: + 324 bp
    (17033060 - 1, 17033145),             # s6:  exon 3: complete
    (17033145, 17033145 + 325),           # s7:  intron 2: - 325 bp
    (17044761 - 325 - 1, 17044761 - 1),   # s8:  intron 2: + 325 bp
    (17044761 - 1, 17044888),             # s9:  exon 2: complete
    (17044888, 17044888 + 202),           # s10: intron 1: 202 bp
]
s2, s3, s4, s5, s6, s7, s8, s9, s10 = [
    ref_fa.fetch(sdhb_chrom, window_start, window_end)
    for window_start, window_end in segment_bounds
]

# plasmid sequence at intron 1:
s11 = 'GTCTAGAGGGCCCGTTTAAACGCTAGCCAGCTT'

In [None]:
# Assemble the minigene reference: three consecutive genomic stretches with the two
# plasmid junction sequences (s1, s11) spliced in at the cut points.
# The cut points sit 202 bp outside positions 17027749 and 17044888 respectively.
left_cut = 17027749 - 202 - 1
right_cut = 17044888 + 202

# genomic sequence before the left cut (presumably the exon 8 side; confirm)
s_e8 = ref_fa.fetch('NC_000001.11', 17018722 - 1, left_cut)

# genomic sequence between the two cuts
s_i5_i2 = ref_fa.fetch('NC_000001.11', left_cut, right_cut)

# genomic sequence after the right cut (presumably the exon 1 side; confirm)
s_e1 = ref_fa.fetch('NC_000001.11', right_cut, 17054032)

sdhb_ref = ''.join([s_e8, s1, s_i5_i2, s11, s_e1])

In [None]:
# Write the assembled minigene as a single-record FASTA: the header line followed by
# the whole upper-cased sequence on one line (no trailing newline is written).
with open('01_New_reference_SDHB/sdhb_minigene_ref.fa', 'w') as fasta_out:
    fasta_out.write('>chrSDHB_minigene\n' + sdhb_ref.upper())

### GTF annotation file SDHB

In [None]:
# Build the annotation rows for the unspliced minigene transcript from the first ten
# rows of the cached SDHB annotation; rows 5-9 are dropped at the end, leaving rows 0-4.
altered_gtf_sdhb = gtf_sdbh.copy().reset_index(drop=True).iloc[:10]

# Re-home the rows onto the minigene contig.
on_genome_contig = altered_gtf_sdhb['seqname'] == 'NC_000001.11'
altered_gtf_sdhb.loc[on_genome_contig, 'seqname'] = 'chrSDHB_minigene'

# Row 1 and every exon row are labelled as the unspliced transcript.
unspliced_rows = (altered_gtf_sdhb.index == 1) | (altered_gtf_sdhb['feature'] == 'exon')
altered_gtf_sdhb.loc[unspliced_rows, 'transcript_id'] = 'MG_unspliced'

# Hard-coded coordinates on the minigene contig, applied in this order.
# (selector column, selector value, start, end)
# NOTE(review): values are typed in by hand; verify them against the assembled reference.
unspliced_coords = [
    ('feature', 'gene', 1, 35370),
    ('exon_number', '3', 8826, 10367),
    ('exon_number', '2', 14041, 14776),
    ('exon_number', '1', 25741, 26429),
    ('feature', 'transcript', 8826, 26429),
]
for sel_column, sel_value, new_start, new_end in unspliced_coords:
    selected = altered_gtf_sdhb[sel_column] == sel_value
    altered_gtf_sdhb.loc[selected, 'start'] = new_start
    altered_gtf_sdhb.loc[selected, 'end'] = new_end

altered_gtf_sdhb = altered_gtf_sdhb.drop(index=list(range(5, 10)))

In [None]:
# Add placeholder rows for the spliced transcript: one copy of the transcript row
# (becomes row 5) and five copies of the exon-1 row (become rows 6-10). The copies are
# relabelled 'MG_spliced' and their exon numbers set to '1'..'5'.
#
# DataFrame.append was removed in pandas 2.0; pd.concat(..., ignore_index=True) is the
# drop-in equivalent and also works on older pandas versions.
#
# NOTE: the hard-coded row labels assume the frame holds exactly rows 0-4 on entry, so
# this cell must run exactly once after the previous one (re-running appends again).
transcr_row = altered_gtf_sdhb[altered_gtf_sdhb['feature']=='transcript']
altered_gtf_sdhb = pd.concat([altered_gtf_sdhb, transcr_row], ignore_index=True)
exon_row = altered_gtf_sdhb[altered_gtf_sdhb['exon_number']=='1']
altered_gtf_sdhb = pd.concat([altered_gtf_sdhb] + [exon_row]*5, ignore_index=True)
altered_gtf_sdhb.loc[altered_gtf_sdhb.index.isin([5,6,7,8,9,10]), 'transcript_id'] = 'MG_spliced'
for i in range(5):
    # rows 6..10 -> exon_number '1'..'5'
    altered_gtf_sdhb.loc[altered_gtf_sdhb.index==6+i, 'exon_number'] = str(1+i)

In [None]:
# Set start/end for every row of the spliced transcript. Only rows whose transcript_id
# is 'MG_spliced' are touched; the unspliced rows keep their coordinates.
is_spliced = altered_gtf_sdhb['transcript_id'] == 'MG_spliced'

# (selector column, selector value, start, end), applied in this order
# NOTE(review): values are typed in by hand; verify them against the assembled reference.
spliced_coords = [
    ('exon_number', '5', 8826, 9171),
    ('exon_number', '4', 9905, 10042),
    ('exon_number', '3', 14365, 14451),
    ('exon_number', '2', 26066, 26194),
    ('exon_number', '1', 26294, 26429),
    ('feature', 'transcript', 8826, 26429),
]
for sel_column, sel_value, new_start, new_end in spliced_coords:
    target_rows = (altered_gtf_sdhb[sel_column] == sel_value) & is_spliced
    altered_gtf_sdhb.loc[target_rows, 'start'] = new_start
    altered_gtf_sdhb.loc[target_rows, 'end'] = new_end

In [None]:
# GTF attribute strings (9th column) for every row of the minigene annotation.
# Transcript and exon strings are generated from two templates so the two transcripts
# cannot drift apart; the resulting text is exactly what was previously written by hand.
_TRANSCRIPT_TEMPLATE = 'gene_id "SDHB"; transcript_id "{}"; gbkey "mRNA"; gene "SDHB"; transcript_biotype "SDHB";'
_EXON_TEMPLATE = 'gene_id "SDHB"; transcript_id "{}"; gene "SDHB"; transcript_biotype "SDHB"; exon_number "{}";'


def _exon_attr(transcript_id, exon_number):
    """Return the attribute string of one exon row of the given transcript."""
    return _EXON_TEMPLATE.format(transcript_id, exon_number)


gene_attr = 'gene_id "SDHB"; gbkey "Gene"; gene "SDHB"; gene_biotype "protein_coding";'

# unspliced transcript: three exons
transcript_unspl_attr = _TRANSCRIPT_TEMPLATE.format('MG_unspliced')
exon_1_unspl_attr = _exon_attr('MG_unspliced', 1)
exon_2_unspl_attr = _exon_attr('MG_unspliced', 2)
exon_3_unspl_attr = _exon_attr('MG_unspliced', 3)

# spliced transcript: five exons
transcript_spl_attr = _TRANSCRIPT_TEMPLATE.format('MG_spliced')
exon_1_spl_attr = _exon_attr('MG_spliced', 1)
exon_2_spl_attr = _exon_attr('MG_spliced', 2)
exon_3_spl_attr = _exon_attr('MG_spliced', 3)
exon_4_spl_attr = _exon_attr('MG_spliced', 4)
exon_5_spl_attr = _exon_attr('MG_spliced', 5)

In [None]:
# One attribute string per annotation row, listed in row order (rows 0-10).
# pandas raises if the list length does not match the number of rows in the frame.
attributes_list = [
    gene_attr,
    # unspliced transcript and its three exons
    transcript_unspl_attr,
    exon_1_unspl_attr,
    exon_2_unspl_attr,
    exon_3_unspl_attr,
    # spliced transcript and its five exons
    transcript_spl_attr,
    exon_1_spl_attr,
    exon_2_spl_attr,
    exon_3_spl_attr,
    exon_4_spl_attr,
    exon_5_spl_attr,
]
altered_gtf_sdhb['attributes'] = attributes_list

In [None]:
# Prepare the dataframe for the GTF file format:
#   * missing scores become the '.' placeholder
#   * rows with an empty source get a descriptive source label
altered_gtf_sdhb['score'] = altered_gtf_sdhb['score'].replace(to_replace=np.nan, value='.')

has_empty_source = altered_gtf_sdhb['source'].eq('')
altered_gtf_sdhb.loc[has_empty_source, 'source'] = 'altered_ref_SDHB_minigene'

In [None]:
# Write all rows (gene, unspliced and spliced transcript) as a headerless,
# tab-separated nine-column annotation file with a '.gff' suffix.
# quoting=csv.QUOTE_NONE is required: the attributes column contains double quotes, and
# pandas' default quoting would wrap that field in quotes and double every inner quote
# ("gene_id ""SDHB""; ..."), which annotation parsers do not accept.
# NOTE(review): the attribute syntax written here is GTF-style even though the suffix is '.gff'.
altered_gtf_sdhb[['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
                ].to_csv('01_New_reference_SDHB/altered_gtf_sdhb_mg_unspliced_spliced_together.gff', sep='\t', index=False, header=False,
                         quoting=csv.QUOTE_NONE)

In [None]:
# Write all rows (gene, unspliced and spliced transcript) as a headerless,
# tab-separated nine-column GTF file.
# quoting=csv.QUOTE_NONE is required: the attributes column contains double quotes, and
# pandas' default quoting would wrap that field in quotes and double every inner quote
# ("gene_id ""SDHB""; ..."), which is not valid GTF.
altered_gtf_sdhb[['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
                ].to_csv('01_New_reference_SDHB/altered_gtf_sdhb_mg_unspliced_spliced_together.gtf', sep='\t', index=False, header=False,
                         quoting=csv.QUOTE_NONE)


In [None]:
# Write only row 0 and rows 5-10 (the rows relabelled 'MG_spliced' plus the first row)
# as a headerless, tab-separated nine-column GTF file.
# quoting=csv.QUOTE_NONE is required: the attributes column contains double quotes, and
# pandas' default quoting would wrap that field in quotes and double every inner quote
# ("gene_id ""SDHB""; ..."), which is not valid GTF.
altered_gtf_sdhb[altered_gtf_sdhb.index.isin([0,5,6,7,8,9,10])][['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
                ].to_csv('01_New_reference_SDHB/altered_gtf_sdhb_mg_only_spliced.gtf', sep='\t', index=False, header=False,
                         quoting=csv.QUOTE_NONE)

In [None]:
# Write only row 0 and rows 5-10 (the rows relabelled 'MG_spliced' plus the first row)
# as a headerless, tab-separated nine-column annotation file with a '.gff' suffix.
# quoting=csv.QUOTE_NONE is required: the attributes column contains double quotes, and
# pandas' default quoting would wrap that field in quotes and double every inner quote
# ("gene_id ""SDHB""; ..."), which annotation parsers do not accept.
# NOTE(review): the attribute syntax written here is GTF-style even though the suffix is '.gff'.
altered_gtf_sdhb[altered_gtf_sdhb.index.isin([0,5,6,7,8,9,10])][['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
                ].to_csv('01_New_reference_SDHB/altered_gtf_sdhb_mg_only_spliced.gff', sep='\t', index=False, header=False,
                         quoting=csv.QUOTE_NONE)