# Search for ORFs associated with RAN translation

In [1]:
#import necessary libraries
import re
import pandas as pd
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import numpy as np

### Loading and pre-processing the dataset

In [2]:
# Load the tab-separated repeat database; the first column becomes the index.
df = pd.read_csv('norm_database.txt', sep='\t', index_col=0)
# Replace missing values with empty strings before any string handling.
df = df.fillna('')
# Keep only the columns used by the ORF search.
df = df.loc[:, ['FlankingLeft4000', 'FlankingRight4000', 'ArraySequence',
                'GeneAttribute', 'Pattern', 'CopyNumber']].copy()
# Coerce the three sequence columns to upper-case strings. The original called
# applymap(str) and threw the result away, so the coercion never took effect.
for seq_column in ('FlankingLeft4000', 'FlankingRight4000', 'ArraySequence'):
    df[seq_column] = df[seq_column].astype(str).str.upper()
# Short names used by the rest of the notebook.
df.columns = ['Left', 'Right', 'Rep', 'GeneID', 'Pattern', 'CopyNumber']

### Pipeline for finding ORF

In [51]:
def get_coord(left_seq, repeat, right_seq):
    """
    Locate candidate ORFs that start in the left flank.

    :left_seq: sequence left to repeats
    :repeat: sequence of repeats
    :right_seq: sequence right to repeats
    :return: tuple (starts, ends) as produced by starts_row and stops
    """
    # Alternative start pattern that was left disabled in the original code:
    # r'GCC[AG]CC[CGA][TC][GCT]G'
    start_codon_regex = r'(CTG|GTG|ACG|ATT|ATC|GCG)'
    whole_sequence = left_seq + repeat + right_seq
    # Start codons are searched in the left flank only ...
    orf_starts = starts_row(left_seq, start_codon_regex)
    # ... while the matching ends are searched along the whole sequence.
    orf_ends = stops(orf_starts, whole_sequence, repeat, left_seq)
    return orf_starts, orf_ends
    

def starts_row(left_seq, pattern):
    """
    Return the coordinates of candidate start codons in the left flank.

    Every 3-nt window of ``left_seq`` is matched against ``pattern``; a hit is
    kept only when ``filter_starts_before_repeat`` accepts it.

    :left_seq: sequence left to repeats
    :pattern: regular expression for finding start codons
    :return: list with 1-based coordinates of the accepted start codons
    """
    codon_re = re.compile(pattern)  # compile once instead of once per window
    accepted = []
    # range(len - 2) also visits the final window left_seq[-3:]. The original
    # bound of len - 3 never tested the codon directly adjacent to the repeat.
    for i in range(len(left_seq) - 2):
        if not codon_re.match(left_seq[i:i + 3]):
            continue
        # Coordinates handed on are 1-based, hence i + 1.
        if filter_starts_before_repeat(left_seq, i + 1):
            accepted.append(i + 1)
    return accepted

def filter_starts_before_repeat(seq, start):
    """
    Decide whether a start codon can open an ORF that reaches the repeats.

    :seq: sequence left to repeats
    :start: start codon coord (1-based)
    :return: False if an in-frame stop codon follows the start codon inside
             ``seq`` or if ``len(seq) - start`` exceeds 75; True otherwise
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    # Codons read in frame, beginning with the one right after the start codon.
    downstream = (seq[pos:pos + 3] for pos in range(start + 2, len(seq), 3))
    if any(codon in stop_codons for codon in downstream):
        return False
    # Keep only start codons lying close enough to the end of the flank.
    return len(seq) - start <= 75

def stops(coords, fulls, rep, left):
    """
    Find the end coordinate of the ORF opened by each start codon.

    For every start the sequence is read in frame until the first stop codon.
    The result always has exactly one entry per element of ``coords`` so that
    callers can safely ``zip`` starts with ends.

    :coords: 1-based coordinates of start codons
    :fulls: full sequence (left flank + repeats + right flank)
    :rep: sequence of repeats
    :left: sequence left to repeats
    :return: list aligned with ``coords``; each entry is the coordinate of the
             last base of the stop codon when that codon ends at or beyond the
             end of the repeats, and 0 when the ORF stops earlier or when no
             in-frame stop codon exists
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    left_len = len(left)
    rep_len = len(rep)
    ends = []
    for crd in coords:
        # 0 marks "no usable ORF". It is also the value recorded when the
        # frame has no stop codon at all; the original appended nothing in
        # that case, which shifted every later end onto the wrong start.
        end = 0
        for i in range(crd + 2, len(fulls), 3):
            if fulls[i:i + 3] in stop_codons:
                # Accept only ORFs that run through the whole repeat tract.
                if i + 3 - left_len >= rep_len:
                    end = i + 3
                break
        ends.append(end)
    return ends

def get_qudra_f(gen):
    """
    Flag whether a sequence matches the G-quadruplex motif.

    The DNA string is complemented and transcribed with Biopython, and the
    motif is searched in the resulting string.

    :gen: DNA sequence
    :return: 1 if the motif is found, otherwise 0
    """
    quadruplex_motif = "(G{3,4}[AUGC]{1,12}){4,4}"
    rna = str(Seq(gen, generic_dna).complement().transcribe())
    return 1 if re.search(quadruplex_motif, rna) else 0


def get_frame(id_s, left, rep, right, patt=None, copy=None):
    """
    Return dataframe with ORF

    :id_s: dot-separated identifier; its first three fields fill the GeneID,
           Chr and Coord columns
    :left: left before repeats
    :rep: repeats
    :right: right after repeats
    :patt: value stored in the Pattern column of every row
    :copy: value stored in the Copy column of every row
    :return: dataframe with one row per ORF whose end coordinate is non-zero
    """
    id_parts = id_s.split('.')
    starts, ends = get_coord(left, rep, right)
    full = left + rep + right
    data = {'GeneID': [], 'Genome': [], 'Chr': [], 'Coord': [], 'Pattern': [],
            'Copy': [], 'Is_causac': [], 'Is_gquad': []}
    for start, end in zip(starts, ends):
        if end == 0:
            # 0 is the "no usable ORF" marker produced by stops().
            continue
        orf = full[start - 1:end]
        data['GeneID'].append(id_parts[0])
        data['Chr'].append(id_parts[1])
        data['Coord'].append(id_parts[2])
        data['Genome'].append(orf)
        # Always append, even when the value is empty/None/0: skipping the
        # append left these columns shorter than the others and made
        # pd.DataFrame raise "All arrays must be of the same length".
        data['Pattern'].append(patt)
        data['Copy'].append(copy)
        # Flag a G at the fourth base of the ORF (right after the start codon).
        data['Is_causac'].append(1 if orf[3] == 'G' else 0)
        data['Is_gquad'].append(get_qudra_f(orf))
    return pd.DataFrame(data)

### Retrieve dataframe with ORFs

In [52]:
# The empty seed frame fixes the leading column order and keeps the result
# well-formed when no ORF is found at all.
orf_frames = [pd.DataFrame(columns=['GeneID', 'Chr', 'Coord', 'Genome'])]
for ids, left, rep, right, patt, copy in zip(df.loc[:, 'GeneID'],
                                             df.loc[:, 'Left'],
                                             df.loc[:, 'Rep'],
                                             df.loc[:, 'Right'],
                                             df.loc[:, 'Pattern'],
                                             df.loc[:, 'CopyNumber']):
    orf_frames.append(get_frame(ids, left, rep, right, patt, copy))
# Concatenate once: calling pd.concat inside the loop re-copies the growing
# frame on every iteration.
df_final = pd.concat(orf_frames, axis=0)

### Check for G-quadruplexes in sequences

In [39]:
from Bio import SeqIO
pattern = "(G{2,4}[AUGC]{1,12}){4,4}"
# Append every record matching the quadruplex pattern to quad_seq.fasta
for record in SeqIO.parse("multifasta_ORF_lit.fasta", "fasta"):
    if re.search(pattern, str(record.seq)):
        # One managed handle per record; the original opened three handles
        # per record and never closed any of them. Opening only on a match
        # keeps the behaviour of not creating the file when nothing matches.
        with open('quad_seq.fasta', 'a') as quad_out:
            print('>' + record.id, file=quad_out)
            print(record.seq, file=quad_out)
            print(file=quad_out)  # blank line between records

### Retrieve ORFs from files with sequences

In [33]:
# Input files are named <ID>_<number>.txt; the number is used below as the
# length of each flank.
# NOTE(review): check that the leading "C" of the last file name is an ASCII
# letter; a look-alike non-ASCII character would not match [A-Za-z0-9] and the
# extracted ID would lose its first letter.
files = ['DMPK_496.txt',
         'FMR1_100.txt',
         'HTT_196.txt',
         'ATXN8_325.txt',
         'С9orf72_337.txt']
dct = {
    'Left': [],
    'Right': [],
    'ID': [],
    'Full': [],
    'Rep': []
}
for file in files:
    # The dot is escaped so that it matches only the extension separator.
    bord = int(re.search(r'_([0-9]+)\.', file).group(1))
    ids = re.search(r'([A-Za-z0-9]*)_', file).group(1)
    with open(file, 'r') as f:
        # Only the first line of each file is used.
        line = f.readline().strip()
    dct['ID'].append(ids)
    dct['Left'].append(line[:bord])
    dct['Right'].append(line[len(line) - bord:])
    # Start the repeat where the left flank ends. The original started at
    # bord - 1, duplicating the last base of the left flank, so that
    # Left + Rep + Right was one base longer than Full and coordinates computed
    # on the concatenation did not line up with Full.
    # NOTE(review): assumes the number in the file name is the flank length
    # (as the Left and Right slices imply) - confirm against the data.
    dct['Rep'].append(line[bord:len(line) - bord])
    dct['Full'].append(line)


mult_fast = pd.DataFrame(dct)

In [43]:
def get_frame_fasta(id_s, left, rep, right, full):
    """
    Return dataframe with the ORFs found in one sequence file.

    :id_s: identifier; the part before the first '.' fills the ID column
    :left: left before repeats
    :rep: repeats
    :right: right after repeats
    :full: complete sequence from which the ORFs are sliced
    :return: dataframe with one row per ORF whose end coordinate is non-zero
    """
    id_parts = id_s.split('.')
    starts, ends = get_coord(left, rep, right)
    data = {'ID': [], 'Genome': [], 'Is_causac': [], 'Is_gquad': []}
    for start, end in zip(starts, ends):
        if end == 0:
            # 0 marks "no usable ORF" (same convention as get_frame). Without
            # this guard full[start-1:0] is empty and indexing it with [3]
            # raises IndexError.
            continue
        orf = full[start - 1:end]
        data['ID'].append(id_parts[0])
        data['Genome'].append(orf)
        # Flag a G at the fourth base of the ORF (right after the start codon).
        data['Is_causac'].append(1 if orf[3] == 'G' else 0)
        data['Is_gquad'].append(get_qudra_f(orf))
    return pd.DataFrame(data)

# The empty seed frame fixes the column order and keeps the result well-formed
# when no ORF is found in any file.
fasta_frames = [pd.DataFrame(columns=['ID', 'Genome', 'Is_causac', 'Is_gquad'])]
for ids, left, rep, right, full in zip(mult_fast.loc[:, 'ID'],
                                       mult_fast.loc[:, 'Left'],
                                       mult_fast.loc[:, 'Rep'],
                                       mult_fast.loc[:, 'Right'],
                                       mult_fast.loc[:, 'Full']):
    fasta_frames.append(get_frame_fasta(ids, left, rep, right, full))
# Concatenate once instead of re-copying the growing frame on every iteration.
df_final_fasta = pd.concat(fasta_frames, axis=0)

In [37]:
# Save the ORFs extracted from the sequence files; the index is not written.
df_final_fasta.to_csv('it_is_ok_now.csv', index=False)

### Get scores for sequences before and after repeats

In [44]:
# Score table for the sequence after repeats.
# Layout: {0-based position in the aligned sequence: {nucleotide: score}}.
right = {
    0: {'C': 0.6, 'T': 0.6},
    1: {'A': 0.56, 'G': 0.31},
    2: {'A': 0.91, 'G': 0.91},
    3: {'A': 0.94, 'G': 0.94, 'T': 0.94},
    4: {'A': 0.94, 'G': 0.94, 'C': 0.94},
    5: {'G': 0.56, 'C': 1, 'A': 1},
    8: {'G': 0.44},
    9: {'C': 1, 'G': 1, 'T': 1},
    17: {'G': 0.44},
    26: {'G': 0.44},
}

In [105]:
# Score every record of the aligned right-flank FASTA against the `right` table.
dict_right = {'Score_Right': []}
for record in SeqIO.parse("RIGHT_UNI_FASTA.fa", "fasta"):
    sc = record.seq
    score = 0
    for position, weights in right.items():
        # A nucleotide without an entry at this position contributes nothing.
        score += weights.get(sc[position], 0)
    dict_right['Score_Right'].append(score)

In [51]:
# Score table for the sequence before repeats.
# Layout: {0-based position in the aligned sequence: {nucleotide: score}}.
left = {
    0: {'G': 1},
    1: {'C': 0.60, 'A': 0.3},
    2: {'A': 0.30, 'G': 0.60},
    3: {'A': 0.30, 'G': 0.60},
    4: {'C': 0.5},
    5: {'G': 0.63},
    6: {'G': 0.55},
    7: {'C': 0.44},
    8: {'G': 0.56},
    9: {'G': 0.9, 'A': 0.9},
    10: {'C': 0.6, 'T': 0.6},
    12: {'G': 0.45},
    15: {'G': 0.45},
    17: {'T': 0.45},
    21: {'G': 0.45},
}

In [106]:
# Score every record of the aligned left-flank FASTA against the `left` table.
dict_left = {'Score_Left': []}
for record in SeqIO.parse("LEFT_UNI_FASTA.fa", "fasta"):
    sc = record.seq
    score = 0
    for position, weights in left.items():
        # A nucleotide without an entry at this position contributes nothing.
        score += weights.get(sc[position], 0)
    dict_left['Score_Left'].append(score)

In [107]:
# Put the left and right scores side by side, one row per FASTA record.
lft, rgt = pd.DataFrame(dict_left), pd.DataFrame(dict_right)
scores = pd.concat([lft, rgt], axis=1)
# Give the ORF table the same index as the scores so the column-wise concat
# pairs rows by position; this raises if the two tables differ in length.
df_final.index = scores.index
df_final = pd.concat([df_final, scores], axis=1)

In [108]:
# Get the final file: write the ORF table, including the appended score
# columns, to CSV (the index is written as the first column).
df_final.to_csv('with_scores.csv')