In [None]:
#SynMap2GCcontent
##Step 1 Importing FASTA files

####Import library
import os
from itertools import chain, repeat
import numpy as np
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Figure, Layout
init_notebook_mode(connected=True)


####Global variables
# Accumulators filled while the input file is parsed further down:
# `header` receives the entries marked with ">", `nucleotides` everything else.
header, nucleotides = [], []
# Divisor applied to each sequence length to obtain the window size.
divide_seq_in = 400


####File input/path
# Other input files used during development:
#   'Plasmodium_chabaudi_chabaudi_strain_AS.faa'
#   'Plasmodium__test.faa'
def _parse_fasta(text):
    """Split FASTA-formatted text into header lines and sequences.

    A line whose first non-blank character is ">" starts a new record and is
    stored whole (description words included). Every other non-blank line is
    sequence data: its whitespace is removed and consecutive lines are joined,
    so a sequence wrapped over several lines comes back as a single string.

    Returns a ``(headers, sequences)`` tuple of two lists of strings. A record
    without sequence lines contributes a header but no sequence, so the two
    lists are index-aligned only when every record carries sequence data.
    """
    headers = []
    sequences = []
    pieces = []                                   # lines of the record being read
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            # A new header closes the previous record, if it had any sequence.
            if pieces:
                sequences.append("".join(pieces))
                pieces = []
            headers.append(line)
        else:
            pieces.append("".join(line.split()))
    if pieces:                                    # flush the last record
        sequences.append("".join(pieces))
    return headers, sequences


# `with` closes the file handle as soon as its contents have been read.
with open(os.path.expanduser('Plasmodium_chabaudi_chabaudi_TEST2.faa')) as my_file:
    file_contents = my_file.read()
file_contents_lower = file_contents.lower()                  #Lowercases all strings in file
file_contents_lower_split = file_contents_lower.split()      #Whole file split into words (not used for parsing)


##Step 2 Separating the file contents into two lists
# Parsing is line-based rather than word-based so that description words in a
# header line are not mistaken for sequences and wrapped sequences stay whole.
parsed_headers, parsed_sequences = _parse_fasta(file_contents_lower)
header.extend(parsed_headers)
nucleotides.extend(parsed_sequences)

In [None]:
def window_size(seq, divisions=None):
    """Return the window length used to cut ``seq`` into fragments.

    The window is ``len(seq) // divisions`` and never smaller than 1, so a
    sequence shorter than ``divisions`` still gets a usable (one-character)
    window instead of a zero step.

    Parameters:
        seq: the sequence (anything supporting ``len``).
        divisions: divisor for the sequence length; defaults to the global
            ``divide_seq_in``.

    Returns:
        The window length as an int >= 1.

    Note: because the division is floored, cutting ``seq`` with this window
    can yield more than ``divisions`` fragments.
    """
    if divisions is None:
        divisions = divide_seq_in
    # Floor division keeps the arithmetic in integers on every Python version.
    return max(1, len(seq) // divisions)

In [None]:
def seq_split(seq, win_size=None):
    """Cut ``seq`` into consecutive fragments of ``win_size`` characters.

    Parameters:
        seq: the sequence string to cut.
        win_size: fragment length; when omitted it is taken from
            ``window_size(seq)``. Values below 1 are raised to 1, because a
            zero step cannot be used to walk the sequence.

    Returns:
        A list holding a single list of fragments (the nesting callers
        already iterate over). The last fragment may be shorter than
        ``win_size``; an empty ``seq`` yields ``[[]]``.
    """
    if win_size is None:
        win_size = window_size(seq)
    win_size = max(1, int(win_size))
    fragments = [seq[start:start + win_size] for start in range(0, len(seq), win_size)]
    return [fragments]

In [None]:
def positioner(seq):
    """Return the 1-based positions of ``seq`` as a numpy integer array.

    The result is ``[1, 2, ..., len(seq)]``; an empty sequence gives an empty
    array. The array is built in one step instead of growing a list and
    converting it again for every character.
    """
    return np.arange(1, len(seq) + 1)

In [None]:
def AT_counter(seq):
    """Return the windowed AT percentage of ``seq``, one value per position.

    ``seq`` is cut with ``seq_split``; for each fragment the share of 'a' and
    't' characters (lower case) is computed as a percentage, and that value is
    repeated once per character of the fragment. The returned float array
    therefore has ``len(seq)`` entries.
    """
    # seq_split wraps its fragment list in an outer list; flatten it.
    fragments = list(chain.from_iterable(seq_split(seq)))
    frag_lengths = [len(frag) for frag in fragments]
    # float() on the count keeps this a true division on Python 2 as well.
    frag_percent_at = [
        (float(frag.count('a') + frag.count('t')) / len(frag)) * 100
        for frag in fragments
    ]
    # Expand once, after all fragments are known, rather than per fragment.
    return np.repeat(np.asarray(frag_percent_at, dtype=float),
                     np.asarray(frag_lengths, dtype=int))

In [None]:
def GC_counter(seq):
    """Return the windowed GC percentage of ``seq``, one value per position.

    ``seq`` is cut with ``seq_split``; for each fragment the share of 'g' and
    'c' characters (lower case) is computed as a percentage, and that value is
    repeated once per character of the fragment. The returned float array
    therefore has ``len(seq)`` entries.
    """
    # seq_split wraps its fragment list in an outer list; flatten it.
    fragments = list(chain.from_iterable(seq_split(seq)))
    frag_lengths = [len(frag) for frag in fragments]
    # float() on the count keeps this a true division on Python 2 as well.
    frag_percent_gc = [
        (float(frag.count('g') + frag.count('c')) / len(frag)) * 100
        for frag in fragments
    ]
    # Expand once, after all fragments are known, rather than per fragment.
    return np.repeat(np.asarray(frag_percent_gc, dtype=float),
                     np.asarray(frag_lengths, dtype=int))

In [None]:
def N_counter(seq):
    """Return the windowed N percentage of ``seq``, one value per position.

    ``seq`` is cut with ``seq_split``; for each fragment the share of 'n'
    characters (lower case) is computed as a percentage, and that value is
    repeated once per character of the fragment. The returned float array
    therefore has ``len(seq)`` entries.
    """
    # seq_split wraps its fragment list in an outer list; flatten it.
    fragments = list(chain.from_iterable(seq_split(seq)))
    frag_lengths = [len(frag) for frag in fragments]
    # float() on the count keeps this a true division on Python 2 as well.
    frag_percent_n = [
        (float(frag.count('n')) / len(frag)) * 100
        for frag in fragments
    ]
    # Expand once, after all fragments are known, rather than per fragment.
    return np.repeat(np.asarray(frag_percent_n, dtype=float),
                     np.asarray(frag_lengths, dtype=int))

In [None]:
def X_counter(seq):
    """Return the windowed X percentage of ``seq``, one value per position.

    ``seq`` is cut with ``seq_split``; for each fragment the share of 'x'
    characters (lower case) is computed as a percentage, and that value is
    repeated once per character of the fragment. The returned float array
    therefore has ``len(seq)`` entries.
    """
    # seq_split wraps its fragment list in an outer list; flatten it.
    fragments = list(chain.from_iterable(seq_split(seq)))
    frag_lengths = [len(frag) for frag in fragments]
    # Count 'x' here: counting 'n' would just reproduce the N track.
    # float() on the count keeps this a true division on Python 2 as well.
    frag_percent_x = [
        (float(frag.count('x')) / len(frag)) * 100
        for frag in fragments
    ]
    # Expand once, after all fragments are known, rather than per fragment.
    return np.repeat(np.asarray(frag_percent_x, dtype=float),
                     np.asarray(frag_lengths, dtype=int))

In [None]:
def plotly(nucleo):
    """Plot windowed AT, GC, N and X percentages for each sequence in ``nucleo``.

    For every sequence one figure is rendered with ``iplot``: four line traces
    (one per percentage track) drawn against the 1-based position, with a
    range slider on the x axis for focusing on a position range.

    Parameters:
        nucleo: an iterable of sequence strings; a single string is treated
            as one sequence.

    Returns:
        None.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(nucleo, str):
        nucleo = [nucleo]

    # (trace name, function producing the y values, line colour)
    tracks = [
        ('AT content', AT_counter, 'rgb(3,141,243)'),    #Blue
        ('GC content', GC_counter, 'rgb(64,182,77)'),    #Green
        ('N content', N_counter, 'rgb(243,145,3)'),      #Orange
        ('X content', X_counter, 'rgb(171,3,243)'),      #Purple
    ]

    # Iterate over the argument, not the module-level list, so the function
    # plots exactly what the caller passed in.
    for seq in nucleo:
        pos_arr = positioner(seq)
        data = [
            Scatter(y=counter(seq), x=pos_arr, name=label, line=dict(color=colour))
            for label, counter, colour in tracks
        ]

        layout = dict(xaxis=dict(title='Sequence divided in '+ str(divide_seq_in) +' segments',rangeslider=dict()),yaxis=dict(title='Percentage'))
        fig = dict(data=data, layout=layout)
        iplot(fig)

    return


# Render one figure per sequence collected in `nucleotides`.
plotly(nucleotides)
