In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from Bio import SeqIO

In [2]:
# SeqIO.parse returns a single-use iterator. Materialize the records into lists
# so that the cells below can be re-run without silently iterating over an
# already-exhausted parser (which would yield an empty protein_dict).
fasta_seq = list(SeqIO.parse('../mobidb_validation/generate_fastas/out/allseq.fasta', 'fasta'))
fasta_disorder = list(SeqIO.parse('../mobidb_validation/generate_fastas/out/alldisorder.fasta', 'fasta'))

In [3]:
# Build protein_dict, keyed by accession (the part of the FASTA record id
# before the first "|"). After both loops each value is a list:
#   [amino_acid_sequence, disorder_string, disorder_record_description]

protein_dict = {}
for protein in fasta_seq:
    protein_dict[protein.id.split("|")[0]] = [str(protein.seq)]

for protein in fasta_disorder:
    accession = protein.id.split("|")[0]
    if accession not in protein_dict:
        # Without this check the lookup returns None and the concatenation
        # below fails with an opaque "NoneType + list" TypeError.
        raise KeyError(
            "disorder record %r has no matching sequence record" % protein.id
        )
    protein_dict[accession] = protein_dict[accession] + [str(protein.seq), protein.description]

In [4]:
# Load sequences (into dictionary)
# Load corresponding labels (into dictionary)
# EXTRACT DATA
# make empty list to store sequence + context + header + labels (tuple)
# for each sequence
    # Pull out the labels
    # Find disordered segments within certain length threshold
    # For each segment in that protein
        # Check for running off the edge of the sequence
            # Pull out the corresponding sequence + context on either side
            # Append sequence + context + labels + header + region indices (tuple)
# OUTPUT INTO FILE

In [5]:
# Output FASTA handles. They are written to and closed by the extraction cell
# below, so this cell must be re-run before that cell is re-run.
labels_file = open("label.fasta", mode="w+")
unmasked_seq_file = open("unmasked_seq_file.fasta", mode="w+")

In [6]:
# Extract fixed-length windows centred on disordered segments.
#
# For every run of '1' characters in a protein's disorder string whose length
# lies in [dis_lower_limit, dis_upper_limit], take the segment plus symmetric
# context so that the window is exactly len_residue long, and write the window's
# labels and amino-acid sequence as FASTA records to the two output files.
# Windows that would run off either end of the protein are skipped.
dis_lower_limit = 30   # shortest disordered segment to keep (inclusive)
dis_upper_limit = 90   # longest disordered segment to keep (inclusive)
len_residue = 180      # total window length: segment + context on both sides
fasta_line_width = 80  # characters per FASTA body line


def _wrap_fasta(text, width=fasta_line_width):
    """Break `text` into newline-joined chunks of at most `width` characters."""
    return '\n'.join(text[i:i + width] for i in range(0, len(text), width))


try:
    for protein_id in protein_dict:
        entry = protein_dict[protein_id]
        aa_sequence = entry[0]
        label = entry[1]

        # Boolean mask of disordered positions -> one 1-D slice per contiguous run.
        dis_labels = [s == '1' for s in label]
        slices = ndimage.find_objects(ndimage.label(dis_labels)[0])

        for s in slices:
            region = s[0]
            len_seg = region.stop - region.start

            # Keep only disordered regions of the right length.
            if not (dis_lower_limit <= len_seg <= dis_upper_limit):
                continue

            # Split the padding evenly; an odd leftover residue goes on the right.
            padding = len_residue - len_seg
            len_context = padding // 2
            len_remainder = padding % 2

            start_ind = region.start - len_context
            end_ind = region.stop + len_context + len_remainder

            # Explicit edge check. A negative start would otherwise wrap around
            # in Python slicing; an end past the protein would give a short window.
            if start_ind < 0 or end_ind > len(label):
                continue

            output_labels = label[start_ind:end_ind]
            output_aaseq = aa_sequence[start_ind:end_ind]

            # The header carries the exact half-open [start:end) window that was
            # written, including the odd remainder residue.
            header = ">" + entry[2] + "|" + str(start_ind) + ":" + str(end_ind) + "\n"

            labels_file.write(header + _wrap_fasta(output_labels) + "\n")
            # The header must end with a newline so the sequence starts on its own line.
            unmasked_seq_file.write(header + _wrap_fasta(output_aaseq) + "\n")
finally:
    # Always release the handles, even if a record fails part-way through.
    labels_file.close()
    unmasked_seq_file.close()