In [1]:
import Bio 
from Bio import SeqIO
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math

In [2]:
#input and output locations used by the rest of the notebook

#inputs: sequence fasta, disorder fasta and the raw cluster output
allseq_loc = "../MobiDB Data/Processed/allseq.fasta"
alldisorder_loc = "../MobiDB Data/Processed/alldisorder.fasta"
cluster_loc = "../MobiDB Data/Clustered/1613964195.fas.1.clstr"

#output: folder that receives the generated fasta files
save_loc = "../MobiDB Data/Split/"

#echo every location so the configuration is visible in the cell output
print(f"allseq.fasta file location: '{allseq_loc}'")
print(f"alldisorder.fasta file location: '{alldisorder_loc}'")
print(f"1613964195.fas.1.clstr file location: '{cluster_loc}'")
print(f"save location of fasta files: '{save_loc}'")

allseq.fasta file location: '../MobiDB Data/Processed/allseq.fasta'
alldisorder.fasta file location: '../MobiDB Data/Processed/alldisorder.fasta'
1613964195.fas.1.clstr file location: '../MobiDB Data/Clustered/1613964195.fas.1.clstr'
save location of fasta files: '../MobiDB Data/Split/'


In [3]:
#load the whole cluster file into memory as a single string
with open(cluster_loc) as cluster_handle:
    cluster_text = cluster_handle.read()

In [4]:
#convert raw cluster output string into list of lines
#(splitting on "\n" leaves a trailing empty string when the text ends with a newline)
cluster_text_lines = cluster_text.split("\n")

In [5]:
#collect the accession of every cluster line that carries a "*" marker
#the accession is the text between the first ">" and the first "|" on the line
rep_proteins = [
    cluster_line.split("|")[0].split(">")[1]
    for cluster_line in cluster_text_lines
    if "*" in cluster_line
]

In [6]:
#load in fasta files with biopython
#NOTE(review): SeqIO.parse presumably returns one-shot iterators - they are
#looped over once in the next cell, so re-run this cell before re-running that one
seq_fasta = SeqIO.parse(allseq_loc, 'fasta')
disorder_fasta = SeqIO.parse(alldisorder_loc, 'fasta')

In [7]:
#materialise both parsed fastas as lists of records, keeping file order
seq_fasta_lst = list(seq_fasta)
disorder_fasta_lst = list(disorder_fasta)

In [8]:
#build one lookup table holding everything known about each protein
#(key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVKTFNSELYSLND...", "100000000000..."])

#step 1: accession -> [description, sequence]
#the accession is the part of the description before the first "|"
all_protein_dict = {
    record.description.split("|")[0]: [record.description, str(record.seq)]
    for record in seq_fasta_lst
}

#step 2: append the disorder string to the matching entry
#(an accession that has no sequence entry raises KeyError here)
for record in disorder_fasta_lst:
    accession = record.description.split("|")[0]
    all_protein_dict[accession].append(str(record.seq))

In [9]:
#create dict containing only info for representative proteins

#rep_proteins is a list, so testing membership against it directly would scan
#the whole list once per protein; a set gives constant-time lookups instead
rep_protein_set = set(rep_proteins)

#keep the entries (same value objects, same order) whose accession is representative
rep_protein_dict = {
    protein_acc: protein_info
    for protein_acc, protein_info in all_protein_dict.items()
    if protein_acc in rep_protein_set
}

In [10]:
#sanity check: how many representative proteins were kept
len(rep_protein_dict)

15957

In [11]:
#data are now in a dictionary format: accession -> [description, sequence, disorder labels]
#convert data into a dataframe (one row per accession) and then an array for shuffling and splitting
seq_df=pd.DataFrame.from_dict(rep_protein_dict,orient='index')
#convert data into an array
seq_as_array = seq_df.to_numpy()
#format of the array: ([info,sequence,labels],[info,sequence,labels]...)
#display one row to confirm the layout
seq_as_array[1]

array(['H0USY8|sequence OS=Ancylostoma ceylanicum OX=53326 GN=daf-12',
       'GSYQLNAAELQALDLIQEAFKGMNDPMEQGRQATSFLKNEKSPADIMNIMDVTMRRFVKMAKRLPAFNDLSQDGKFALLKGGMIEMLTVRGVRRFDSSSGSWTTPTLGESSEVSINMFDQLNADVRSEQKMRFLQFFKIFHEDIRSNDLVISMIMLIVLFSPRDSITDPEDRRIIARHHEQFSALLNRYLESLYGDDAHQLNEQLPTALRMLREISASSGMLFLGTVNTSEAEPLPREFFKVE',
       '100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'],
      dtype=object)

In [12]:
#Data Shuffling and Splitting
#start from an independent copy of the data: a plain to_numpy() may hand back a
#view on seq_df, and shuffling a view in place would scramble seq_df itself
#(so re-running this cell would give a different split) or fail outright when
#pandas returns a read-only view (Copy-on-Write)
seq_as_array = seq_df.to_numpy(copy=True)
#set random seed for repeatability and shuffle the rows in place
np.random.seed(1)
np.random.shuffle(seq_as_array)
#np.split cuts *before* each given index, so the two values below are the size
#of the train set and the combined size of train + validation
#NOTE: the "-1" leaves the train set one row short of ceil(80%); it is kept
#unchanged so that previously generated split files stay reproducible
train_set_length = math.ceil(0.8*len(seq_as_array))-1
validation_length = train_set_length + math.ceil(0.1*len(seq_as_array))
train,validation, test = np.split(seq_as_array,[train_set_length,validation_length])
#check by length: the three parts must add up to the full data set
print(len(train),len(validation),len(test),len(train)+len(validation)+len(test)== len(seq_as_array))

12765 1596 1596 True


In [13]:
#spot check: description line of the second entry in the validation set
validation[1][0]

'Q57W60|sequence OS=Trypanosoma brucei brucei (strain 927/4 GUTat10.1) OX=185431'

In [14]:
#compile all AA sequences into one fasta file per split (validation, train, test)
#each row is [description, sequence, labels]; column 1 is the AA sequence
#"with" closes every file even if a write fails part-way through
for split_name, split_rows in (("val", validation), ("train", train), ("test", test)):
    with open(save_loc + split_name + "_as_fasta.fasta", "w") as fasta_out:
        for row in split_rows:
            #plain string concatenation on purpose: a non-string field raises
            #TypeError instead of being silently written out as text
            fasta_out.write(">" + row[0] + "\n" + row[1] + "\n")

In [15]:
#compile all binary disorder sequences into one fasta file per split (validation, train, test)
#each row is [description, sequence, labels]; column 2 is the disorder string
#"with" closes every file even if a write fails part-way through
for split_name, split_rows in (("val", validation), ("train", train), ("test", test)):
    with open(save_loc + split_name + "_labels_as_fasta.fasta", "w") as labels_out:
        for row in split_rows:
            #plain string concatenation on purpose: a missing (non-string) label
            #raises TypeError instead of being silently written out as text
            labels_out.write(">" + row[0] + "\n" + row[2] + "\n")