In [15]:
import math
import os
import random

import Bio
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.model_selection import train_test_split

In [2]:
# Input file locations (relative paths)
_OUTLIER_OUT_DIR = "../remove_outliers/out"
_CLUSTER_OUT_DIR = "../cluster_seqs/out"

# FASTA file holding the amino-acid sequences
allseq_path = f"{_OUTLIER_OUT_DIR}/allseq.fasta"

# FASTA file holding the per-residue disorder strings
alldisorder_path = f"{_OUTLIER_OUT_DIR}/alldisorder.fasta"

# Raw cluster output
cluster_path = f"{_CLUSTER_OUT_DIR}/allseq.clstr"

In [3]:
# Scan the raw cluster output and keep the accession from every line that
# carries a "*" marker. The accession is the text between ">" and the first "|".
rep_proteins = []
with open(cluster_path) as cluster_file:
    rep_proteins.extend(
        row.split("|")[0].split(">")[1]
        for row in cluster_file
        if "*" in row
    )

In [4]:
# Parse both FASTA files with biopython and materialise the records as lists
seq_fasta = [record for record in SeqIO.parse(allseq_path, "fasta")]
disorder_fasta = [record for record in SeqIO.parse(alldisorder_path, "fasta")]

In [6]:
# Build a lookup of every protein, keyed by accession
# (key, value) = ("Q9UPN6", ["Q9UPN6|sequence|SR-rel...","MEAVKTFNSELYSLND...", "100000000000..."])

# Pass 1: store [description, sequence] for each record of the sequence FASTA.
# The accession is the part of the description before the first "|".
all_protein_dict = {}
for record in seq_fasta:
    header = record.description
    accession = header.split("|")[0]
    all_protein_dict[accession] = [header, str(record.seq)]

# Pass 2: append the disorder string to the entry with the same accession
for record in disorder_fasta:
    accession = record.description.split("|")[0]
    disorder_string = str(record.seq)
    all_protein_dict[accession].append(disorder_string)

In [7]:
# Create dict containing only info for representative proteins.
# `rep_proteins` is a list, so `key in rep_proteins` would rescan it for every
# protein; a set gives constant-time membership tests with the same result.
rep_accessions = set(rep_proteins)
rep_protein_dict = {
    accession: protein_info
    for accession, protein_info in all_protein_dict.items()
    if accession in rep_accessions
}

In [8]:
# Display the number of entries in rep_protein_dict
len(rep_protein_dict)

21772

In [9]:
# rep_protein_dict maps each accession to its [description, sequence, disorder] record.
# Drop the keys and keep the records in a plain list, ready for shuffling and slicing.
rep_protein_list = [protein_info for protein_info in rep_protein_dict.values()]
rep_protein_list[1]

['Q92ZS5|sequence OS=Rhizobium meliloti (strain 1021) (Ensifer meliloti) (Sinorhizobiumeliloti) OX=266834',
 'MSDRVKKIESFTLTLPRETPYLGKPRPGEEPNGRGYLVRKANRTVYPTFDRSVLVRIETENGAVGWGETYGLVAPRATMEIIDDLLADFTIGRDPFDAAAIHDDLYDLMRVRGYTGGFYVDALAAIDIALWDLAGKLAGLPVCKLLGGQRRDRIAAYISGLPEDTRAKRAELAAAWQAKGFSSFKFASPVADDGVAKEMEILRERLGPAVRIACDMHWAHTASEAVALIKAMEPHGLWFAEAPVRTEDIDGLARVAASVSTAIAVGEEWRTVHDMVPRVARRALAIVQPEMGHKGITQFMRIGAYAHVHHIKVIPHATIGAGIFLAASLQASAALANVDCHEFQHSIFEPNRRLLVGDMDCLNGEYVVPTGPGLGVEPSKEAQGLLKKH',
 '10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000']

In [12]:
# Data shuffling and splitting
SPLIT_SEED = 1        # fixed seed so the shuffle is repeatable
TRAIN_FRACTION = 0.8
TEST_FRACTION = 0.1   # validation receives whatever is left over

# Shuffle a copy rather than rep_protein_list itself. An in-place shuffle would
# reorder the source list, so re-running this cell would shuffle an already
# shuffled list and silently produce a different split each time.
random.seed(SPLIT_SEED)
shuffled_proteins = list(rep_protein_list)
random.shuffle(shuffled_proteins)

# Extract by index
train_length = math.ceil(TRAIN_FRACTION * len(shuffled_proteins))
test_length = math.ceil(TEST_FRACTION * len(shuffled_proteins))

train = shuffled_proteins[:train_length]
test = shuffled_proteins[train_length:train_length + test_length]
validation = shuffled_proteins[train_length + test_length:]  # Validation gets remainder if split is not even

# Check by length
print(len(train), len(test), len(validation), len(train)+len(test)+len(validation) == len(rep_protein_list))

17418 2178 2176 True


In [13]:
# Spot-check: show element 0 (the description) of the second validation record
validation[1][0]

'Q9N0N8|sequence|T-cell-specific surface glycoprotein CD28 OS=Canis lupus familiaris (Dog) (Canis familiaris) OX=9615 GN=CD28'

In [16]:
def _write_fasta(path, records, field):
    """Write `records` to `path` as FASTA text.

    Args:
        path: Destination file; it is opened in "w" mode, so existing
            content is overwritten.
        records: Iterable of indexable records. Item 0 of each record is
            written after ">" as the header line.
        field: Index of the record item written on the line below the header.
    """
    with open(path, "w") as handle:
        for record in records:
            handle.write(">" + record[0] + "\n" + record[field] + "\n")


# Create the output directory; exist_ok avoids the separate exists-then-create
# check and lets the cell be re-run safely.
os.makedirs('out/', exist_ok=True)

# Compile all AA sequences (record item 1) into one fasta file per split
_write_fasta("out/val_as_fasta.fasta", validation, 1)
_write_fasta("out/train_as_fasta.fasta", train, 1)
_write_fasta("out/test_as_fasta.fasta", test, 1)

In [18]:
# Compile all binary disorder sequences into a fasta file
validation_labels = open("out/val_labels_as_fasta.fasta", "w")
with open("out/val_labels_as_fasta.fasta", "w") as validation_labels:
    for i in range(len(validation)):
        validation_labels.write(">" + validation[i][0] + "\n" + validation[i][2] + "\n")

# Repeat for train 
with open("out/train_labels_as_fasta.fasta", "w") as train_labels:
    for i in range(len(train)):
        train_labels.write(">" + train[i][0] + "\n" + train[i][2] + "\n")

# Repeat for test
with open("out/test_labels_as_fasta.fasta", "w") as test_labels:
    for i in range(len(test)):
        test_labels.write(">" + test[i][0] + "\n" + test[i][2] + "\n")