In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from Bio import SeqIO
import os
import subprocess
import shutil
from parse_clusters import parse_clusters
from sequence_cleaning import remove_X, replace_amino_acids

In [None]:
# Cleaning step: stream the raw FASTA records through remove_X and save the result.
# remove_X is a project helper (sequence_cleaning); presumably it drops every
# record whose sequence contains "X" -- confirm against its implementation.
raw_fasta_path = "../datasets/InterProUniprotPF03272.fasta"
xremoved_fasta_path = "../datasets/InterProUniprotPF03272_Xremoved.fasta"

dataset = SeqIO.parse(raw_fasta_path, "fasta")
dataset_xremoved = remove_X(dataset)

with open(xremoved_fasta_path, 'w') as f:
    SeqIO.write(dataset_xremoved, f, "fasta")

In [6]:
# Cluster the dataset by running the project's clustering.sh script with bash.
# The script's stdout is discarded through subprocess.DEVNULL, so no separate
# handle on os.devnull has to be opened (and then remembered to be closed).
clustering_status = subprocess.call(
    ['bash', './clustering.sh'],
    stdout=subprocess.DEVNULL,
)

# Fail fast: a non-zero exit status means the script reported an error, and the
# later cells should not run against missing or stale clustering output.
if clustering_status != 0:
    raise RuntimeError(
        "clustering.sh exited with status {}".format(clustering_status)
    )

# Keep the exit status as the cell's displayed output.
clustering_status

0

In [9]:
# Split the cluster representatives into train / validation / test sets.
clusters = parse_clusters("../datasets/clustered/InterProUniprotPF03272_95_all_seqs.fasta")

# Every cluster contributes only its first member.
representatives = [cluster.members[0] for cluster in clusters]

random_seed = 123456

# Both splits shuffle with the same fixed seed.
shared_split_options = dict(shuffle=True, random_state=random_seed)

# Step 1: hold out 10% of the representatives as the test set.
train_val_set, test_set = train_test_split(
    representatives,
    test_size=0.1,
    **shared_split_options
)

# Step 2: take 1/9 of the remaining ~90% for validation, i.e. roughly 10% of
# all representatives, which leaves about 80% for training.
train_set, validation_set = train_test_split(
    train_val_set,
    test_size=1/9,
    **shared_split_options
)

In [10]:
# Replace ambiguous amino acids in every split, then save each split as FASTA.
# replace_amino_acids is a project helper (sequence_cleaning); it is handed the
# shared random_seed -- presumably to make the replacements reproducible; confirm.
prepared_train_set = replace_amino_acids(random_seed, train_set)
prepared_validation_set = replace_amino_acids(random_seed, validation_set)
prepared_test_set = replace_amino_acids(random_seed, test_set)

# (output path, records) pairs, written in train -> validation -> test order.
split_outputs = [
    ("../datasets/InterProUniprotPF03272_95_train.fasta", prepared_train_set),
    ("../datasets/InterProUniprotPF03272_95_validation.fasta", prepared_validation_set),
    ("../datasets/InterProUniprotPF03272_95_test.fasta", prepared_test_set),
]

for output_path, records in split_outputs:
    with open(output_path, "w") as handle:
        SeqIO.write(records, handle, "fasta")

In [None]:
# Cleanup: delete the intermediate directories once the final FASTA files exist.
# "../datasets/clustered" holds the clustering output read by the split cell;
# "./tmp" is presumably scratch space left behind by clustering.sh -- confirm.
for intermediate_dir in ("../datasets/clustered", "./tmp"):
    shutil.rmtree(intermediate_dir)