# **Data preparation**

## **Generating non redundant datasets**

In [4]:
import numpy as np
import pandas as pd
import os

In [5]:
def filter_redundancy(unique_ids, total_dataset):
  base, ext = os.path.splitext(total_dataset)
  output_file = f"{base}_non_redundant{ext}"
  with open(unique_ids, 'r') as clustered_pos, open(total_dataset, 'r') as tot_pos, open(output_file, 'w') as non_redundant_dataset:
    unique_ids = [name.rstrip() for name in clustered_pos]
    non_redundant_dataset.write(
        "".join(
            line.rstrip() + "\n"
            for line in tot_pos
            if line.split('\t')[0] in unique_ids
        ).rstrip("\n")
    )
  return output_file

non_redundant_pos = filter_redundancy('uniq.pos.tsv', 'positive.tsv')
non_redundant_neg = filter_redundancy('uniq.neg.tsv', 'negative.tsv')

## **Dividing in train and test set**

In [6]:
import random

In [7]:
def training_test_split(file):
  with open(file, 'r') as dataset:
    lines = [line.rstrip() for line in dataset]
    random.seed(50)
    random.shuffle(lines)
    n = 80*len(lines)//100
    training_set = lines[:n]
    test_set = lines[n:]
  return training_set, test_set

with open('training_set.tsv', 'w') as training_set, open('test_set.tsv', 'w') as test_set:
  training_set_n, test_set_n = training_test_split('negative_non_redundant.tsv')
  training_set_p, test_set_p = training_test_split('positive_non_redundant.tsv')
  training = training_set_p + training_set_n
  test = test_set_p + test_set_n
  for i in range(len(training)):
    if i < len(training)-1:
      training_set.write(training[i] + '\n')
    else:
      training_set.write(training[i])
  for i in range(len(test)):
    if i < len(test)-1:
      test_set.write(test[i] + '\n')
    else:
      test_set.write(test[i])


In [8]:
def create_validation_sets(total_training):
    n = int((20*len(total_training))/100)
    validation_1 = total_training[:n]
    validation_2 = total_training[n:2*n]
    validation_3 = total_training[2*n:3*n]
    validation_4 = total_training[3*n:4*n]
    validation_5 = total_training[4*n:]
    return validation_1, validation_2, validation_3, validation_4, validation_5

p_validation_1, p_validation_2, p_validation_3, p_validation_4, p_validation_5 = create_validation_sets(training_set_p)
n_validation_1, n_validation_2, n_validation_3, n_validation_4, n_validation_5 = create_validation_sets(training_set_n)
validation_1 = p_validation_1 + n_validation_1
validation_2 = p_validation_2 + n_validation_2
validation_3 = p_validation_3 + n_validation_3
validation_4 = p_validation_4 + n_validation_4
validation_5 = p_validation_5 + n_validation_5

In [9]:
validations = [validation_1, validation_2, validation_3, validation_4, validation_5]

for i, vset in enumerate(validations, start=1):
    filename = f"validation_{i}.tsv"
    with open(filename, "w") as f:
        for x in range(len(vset)):
            if x < len(vset)-1:
              f.write(vset[x] + '\n')
            else:
              f.write(vset[x])


In [10]:
training_df = pd.read_csv('training_set.tsv', sep='\t', header=None)
training_list=training_df[0].tolist()
training_df[5]=0

In [11]:
for i, vset in enumerate(validations, start=1):
  for elem in training_list:
    for el in range(len(vset)):
      if elem == vset[el].split("\t")[0]:
        training_df.loc[training_df[0] == elem, 5] = f"Validation_{i}"
training_df



  training_df.loc[training_df[0] == elem, 5] = f"Validation_{i}"


Unnamed: 0,0,1,2,3,4,5
0,Q03383,Bombyx mori,Metazoa,400,16,Validation_1
1,Q61400,Mus musculus,Metazoa,265,33,Validation_1
2,Q9BZM5,Homo sapiens,Metazoa,246,25,Validation_1
3,P13204,Bos taurus,Metazoa,129,26,Validation_1
4,Q27085,Tachypleus tridentatus,Metazoa,418,24,Validation_1
...,...,...,...,...,...,...
8016,P38210,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,180,False,Validation_5
8017,O74356,Schizosaccharomyces pombe (strain 972 / ATCC 2...,Fungi,585,False,Validation_5
8018,Q6IEG0,Homo sapiens,Metazoa,339,False,Validation_5
8019,F4J284,Arabidopsis thaliana,Viridiplantae,860,False,Validation_5


In [12]:
training_df.to_csv("training_complete.tsv", sep="\t", index=False)