# **Data preparation**

## **Generating non-redundant datasets**

In [2]:
import numpy as np
import pandas as pd
import os

In [7]:
def filter_redundancy(unique_ids, total_dataset):
    """Write the rows of a TSV dataset whose ID appears in a file of kept IDs.

    The result is written next to ``total_dataset``, with ``_non_redundant``
    inserted before the extension (``positive.tsv`` becomes
    ``positive_non_redundant.tsv``). Rows are separated by newlines and the
    file has no trailing newline.

    Args:
        unique_ids: Path to a text file holding one ID per line.
        total_dataset: Path to the tab-separated dataset. The text before the
            first tab of each row is compared against the IDs.

    Returns:
        The path of the filtered file that was written.
    """
    base, ext = os.path.splitext(total_dataset)
    output_file = f"{base}_non_redundant{ext}"
    with open(unique_ids, 'r') as clustered_ids, \
            open(total_dataset, 'r') as full_dataset, \
            open(output_file, 'w') as non_redundant_dataset:
        # A set makes each membership test O(1); scanning a list for every
        # dataset row would cost O(number of IDs) per row.
        kept_ids = {name.rstrip() for name in clustered_ids}
        # NOTE: the key is taken from the raw line, so a row without any tab
        # still carries its line terminator and will not match a stripped ID.
        kept_rows = [
            row.rstrip()
            for row in full_dataset
            if row.split('\t')[0] in kept_ids
        ]
        non_redundant_dataset.write("\n".join(kept_rows).rstrip("\n"))
    return output_file

# Filter the positive and negative datasets down to the IDs listed in the
# corresponding "uniq" files; each call returns the path it wrote.
non_redundant_pos = filter_redundancy(
    unique_ids='uniq.pos.tsv',
    total_dataset='positive.tsv',
)
non_redundant_neg = filter_redundancy(
    unique_ids='uniq.neg.tsv',
    total_dataset='negative.tsv',
)

## **Splitting into training and test sets**

In [10]:
import random

In [14]:
def training_test_split(file, train_percent=80, seed=50):
    """Shuffle the lines of a file and split them into training and test sets.

    Args:
        file: Path to a text file with one example per line.
        train_percent: Integer percentage (0-100) of the lines assigned to the
            training set; the remaining lines form the test set.
        seed: Seed used for the shuffle, so the split is reproducible.

    Returns:
        A ``(training_set, test_set)`` tuple of lists of lines, each line with
        its trailing whitespace stripped.

    Raises:
        ValueError: If ``train_percent`` is outside the range 0-100.

    Note:
        This reseeds the module-level ``random`` generator, which affects any
        later use of ``random`` in the same session.
    """
    if not 0 <= train_percent <= 100:
        raise ValueError("train_percent must be between 0 and 100")
    with open(file, 'r') as dataset:
        lines = [line.rstrip() for line in dataset]
    random.seed(seed)
    random.shuffle(lines)
    # Integer arithmetic keeps the cut point exact (no float rounding).
    n = train_percent * len(lines) // 100
    return lines[:n], lines[n:]

# Split each class on its own, so positives and negatives each contribute the
# same share of their examples to the training and test sets.
training_set_n, test_set_n = training_test_split('negative_non_redundant.tsv')
training_set_p, test_set_p = training_test_split('positive_non_redundant.tsv')
training = training_set_p + training_set_n
test = test_set_p + test_set_n

# Open the outputs only after both splits succeeded, so a missing or unreadable
# input does not leave truncated output files behind.
with open('training_set.tsv', 'w') as training_set, open('test_set.tsv', 'w') as test_set:
    # Lines are newline-separated with no trailing newline at end of file.
    training_set.write('\n'.join(training))
    test_set.write('\n'.join(test))


In [21]:
def create_validation_sets(total_training, n_folds=5):
    """Split a sequence into consecutive, non-overlapping validation subsets.

    Every subset except the last holds ``len(total_training) // n_folds``
    items; the last one also receives the leftover items, so no item is
    dropped. The input order is preserved (no shuffling happens here).

    Args:
        total_training: Sliceable sequence of examples (e.g. a list of lines).
        n_folds: Number of subsets to produce. Defaults to 5.

    Returns:
        A tuple of ``n_folds`` slices of ``total_training``.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")
    # Integer division avoids going through floats for a size computation.
    fold_size = len(total_training) // n_folds
    folds = [
        total_training[i * fold_size:(i + 1) * fold_size]
        for i in range(n_folds - 1)
    ]
    # The final fold is open-ended so it absorbs the remainder.
    folds.append(total_training[(n_folds - 1) * fold_size:])
    return tuple(folds)

# Build the validation subsets per class, then pair them up so that subset k
# is the positive subset k followed by the negative subset k.
positive_folds = create_validation_sets(training_set_p)
negative_folds = create_validation_sets(training_set_n)

(p_validation_1, p_validation_2, p_validation_3,
 p_validation_4, p_validation_5) = positive_folds
(n_validation_1, n_validation_2, n_validation_3,
 n_validation_4, n_validation_5) = negative_folds

(validation_1, validation_2, validation_3,
 validation_4, validation_5) = [
    pos_fold + neg_fold
    for pos_fold, neg_fold in zip(positive_folds, negative_folds)
]