In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import gc
import random

In [27]:
random.seed(100)

In [4]:
def convert_protVec_to_map(path='data/quarter_raw/protVec_100d_3grams.csv'):
  """Load the ProtVec trigram embedding table.

  The file is read as tab-separated text. It must contain a 'words' column
  holding the trigram strings; every other column is treated as one
  component of the embedding vector.

  Args:
    path: Location of the tab-separated ProtVec file. Defaults to the path
      this notebook has always used, so existing zero-argument calls behave
      exactly as before.

  Returns:
    A tuple (trigram_to_idx, trigram_vecs): a dict mapping each trigram to
    its row number in the file, and a 2-D array whose row i is the vector of
    the trigram with index i.
  """
  df = pd.read_csv(path, sep='\t')
  trigram_to_idx = {trigram: i for i, trigram in enumerate(df['words'])}
  # Everything except the 'words' column is numeric embedding data.
  trigram_vecs = df.loc[:, df.columns != 'words'].values

  return trigram_to_idx, trigram_vecs

In [5]:
import re

def read_epitopes(path='data/cov_epitopes/epitopes.txt'):
    """Read epitope positions from a text file.

    Each non-blank line is either a single integer or a range written as
    'start-end' (ASCII hyphen or en dash). Duplicates are collapsed.

    Args:
        path: File to read. Defaults to the path this notebook has always
            used, so existing zero-argument calls behave as before.

    Returns:
        A list of the distinct integer positions. The order is whatever
        iteration order the underlying set yields; sort it if order matters.

    Raises:
        ValueError: if a bound on a non-blank line is not an integer.
    """
    epitopes = set()
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (for example an empty last line) previously
                # crashed in int(''); they carry no position, so skip them.
                continue
            bound = re.split(r'-|–', line)
            if len(bound) == 1:
                epitopes.add(int(bound[0]))
            else:
                # NOTE(review): the end bound is excluded (half-open range).
                # Kept as-is because downstream row counts depend on it;
                # confirm whether 'a-b' in the file is meant to include b.
                epitopes.update(range(int(bound[0]), int(bound[1])))

    return list(epitopes)

In [6]:
positions = read_epitopes()

In [7]:
len(positions)

149

In [8]:
with open('data/cov_epitopes/epitopes_sorted.txt', 'w') as f:
    # Persist the epitope positions in ascending order, one per line.
    f.writelines(f'{epitope}\n' for epitope in sorted(positions))

In [9]:
from src.features import build_features

In [10]:
def create_triplet_trigram_dataset(strains_by_year, trigram_to_idx, epitope_positions, file_name):
  """Build the triplet/trigram dataset and write it, with a baseline report, to disk.

  Two files are written:
    file_name + '_baseline.txt': the five metrics returned by
      build_features.get_majority_baselines (accuracy, precision, recall,
      F1-score, Matthews CC), each with three decimals.
    file_name + '.csv': a 'y' column with the labels from
      build_features.make_triplet_labels, followed by one column per year
      except the last (named 0, 1, ...), each holding that year's trigram
      indices.

  Returns nothing.
  """
  triplets = build_features.make_triplet_strains(strains_by_year, epitope_positions)
  trigrams = build_features.split_to_trigrams(triplets)
  idxs = build_features.map_trigrams_to_idxs(trigrams, trigram_to_idx)
  labels = build_features.make_triplet_labels(triplets)

  acc, p, r, f1, mcc = build_features.get_majority_baselines(triplets, labels)
  report_lines = (
    (' Accuracy:\t%.3f\n', acc),
    (' Precision:\t%.3f\n', p),
    (' Recall:\t%.3f\n', r),
    (' F1-score:\t%.3f\n', f1),
    (' Matthews CC:\t%.3f', mcc),
  )
  with open(file_name + '_baseline.txt', 'w') as f:
    for template, value in report_lines:
      f.write(template % value)

  # The last year only supplies the label, so it gets no feature column.
  data_dict = {'y': labels}
  data_dict.update({year: idxs[year] for year in range(len(triplets) - 1)})

  pd.DataFrame(data_dict).to_csv(file_name + '.csv', index=False)

# Copied as is from Tempel code

In [16]:
def make_triplet_strains(strains_by_year, positions):
  """
  Cuts every strain into 'triplet' windows: substrings of 5 amino acids
  (3 overlapping trigrams) centered on each of the given positions.
  Windows that run past either end of the strain are filled with '-'.
  Takes a 2d [year, strain] list of strings and returns a 2d list with one
  entry per year; within a year the windows are ordered strain by strain,
  and position by position inside each strain.
  """
  margin = 2

  def window(strain, center):
    # Index of the last center whose window still fits inside the strain.
    last_full_center = len(strain) - 1 - margin
    if center < margin:
      return '-' * (margin - center) + strain[:center + margin + 1]
    if center > last_full_center:
      return strain[center - margin:] + '-' * (center - last_full_center)
    return strain[center - margin:center + margin + 1]

  return [
    [window(strain, p) for strain in strains_in_year for p in positions]
    for strains_in_year in strains_by_year
  ]

In [None]:
class Trigram:
    """
    Three consecutive amino acids taken from a strain.
    strain_pos is the strain index of the first of those amino acids.
    """
    def __init__(self, amino_acids, strain_pos):
        self.strain_pos = strain_pos
        self.amino_acids = amino_acids

    def contains_position(self, pos):
        """
        Tells whether strain position pos falls inside this trigram, i.e. in
        the span from strain_pos up to, but not including,
        strain_pos + len(amino_acids).
        """
        first = self.strain_pos
        return first <= pos < first + len(self.amino_acids)

    # def __repr__(self):
    #     return [self.strain_pos, self.amino_acids]

In [None]:
def split_to_trigrams(strains_by_year, overlapping=True):
  """
  Breaks every strain into Trigram objects.
  With overlapping=True (the default) consecutive trigrams start one amino
  acid apart; otherwise they start three apart and any leftover amino acids
  at the end are padded with '-' to form one extra trigram.
  The number of regular trigrams per strain is derived from the length of the
  first strain of the first year and applied to every strain.
  Takes a 2d [year, strain] list of strings and returns a
  3d [year, strain, trigram] list of Trigram objects.
  """
  step_size = 1 if overlapping else 3
  reference_length = len(strains_by_year[0][0])
  if overlapping:
    num_of_trigrams = reference_length - 2
  else:
    num_of_trigrams = reference_length // step_size

  def trigrams_of(strain):
    pieces = [
      Trigram(strain[start:start + 3], start)
      for start in (i * step_size for i in range(num_of_trigrams))
    ]
    # Only reachable when step_size is 3: a step of 1 never leaves a remainder.
    leftover = len(strain) % step_size
    if leftover > 0:
      padded_tail = strain[-leftover:] + '-' * (3 - leftover)
      pieces.append(Trigram(padded_tail, len(strain) - leftover))
    return pieces

  return [
    [trigrams_of(strain) for strain in year_strains]
    for year_strains in strains_by_year
  ]

In [None]:
def map_trigrams_to_idxs(nested_trigram_list, trigram_to_idx):
  """
  Converts an arbitrarily nested list of Trigram objects into the same
  nesting of integer indices.
  Each trigram first has its uncertain amino acids replaced (this rewrites
  the Trigram's amino_acids attribute). Trigrams that contain '-' map to a
  dummy index equal to len(trigram_to_idx); all others are looked up in
  trigram_to_idx. Anything that is neither a Trigram nor a list raises
  TypeError.
  """
  dummy_idx = len(trigram_to_idx)

  def to_idx(node):
    if isinstance(node, Trigram):
      node.amino_acids = replace_uncertain_amino_acids(node.amino_acids)
      return dummy_idx if '-' in node.amino_acids else trigram_to_idx[node.amino_acids]

    if isinstance(node, list):
      return [to_idx(child) for child in node]

    raise TypeError('Expected nested list of Trigrams, but encountered {} in recursion.'.format(type(node)))

  return [to_idx(item) for item in nested_trigram_list]

In [11]:
def sample_strains(strains_by_quarter, num_of_samples):
  """
  Randomly picks num_of_samples rows from each period, split as evenly as
  possible across the distinct labels found in column 1 of that period.
  Sampling is done with replacement, using the `random` module.

  Expects a list of 2d arrays whose column 1 holds the label.
  Returns a list with one 2d array of sampled rows per period.

  Every label except the last (in sorted order) receives
  num_of_samples // number_of_labels samples; the last label receives the
  remainder so the total is exactly num_of_samples.
  """
  sampled_strains_by_quarter = []

  for year_strains in strains_by_quarter:
    unique_labels = np.unique(year_strains[:, 1])
    unique_labels_count = np.zeros(len(unique_labels), dtype=np.int64)
    unique_labels_count[:-1] = num_of_samples // len(unique_labels)
    unique_labels_count[-1] = num_of_samples - np.sum(unique_labels_count[:-1])

    strains = []
    # Pair each count with the label VALUE it belongs to. Selecting rows by the
    # enumeration index only works when the labels happen to be 0..n-1.
    for label, count in zip(unique_labels, unique_labels_count):
      if count == 0:
        # An empty pick cannot be stacked with the non-empty ones below.
        continue
      specific_label = year_strains[year_strains[:, 1] == label]
      strains.append(random.choices(specific_label, k=int(count)))
    sampled_strains_by_quarter.append(np.vstack(strains))

  return sampled_strains_by_quarter

In [None]:
def make_triplet_labels(triplet_strains_by_year):
  """
  Builds one label per triplet: 1 when the center amino acid (index 2) of the
  triplet differs between the last year and the year before it, 0 otherwise.
  The number of triplets is taken from the first year.
  Takes a 2d [year, triplet] list of strings and returns a list of ints.
  """
  center = 2
  return [
    int(triplet_strains_by_year[-1][i][center] != triplet_strains_by_year[-2][i][center])
    for i in range(len(triplet_strains_by_year[0]))
  ]

# Copy over

In [11]:
def replace_uncertain_amino_acids(amino_acids):
  """
  Swaps each uncertain amino acid code (B, J, Z, X, _) for a randomly chosen
  concrete amino acid.
  One replacement is drawn per code on every call, whether or not the code
  occurs, and all occurrences of that code receive the same replacement.
  Takes and returns a string.
  """
  candidates_by_code = (
    ('B', 'DN'),
    ('J', 'IL'),
    ('Z', 'EQ'),
    ('X', 'ACDEFGHIKLMNPQRSTVWY'),
    ('_', 'ACDEFGHIKLMNPQRSTVWY'),
  )

  resolved = amino_acids
  for code, candidates in candidates_by_code:
    substitute = random.choice(candidates)
    resolved = resolved.replace(code, substitute)

  return resolved

In [12]:
total_sample = 2000
train_test_split = 0.9

In [13]:
# directory = 'data/quarter/'
directory = 'data/2023 data/'

# Sort numerically on any digits in the name so that e.g. 'quarter_10' comes
# after 'quarter_2'; a plain lexicographic sort would break chronological order
# as soon as there are ten or more files. Non-CSV entries are skipped so they
# are not passed to read_csv.
files = sorted(
   (name for name in os.listdir(directory) if name.endswith('.csv')),
   key=lambda name: [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', name)],
)
strains_by_quarter = []
for file_name in files:
   print(file_name)
   df = pd.read_csv(os.path.join(directory, file_name))
   # df = df[['Sequence', 'label']]
   # to_numpy() already yields one 2-D array per file (rows x columns).
   strains_by_quarter.append(df.to_numpy())

quarter_1_data.csv
quarter_2_data.csv
quarter_3_data.csv
quarter_4_data.csv


In [14]:
df.head(10)

Unnamed: 0,Accession ID,Collection date,Sequence,Month,Pango lineage,label
0,EPI_ISL_18577063,2023-11-15,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,11,HK.3,1
1,EPI_ISL_18577032,2023-11-12,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,11,EG.5.1,1
2,EPI_ISL_18577045,2023-11-09,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,11,JG.3,1
3,EPI_ISL_18577047,2023-11-09,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,11,JG.3,1
4,EPI_ISL_18801623,2023-12-11,MFVFLVLLPLVSSQCVMPLFNLITTTQSYTNSFTRGVYYPDKVFRS...,12,BA.2.86,1
5,EPI_ISL_18403522,2023-10-01,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,10,DV.7.1,1
6,EPI_ISL_18403590,2023-10-02,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,10,GA.4.1,1
7,EPI_ISL_18403562,2023-10-01,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,10,HK.3,1
8,EPI_ISL_18403650,2023-10-01,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,10,XBB.1.16.6,1
9,EPI_ISL_18419898,2023-10-05,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...,10,XCH.1,1


In [15]:
df.label.dtype

dtype('int64')

In [14]:
strains_by_quarter[0]

array([['EPI_ISL_17080322', '2023-02-13',
        'MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLDVYYHENNKSRMESELRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPVNLGRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSSWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFHEVFNATTFASVYAWNRTRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKVSGNYNYLYRLFRKSKLKPFERDISTEIYQAGNKPCNGVAGFNCYSPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVV

In [15]:
strains_by_quarter[0]

array([['EPI_ISL_17080322', '2023-02-13',
        'MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLDVYYHENNKSRMESELRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPVNLGRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSSWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFHEVFNATTFASVYAWNRTRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKVSGNYNYLYRLFRKSKLKPFERDISTEIYQAGNKPCNGVAGFNCYSPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVV

In [28]:
i = 2
# Rows of quarter i whose column-0 value also appears in column 0 of quarter
# i+1; an empty result means the two quarters share no such value.
# np.isin replaces np.in1d, which NumPy has deprecated.
mask = np.isin(strains_by_quarter[i][:,0], strains_by_quarter[i+1][:,0])
strains_by_quarter[i][mask]

array([], shape=(0, 6), dtype=object)

In [18]:
str

str

In [16]:
strains_by_quarter[1][:, 1]

array(['2023-04-13', '2023-04-18', '2023-04-24', ..., '2023-04-19',
       '2023-04-19', '2023-04-19'], dtype=object)

In [17]:
# One entry per quarter: column 2 of each array becomes the strains and
# column 5 the labels (presumably the 'Sequence' and 'label' columns shown by
# df.head() in this notebook - verify if the CSV layout changes).
train_strains = [quarter_samples[:, 2] for quarter_samples in strains_by_quarter]
train_labels = [quarter_samples[:, 5] for quarter_samples in strains_by_quarter]

In [18]:
print(len(train_labels))
print(len(train_strains))

4
4


In [68]:
print(len(train_strains[0]))
print(len(train_labels[0]))

2999
2999


In [19]:
# Replace uncertain amino acid codes in every training strain, in place.
# The entries of train_strains are column slices of the arrays in
# strains_by_quarter, so those arrays are rewritten as well.
for quarter_strains in train_strains:
   for idx in range(len(quarter_strains)):
      quarter_strains[idx] = replace_uncertain_amino_acids(quarter_strains[idx])

In [15]:
# def convert_strains_to_trigram_vec(strains_by_quarter, trigram_idx_map, prot_vec_arr):
#   trigram_vecs_by_quarter = []
#   for idx, strains in enumerate(strains_by_quarter):
#     trigram_vecs_strains = []
#     for strain in strains:
#       prot_vec_form = np.ndarray((len(strain)-2, 100))
#       for i in range(len(strain)-2):
#         trigram = strain[i:i+3]
#         idx = trigram_idx_map[trigram]
#         vec = prot_vec_arr[idx]
#         prot_vec_form[i] = vec
#       trigram_vecs_strains.append(prot_vec_form)
#     trigram_vecs_by_quarter.append(trigram_vecs_strains) 

#   return trigram_vecs_by_quarter

In [20]:
m1, a1 = convert_protVec_to_map()

In [24]:
# csbq, _, _ = create_cluster_by_quarter(train_strains, prot_vec_squeezed, train_labels)

In [25]:
from tqdm.notebook import tqdm

def create_dataset(strains_by_quarter, labels_by_quarter, count):
   """Sample `count` strain trajectories, one strain per quarter.

   For each trajectory the quarters are visited in order. In every quarter a
   label is drawn uniformly from the distinct labels of that quarter that are
   >= the label picked in the previous quarter (any label for the first
   quarter), then one strain carrying that label is drawn uniformly. Labels
   along a trajectory are therefore non-decreasing.

   All randomness comes from the `random` module, so `random.seed(...)` alone
   makes the result reproducible.

   Args:
      strains_by_quarter: sequence with one array-like of strains per quarter.
      labels_by_quarter: sequence with one array-like of labels per quarter,
         aligned element by element with strains_by_quarter.
      count: number of trajectories to sample.

   Returns:
      (dataset, label_ds). dataset is indexed [quarter][trajectory] (a list
      of tuples); label_ds is indexed [trajectory][quarter].

   Raises:
      IndexError: if a quarter has no label >= the previously chosen one.
   """
   # Convert once up front; the distinct labels per quarter never change.
   quarters = []
   for i in range(len(strains_by_quarter)):
      strains = np.asarray(strains_by_quarter[i])
      labels = np.asarray(labels_by_quarter[i])
      quarters.append((strains, labels, np.unique(labels)))

   dataset = [[] for _ in range(count)]
   label_ds = [[] for _ in range(count)]

   for row in tqdm(range(count)):
      label_threshold = 0
      for strains, labels, sub_labels in quarters:
         # Filter by label VALUE. Slicing sub_labels with the label as an index
         # fails for non-integer labels and picks the wrong candidates whenever
         # a quarter lacks one of the smaller labels.
         candidates = sub_labels[sub_labels >= label_threshold].tolist()
         label_choice = random.choice(candidates)
         possible_strains = strains[labels == label_choice]
         sequence = possible_strains[random.randrange(len(possible_strains))]
         dataset[row].append(sequence)
         label_ds[row].append(label_choice)
         label_threshold = label_choice

   # Transpose from [trajectory][quarter] to [quarter][trajectory].
   dataset = [*zip(*dataset)]
   if dataset:
      print(f'dataset: {len(dataset)}x{len(dataset[0])}x{len(dataset[0][0])}')

   return dataset, label_ds


In [21]:
from collections import Counter, OrderedDict
from tqdm.notebook import tqdm

def create_dataset_weighted(strains_by_quarter, labels_by_quarter, count):
   """Sample `count` strain trajectories, weighting labels by their frequency.

   For each trajectory the quarters are visited in order. In every quarter the
   candidate labels are the distinct labels >= the label picked in the
   previous quarter (>= 1 for the first quarter). One candidate is drawn with
   probability proportional to how many strains of that quarter carry it, then
   one strain with that label is drawn uniformly. Labels along a trajectory
   are therefore non-decreasing.

   All randomness comes from the `random` module, so `random.seed(...)` alone
   makes the result reproducible.

   Args:
      strains_by_quarter: sequence with one array-like of strains per quarter.
      labels_by_quarter: sequence with one array-like of labels per quarter,
         aligned element by element with strains_by_quarter.
      count: number of trajectories to sample.

   Returns:
      (dataset, label_ds). dataset is indexed [quarter][trajectory] (a list
      of tuples); label_ds is indexed [trajectory][quarter].

   Raises:
      IndexError: if a quarter has no label >= the current threshold.
   """
   quarters = []
   for i in range(len(strains_by_quarter)):
      quarters.append((np.asarray(strains_by_quarter[i]), np.asarray(labels_by_quarter[i])))

   dataset = [[] for _ in range(count)]
   label_ds = [[] for _ in range(count)]

   for row in tqdm(range(count)):
      # NOTE(review): the threshold starts at 1, so strains labelled 0 are
      # never selected; create_dataset starts at 0. Confirm this is intended.
      label_threshold = 1
      for strains, labels in quarters:
         # Take the population and its weights from the same counter so they
         # always line up. Building the population by index
         # (sub_labels[label_threshold-1:]) only matches the value-based
         # counts when the labels are exactly 1, 2, 3, ... with none missing.
         candidate_counts = Counter(labels[labels >= label_threshold].tolist())
         candidate_labels = sorted(candidate_counts)
         weights = [candidate_counts[label] for label in candidate_labels]
         # random.choices returns a list; unwrap the single pick.
         label_choice = random.choices(candidate_labels, weights=weights)[0]
         possible_strains = strains[labels == label_choice]
         sequence = possible_strains[random.randrange(len(possible_strains))]
         dataset[row].append(sequence)
         label_ds[row].append(label_choice)
         label_threshold = label_choice

   # Transpose from [trajectory][quarter] to [quarter][trajectory].
   dataset = [*zip(*dataset)]
   if dataset:
      print(f'dataset: {len(dataset)}x{len(dataset[0])}x{len(dataset[0][0])}')

   return dataset, label_ds


In [29]:
# # train_ds, labels = create_dataset(train_strains, train_labels, count=800)
# # val_ds, _ = create_dataset(train_strains, train_labels, count=100)
# test_ds, _ = create_dataset(train_strains, train_labels, count=1000)


  0%|          | 0/1500 [00:00<?, ?it/s]

TypeError: slice indices must be integers or None or have an __index__ method

In [23]:
k = 4

In [71]:
print(i, k+i)

2 6


In [70]:
# train_ds, train_labels = create_dataset_weighted(train_strains[-k:], train_labels[i:k+i], count=1500)
# val_ds, _ = create_dataset_weighted(train_strains[-k:], train_labels[i:k+i], count=300)
# test_ds, test_labels = create_dataset_weighted(train_strains[-k:], train_labels[-k:], count=300)


  0%|          | 0/1500 [00:00<?, ?it/s]

2999
2940


IndexError: boolean index did not match indexed array along dimension 0; dimension is 2999 but corresponding boolean dimension is 2940

In [30]:
# train_ds, labels = create_dataset_weighted(train_strains[-k:], train_labels[-k:], count=1500)
val_ds, val_labels = create_dataset_weighted(train_strains[-k:], train_labels[-k:], count=100)
# test_ds, test_labels = create_dataset_weighted(train_strains[-k:], train_labels[-k:], count=300)

  0%|          | 0/100 [00:00<?, ?it/s]

dataset: 4x100x1280


In [105]:
labels

[[1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1,

In [106]:
q2_seq = test_ds[-2]

In [107]:
q2_seq[0]

'MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPALPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLDVYQKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKEGNFKNLREFVFKNIDGYFKIYSKHTPINLERDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPVDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKPSGNYNYLYRLLRKSKLKPFERDISTEIYQAGNRPCNGVAGPNCYSPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSHRRARSVASQSIIAYTMSLGVENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNHNAQALNTLVKQLSSKFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQS

In [25]:
with open('data/independent_set/sequences.txt', 'r') as f:
    for seq1, seq2 in zip(f, q2_seq):
        # Lines read from the file keep their trailing newline; strip it so
        # only the sequence text is compared.
        if seq1.rstrip('\n') != seq2:
            print('not same')

In [27]:
with open('data/independent_set/eve_test_seqs.txt', 'w') as f:
    # One header line '>Reference_Sequence_<n>' followed by the sequence itself.
    f.writelines(
        f'>Reference_Sequence_{idx}\n{seq}\n' for idx, seq in enumerate(q2_seq)
    )

In [43]:
with open('data/independent_set/sequences.txt', 'w') as f:
    # One sequence per line.
    f.writelines(f"{line}\n" for line in q2_seq)

In [48]:
labels_np = np.asarray(labels)

In [None]:
with open('data/independent_set/previous_labels.txt', 'w') as f:
    # Second-to-last column of labels_np, one value per line.
    f.writelines(f"{label}\n" for label in labels_np[:, -2])

In [49]:
# Shape of the row subset whose second-to-last label equals 0, 1 and 2 in turn.
for label_value in (0, 1, 2):
    print(labels_np[labels_np[:, -2] == label_value].shape)

(0, 7)
(58, 7)
(942, 7)


In [27]:
# Named out_dir so the builtin dir() is not shadowed.
out_dir = f'data/improvement_set/{i+1}/'

# exist_ok avoids the separate existence check and the race it leaves open.
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): train_ds is only assigned in commented-out lines of this
# notebook, so this cell fails on a fresh kernel until one of them is restored.
create_triplet_trigram_dataset(train_ds, m1, positions, out_dir+'cov_train')
create_triplet_trigram_dataset(val_ds, m1, positions, out_dir+'cov_val')
# create_triplet_trigram_dataset(test_ds, m1, positions, out_dir+'cov_test')

In [29]:
# Named out_dir so the builtin dir() is not shadowed; a plain string is enough
# because the path has no placeholders.
out_dir = 'data/2023_set/'

# exist_ok avoids the separate existence check and the race it leaves open.
os.makedirs(out_dir, exist_ok=True)

# create_triplet_trigram_dataset(train_ds, m1, positions, out_dir+'cov_train')
create_triplet_trigram_dataset(val_ds, m1, positions, out_dir+'cov_val')
# create_triplet_trigram_dataset(test_ds, m1, positions, out_dir+'cov_test')

In [25]:
k = 10

test_dirs = f'data/test_covs/{k-1}/'

os.makedirs(test_dirs, exist_ok=True)

# The loop variable is not called `i`, because other cells read the global `i`
# and would silently pick up this loop's final value.
for seed_idx in range(5):
    # Seed NumPy's global RNG as well as `random`, so each test set is
    # reproducible even when the sampling code draws from np.random.
    random.seed(seed_idx*100)
    np.random.seed(seed_idx*100)
    test_ds, _ = create_dataset_weighted(train_strains[-k:], train_labels[-k:], count=200)
    create_triplet_trigram_dataset(test_ds, m1, positions, test_dirs+f'{seed_idx}')

  0%|          | 0/200 [00:00<?, ?it/s]

dataset: 10x200x1280


  0%|          | 0/200 [00:00<?, ?it/s]

dataset: 10x200x1280


  0%|          | 0/200 [00:00<?, ?it/s]

dataset: 10x200x1280


  0%|          | 0/200 [00:00<?, ?it/s]

dataset: 10x200x1280


  0%|          | 0/200 [00:00<?, ?it/s]

dataset: 10x200x1280


In [26]:
import pandas as pd

df = pd.read_csv('data/independent_set/cov_test.csv')
df.shape

(149000, 7)

In [27]:
df.groupby(['y']).size()

y
0    71787
1    77213
dtype: int64

In [21]:
import pandas as pd

df = pd.read_csv('data/quarter/year_2022_0.csv')
df.shape

(41540, 10)

In [22]:
df.groupby(['label']).size()

label
0      625
1     2660
2    38255
dtype: int64