In [36]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append('..')
from data.dataloader import AminoAcidTokenizer, ProteinDataset


In [39]:
# --- Split configuration ---
# Fraction of the length-filtered rows held out as the test set.
TEST_RATIO = 0.15
# Fraction of the remaining (non-test) rows used for validation; it is applied
# to the train+validation remainder, not to the full dataset.
VALIDATION_RATIO = 0.2
# Seed handed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 42

# --- Data source ---
# CSV with the protein sequences, relative to this notebook's directory.
DATA_PATH = '../dataset/prot-300-filtered.csv'
# Rows whose seq_length exceeds this value are dropped before splitting.
MAX_SEQ_LEN = 200

In [40]:
# Load the protein table from DATA_PATH. The displayed output shows the columns
# "Unnamed: 0", "seq_id", "seq" and "seq_length".
df = pd.read_csv(DATA_PATH)
# Bare expression: renders the frame as this cell's output.
df

Unnamed: 0,seq_id,seq,seq_length
0,sp|A0A009IHW8|ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,269
1,sp|A0A024SC78|CUTI1_HYPJR,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,248
2,sp|A0A067CMC7|HTP3_SAPPC,MLEVPVWIPILAFAVGLGLGLLIPHLQKPFQRFSTVNDIPKEFFEH...,211
3,sp|A0A067XR63|XTH7_DIOKA,MNAEGGNLHREFEITWGDGRARIHNNGGLLTLSLDRASGSGFRSKN...,268
4,sp|A0A067XRK9|XTH6_DIOKA,MASSLTLPMAMAFTLLALSFASAMGGSMNSSRFDELFQPSWAFDHF...,299
...,...,...,...
290016,sp|V5QPS4|3098B_MYCTU,MTATIGFRPTEKDEQIINAAMRSGERKSDVIRRALQLLEREVWIKQ...,66
290017,sp|V9HVX0|YPAA_ECOLI,MTIAERLRQEGHQIGWQEGKLEGLHEQAIKIALRMLEQGFDRDQVL...,61
290018,sp|W5X2N3|AP_FRAAN,MGWVWKDDDEQGGHVNPSAADISPRLDGDRCSTRKVVRTQCKTEEV...,193
290019,sp|W6JHZ8|VP1_CPBDV,MNVKGASDKAQLAMQAEWEEVLAPEGALAVEEASSVLKISDDERRS...,210


In [41]:
# Keep only the rows whose seq_length does not exceed MAX_SEQ_LEN.
length_ok = df['seq_length'] <= MAX_SEQ_LEN
df = df.loc[length_ok]
df

Unnamed: 0,seq_id,seq,seq_length
6,sp|A0A068B6Q6|CA18_CONBE,PDGRNAAAKAFDLITPTVRKGCCSNPACILNNPNQCG,37
9,sp|A0A088MIT0|BRKP2_PHYNA,MAFLKKSLFLVLFLGVVSLSFCEEEKREEHEEEKRDEEDAESLGKR...,134
10,sp|A0A097PTA8|DEFCO_COPCI,MKLSTSLLAIVAVASTFIGNALSATTVPGCFAECIDKAAVAVNCAA...,184
11,sp|A0A0A0LLY1|SRBP1_CUCSA,MASSSVEFRCFVGGLAWATDSNSLEKAFSVYGEIVEAKIVSDRETG...,160
13,sp|A0A0A1I6E7|NDB4S_ANDCR,MEIKYLLTVFLVLLIVSDHCQAFLFSLIPHAISGLISAFKGRRKRD...,74
...,...,...,...
290013,sp|Q9ZZX9|Q0010_YEAST,MYYIMFLYNMLLIIILIFYSIVGVPIIIFNNNYYWDPDIFLFIIYY...,128
290015,sp|U3H040|YBS9_SCHPO,MTGPFRYNGGSVRSFALTTNFSFPSYDLSFNETEHGVFCYVSRPLT...,128
290016,sp|V5QPS4|3098B_MYCTU,MTATIGFRPTEKDEQIINAAMRSGERKSDVIRRALQLLEREVWIKQ...,66
290017,sp|V9HVX0|YPAA_ECOLI,MTIAERLRQEGHQIGWQEGKLEGLHEQAIKIALRMLEQGFDRDQVL...,61


In [42]:
# Hold out the test set first, then carve the validation set out of what is left.
# Both calls share RANDOM_STATE so the partition is reproducible.
train_val_df, test_df = train_test_split(
    df,
    test_size=TEST_RATIO,
    random_state=RANDOM_STATE,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VALIDATION_RATIO,
    random_state=RANDOM_STATE,
)

# Report how many rows ended up in each split.
for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} size: {len(split_frame)}")

Train size: 121135
Validation size: 30284
Test size: 26721


In [43]:
def count_seq_less_than(df, seq_len):
    """Count the rows of ``df`` whose ``seq_length`` is at most ``seq_len``.

    Despite the function name, the comparison is inclusive (``<=``).

    Args:
        df: DataFrame with a ``seq_length`` column.
        seq_len: Upper bound (inclusive) on ``seq_length``.

    Returns:
        Number of matching rows, as an ``int``.
    """
    within_limit = df['seq_length'] <= seq_len
    return int(within_limit.sum())

# Report, per split, how many sequences have seq_length <= 100.
for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} seq len <= 100: {count_seq_less_than(split_frame, 100)}")

Train seq len <= 100: 39030
Validation seq len <= 100: 9678
Test seq len <= 100: 8547


In [44]:
# Persist the three splits as CSV files (the pandas index is not written).
# Create the target directory first: DataFrame.to_csv does not create missing
# parent directories and raises OSError when the directory is absent.
os.makedirs('../dataset/splitted', exist_ok=True)
train_df.to_csv('../dataset/splitted/train.csv', index=False)
val_df.to_csv('../dataset/splitted/val.csv', index=False)
test_df.to_csv('../dataset/splitted/test.csv', index=False)

### Precompute deterministic (fixed) masks for the validation and test sets

In [55]:
# NOTE(review): the visible cells below never read this module-level value; they
# pass 100 and 200 to process_and_store_data as literals. Confirm whether it is
# still needed.
max_seq_len = 100

In [56]:
# Pull the 'seq' column out of each split as a plain Python list.
train_seqs, val_seqs, test_seqs = (
    split_frame['seq'].tolist() for split_frame in (train_df, val_df, test_df)
)

In [60]:
import torch
import os

def preprocess_and_mask(sequences, tokenizer, max_seq_len, mask_probability=0.15):
    """Tokenize each sequence and apply random masking to the encoded result.

    A ``ProteinDataset`` is constructed without data (``None``) solely so that
    its ``random_masking`` method can be reused here.

    Args:
        sequences: Iterable of sequences; each one is passed to ``tokenizer.encode``.
        tokenizer: Object whose ``encode(seq)`` returns a pair
            ``(encoded_seq, attention_mask)``.
        max_seq_len: Forwarded to the ``ProteinDataset`` constructor.
        mask_probability: Forwarded to ``random_masking``.

    Returns:
        Tuple ``(inputs, labels, attention_masks)`` of ``torch.long`` tensors,
        each built from one entry per input sequence.

    NOTE(review): no seed is set in this function, so the masks depend on the
    state of whatever RNG ``random_masking`` uses; confirm if reproducibility
    across notebook runs is required.
    """
    masker = ProteinDataset(None, tokenizer, max_seq_len)

    def _encode_and_mask(sequence):
        # Encode first, then mask, one sequence at a time.
        token_ids, attention_mask = tokenizer.encode(sequence)
        masked_ids, target_ids = masker.random_masking(token_ids, mask_probability)
        return masked_ids, target_ids, attention_mask

    rows = [_encode_and_mask(sequence) for sequence in sequences]

    def _column(index):
        # Stack the index-th field of every row into a single long tensor.
        return torch.tensor([row[index] for row in rows], dtype=torch.long)

    return _column(0), _column(1), _column(2)


def process_and_store_data(sequences, max_seq_len, mask_probability=0.15, prefix='data',
                           output_root='../dataset/splitted'):
    """Tokenize and mask ``sequences``, then save the resulting tensors to disk.

    Three files are written to ``<output_root>/max_seq<max_seq_len>/``:
    ``<prefix>_inputs_<max_seq_len>.pt``, ``<prefix>_labels_<max_seq_len>.pt``
    and ``<prefix>_masks_<max_seq_len>.pt``. The directory is created if it is
    missing; existing files with the same names are overwritten.

    Args:
        sequences: Sequences forwarded to ``preprocess_and_mask``.
        max_seq_len: Passed to ``AminoAcidTokenizer(max_seq_length=...)`` and
            ``preprocess_and_mask``; also used in the directory and file names.
        mask_probability: Forwarded to ``preprocess_and_mask``.
        prefix: File-name prefix identifying the split (e.g. ``'val'``).
        output_root: Directory under which the ``max_seq<max_seq_len>``
            folder is created. Defaults to the original hard-coded location.

    Returns:
        The ``(inputs, labels, masks)`` tuple returned by ``preprocess_and_mask``.
    """
    # Initialize tokenizer with the specified max_seq_len
    tokenizer = AminoAcidTokenizer(max_seq_length=max_seq_len)

    # Process and mask the sequences
    inputs, labels, masks = preprocess_and_mask(sequences, tokenizer, max_seq_len, mask_probability)

    # Build paths with os.path.join so no doubled separator ends up in them, and
    # let makedirs tolerate an existing directory instead of check-then-create.
    parent_dir = os.path.join(output_root, f'max_seq{max_seq_len}')
    os.makedirs(parent_dir, exist_ok=True)

    # Store the processed data, one file per tensor.
    for kind, tensor in (('inputs', inputs), ('labels', labels), ('masks', masks)):
        torch.save(tensor, os.path.join(parent_dir, f'{prefix}_{kind}_{max_seq_len}.pt'))

    return inputs, labels, masks


In [67]:
# Process, mask, and store the validation and test sequences for max_seq_len of 100 and 200.
# Only the val/test lists are written here; train_seqs is not processed in this cell.
# NOTE(review): val_seqs/test_seqs come from rows filtered to seq_length <= MAX_SEQ_LEN (200),
# so the 100 runs also receive sequences longer than 100; how AminoAcidTokenizer handles
# them is not visible here — confirm.
# NOTE(review): if random_masking draws from a shared RNG, the order of these calls
# determines which masks each split receives — confirm before reordering.
process_and_store_data(val_seqs, 100, mask_probability=0.15, prefix='val')
process_and_store_data(test_seqs, 100, mask_probability=0.15, prefix='test')

process_and_store_data(val_seqs, 200, mask_probability=0.15, prefix='val')
# The final call is a bare expression, so the (inputs, labels, masks) tuple it returns
# is rendered as this cell's output.
process_and_store_data(test_seqs, 200, mask_probability=0.15, prefix='test')

(tensor([[ 1, 14, 19,  ...,  0,  0,  0],
         [ 1, 14, 17,  ...,  0,  0,  0],
         [ 1, 14, 13,  ...,  0,  0,  0],
         ...,
         [ 1, 14,  3,  ...,  0,  0,  0],
         [ 1,  3, 15,  ...,  0,  0,  0],
         [ 1, 14,  7,  ...,  0,  0,  0]]),
 tensor([[ 1, 14, 19,  ...,  0,  0,  0],
         [ 1, 14, 17,  ...,  0,  0,  0],
         [ 1, 14, 13,  ...,  0,  0,  0],
         ...,
         [ 1, 14,  4,  ...,  0,  0,  0],
         [ 1, 14, 15,  ...,  0,  0,  0],
         [ 1, 14,  7,  ...,  0,  0,  0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))