In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import cuda
from tqdm import tqdm
from transformers import BertConfig
from transformers import BertModel
import json
import matplotlib.pyplot as plt
import os
import pickle

from transformers import AutoTokenizer, T5ForConditionalGeneration

# Name passed to AutoTokenizer.from_pretrained for both tokenizers below.
PRETRAINED_MODEL_NAME = "t5-small"
# Path the fine-tuned weights are loaded from; the prediction summary is
# later pickled under this same path prefix.
FINETUNED_MODEL_NAME = "t5_20_epochs"
# EPOCHS / TRAIN_BATCH_SIZE / VALID_BATCH_SIZE are not referenced by the
# evaluation cells visible in this notebook.
EPOCHS = 10
TRAIN_BATCH_SIZE = 256
VALID_BATCH_SIZE = 256
# Width that to_tensor_dataset pads every encoded input up to.
SEQ_LEN=17

# Fine-tuned sequence-to-sequence model that the evaluation cells run.
model = T5ForConditionalGeneration.from_pretrained(FINETUNED_MODEL_NAME)
# Left-padding tokenizer: handed to to_tensor_dataset for the inputs and used
# to decode predictions and gold labels.
tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL_NAME,
    padding_side="left"
)
# Right-padding tokenizer: used inside to_tensor_dataset to encode the three
# target label strings.
tokenizer_r = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL_NAME,
    padding_side="right"
)

# Use the GPU when torch.cuda reports one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [2]:
from torch.utils.data import TensorDataset, DataLoader

def batchify(l, batch_size):
    """Yield consecutive slices of ``l`` holding at most ``batch_size`` items.

    Args:
        l: Any sized object that supports slicing (list, str, pandas Series, ...).
        batch_size: Maximum number of items per slice; must be positive.

    Yields:
        ``l[start:start + batch_size]`` for ``start = 0, batch_size, ...``.
        The final slice is shorter when ``len(l)`` is not a multiple of
        ``batch_size``; nothing is yielded for an empty ``l``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A zero or negative step would never move past len(l) and loop forever.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    for start in range(0, len(l), batch_size):
        yield l[start:start + batch_size]

def to_tensor_dataset(data, tokenizer, kind):
    """Encode a labelled sequence table into a ``TensorDataset``.

    Args:
        data: Table exposing a ``Seq`` column (strings to tokenize) and a
            ``target`` column whose values are 0, 1 or 2, i.e. positions in
            the label list defined below.
        tokenizer: Tokenizer applied to the ``Seq`` column; its
            ``pad_token_id`` is used to pad each row up to ``SEQ_LEN``.
        kind: Unused; kept so existing calls keep working.

    Returns:
        ``TensorDataset(input_ids, labels)`` with one row per row of ``data``.
    """
    # Target token ids for each class, right-padded to a common length by the
    # module-level tokenizer_r. The spelling of these strings is kept verbatim:
    # they define the label token ids, so editing them changes the targets.
    class_names = ["Helyx", "Strand", "Coil"]
    encoded_classes = tokenizer_r(class_names, padding="longest")
    outputs = dict(enumerate(encoded_classes.input_ids))

    labels = data.target.apply(lambda x: outputs[x])
    # Convert to a plain list first so the result does not depend on how
    # torch.tensor walks a pandas Series (label vs. positional indexing).
    labels = torch.tensor(labels.tolist())

    print(labels.shape)

    batch_size = 200
    # Ceiling division so the progress bar also counts the last partial batch.
    n_batches = -(-len(data) // batch_size)

    input_ids = []
    for sentences in tqdm(batchify(data.Seq, batch_size), total=n_batches):

        tok_out = tokenizer(sentences.tolist(), add_special_tokens=False, padding="longest")
        for seq in tok_out.input_ids:
            # Drop token ids 3 and 0 (presumably the bare sentencepiece marker
            # and the padding id -- TODO confirm against the vocabulary).
            kept = [x for x in seq if (x != 3) and (x != 0)]
            # Pad on the right up to SEQ_LEN. Rows already longer than SEQ_LEN
            # are left untouched, so the tensor build below fails if row
            # lengths end up unequal.
            input_ids.append(kept + [tokenizer.pad_token_id] * (SEQ_LEN - len(kept)))

    input_ids = torch.tensor(input_ids)

    print(input_ids.shape)

    return TensorDataset(input_ids, labels)

import pickle
import os

# Build and cache each split on its own, so a missing test cache is rebuilt
# even when the train cache already exists (a later cell loads
# "test_dataset.pkl" unconditionally).
# NOTE(review): `df` and `df_` are not defined in any visible cell; they must
# already exist in the kernel for the builds below to run.
if not os.path.exists("train_dataset.pkl"):
    train_dataset = to_tensor_dataset(df, tokenizer, "TRAIN")
    with open("train_dataset.pkl", "wb") as f:
        pickle.dump(train_dataset, f)

if not os.path.exists("test_dataset.pkl"):
    test_dataset = to_tensor_dataset(df_, tokenizer, "TEST")
    with open("test_dataset.pkl", "wb") as f:
        pickle.dump(test_dataset, f)

In [3]:
# The train split is not needed for evaluation, so its load stays disabled:
# with open("train_dataset.pkl", "rb") as f:
#     train_dataset = pickle.load(f)
# Load the cached test split. pickle.load can execute arbitrary code, so only
# open files produced by this notebook.
with open("test_dataset.pkl", "rb") as f:
    test_dataset = pickle.load(f)
# No shuffle argument is passed; batches of up to 300 rows are drawn from the
# dataset for the evaluation loop.
test_dataloader = DataLoader(test_dataset, batch_size=300)

In [6]:
from tqdm import tqdm
# Switch to inference mode and move the weights to the selected device.
model.eval()
model.to(device)

# Decoded prediction / gold strings, accumulated across all test batches.
all_pred, all_gold = [], []

# Gradients are never needed here, so one no_grad scope covers the whole pass.
with torch.no_grad():
    for input_ids, labels in tqdm(test_dataloader):
        # Generate exactly one new token per input row.
        out = model.generate(
            inputs=input_ids.to(device),
            max_new_tokens=1
        )
        # Prediction: last generated token of each row.
        all_pred.extend(tokenizer.batch_decode(out[:, -1]))
        # Gold: first token of each encoded label row.
        all_gold.extend(tokenizer.batch_decode(labels[:, 0]))
    

100%|███████████████████████████████████████████████████████████████████████████████| 1636/1636 [01:07<00:00, 24.19it/s]


In [7]:
# Show which distinct decoded strings occur on each side before they are
# mapped to class ids.
for caption, decoded in (("gold labels |", all_gold), ("pred labels |", all_pred)):
    print(caption, set(decoded))

gold labels | {'Co', 'Strand', 'He'}
pred labels | {'Co', 'Strand', 'He'}


In [8]:
# Map the decoded first token of each label to its class id. The ids follow
# the order of the label list in to_tensor_dataset (0, 1, 2), and the keys are
# the strings batch_decode produced for gold and predicted labels.
pred_dict = {
    "He": 0,
    "Strand": 1,
    "Co": 2,
}
# Entries that are already ints pass through unchanged, so re-running this
# cell no longer fails on its own output. Unknown strings still raise KeyError.
all_pred = [x if isinstance(x, int) else pred_dict[x] for x in all_pred]

# Gold labels use the same mapping.
gold_dict = pred_dict
all_gold = [x if isinstance(x, int) else gold_dict[x] for x in all_gold]

In [9]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1. sorted() pins the row order of the report
# instead of relying on set iteration order.
print(classification_report(all_gold, all_pred, labels=sorted(set(all_gold))))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75    202335
           1       0.67      0.67      0.67    183014
           2       0.61      0.56      0.58    105263

    accuracy                           0.69    490612
   macro avg       0.67      0.66      0.67    490612
weighted avg       0.68      0.69      0.68    490612



In [14]:
# Pair every gold class id with the predicted one, pickle the table under the
# fine-tuned model's path, then display it.
summary_path = FINETUNED_MODEL_NAME+"/pred_summary_test.pickle"
pred_summary_test = pd.DataFrame({"real": all_gold, "pred": all_pred})
pred_summary_test.to_pickle(summary_path)
pred_summary_test

Unnamed: 0,real,pred
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
490607,1,0
490608,1,0
490609,1,1
490610,1,1


In [1]:
import transformers
import pickle
import torch
# Resume fine-tuning from a saved checkpoint and train up to EPOCHS in total.
# The weights are loaded here from the checkpoint directory by name.
model = transformers.T5ForConditionalGeneration.from_pretrained("checkpoint-221600")

# Total epoch target for the run; the captured log shows training continuing
# from epoch 20.
EPOCHS = 40
TRAIN_BATCH_SIZE = 256
# VALID_BATCH_SIZE and SEQ_LEN are not referenced anywhere in this cell.
VALID_BATCH_SIZE = 256
SEQ_LEN=17

# Cached tensors written by the dataset-building cell. pickle.load can execute
# arbitrary code, so only open files produced by this notebook.
with open("train_dataset.pkl", "rb") as f:
    train_dataset = pickle.load(f)
# test_dataset is loaded but never handed to the Trainer below.
with open("test_dataset.pkl", "rb") as f:
    test_dataset = pickle.load(f)

training_args = transformers.TrainingArguments(
    # ------------------------------------------------------- [epochs and batch size]
    num_train_epochs=EPOCHS,
    # Twice TRAIN_BATCH_SIZE, i.e. 512 rows per device per step.
    per_device_train_batch_size=TRAIN_BATCH_SIZE*2,
    gradient_accumulation_steps=1,
    # ------------------------------------------------------- [hyperparams]
    warmup_steps=100, 
    weight_decay=0.01,
    # ------------------------------------------------------- [save and logging]
    # The current directory is the output directory.
    output_dir=".", 
    overwrite_output_dir = True,
    do_eval = False,
    logging_strategy="epoch", # activate if interested
    # Checkpoint saving is switched off; the only explicit save is the
    # save_model call at the end of this cell.
    save_strategy="no",
    save_total_limit = None,
    # -------------------------------------------------------
)
trainer = transformers.Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset,
    # Each sample is an (input_ids, labels) pair; stack them into batch
    # tensors. No attention mask is supplied to the model.
    data_collator = lambda data: {
        'input_ids': torch.stack([f[0] for f in data]), 
        # 'attention_mask': torch.stack([f[1] for f in data]), 
        'labels': torch.stack([f[1] for f in data]),
    },
    # resume_from_checkpoint=True
)
# Ask the Trainer to resume from a checkpoint; the captured log shows it
# loading ./checkpoint-221600.
trainer.train(resume_from_checkpoint=True)
# Reached only if training finishes: writes the final model to this directory.
trainer.save_model("t5_20+20_epochs")

Loading model from ./checkpoint-221600.
***** Running training *****
  Num examples = 5672821
  Num Epochs = 40
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 443200
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 20
  Continuing training from global step 221600
  Will skip the first 20 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Step,Training Loss



KeyboardInterrupt

