In [5]:
from typing import List, Dict
import copy
from tqdm.notebook import tqdm_notebook
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, early_stopping
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import time
import os
import re 

from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import T5ForConditionalGeneration, T5Config


In [None]:
# Split the combined SQuAD CSV into train / validation / test files.
file_path = '/home/qiyu/Dev/ziqing/T5/doubled_newcombined_squad.csv'
data = pd.read_csv(file_path)

# Hold out 30% of the rows first, then halve that hold-out into validation
# and test; both splits use the same fixed random_state.
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Write every split (without the index column) relative to the working directory.
for split_df, split_csv in (
    (train_df, 'double_newcombined_squad_train.csv'),
    (val_df, 'double_newcombined_squad_val.csv'),
    (test_df, 'double_newcombined_squad_test.csv'),
):
    split_df.to_csv(split_csv, index=False)


In [None]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): apart from SPECIAL_TOKENS, none of these constants is referenced
# by the cells visible in this notebook (the trainer and data loaders use
# literal values) — confirm whether they are still needed.
SPECIAL_TOKENS = ['<sep>','<space>']
MASKING_CHANCE = 0.3

SOURCE_MAX_TOKEN_LEN = 200
TARGET_MAX_TOKEN_LEN = 45

N_EPOCHS = 50
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
DF_TAKE_PERCENTAGE = 1

# --- Phase-1 data ------------------------------------------------------------
squad_train_1 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/train/newcombined_squad_train.csv')
squad_val_1 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/train/newcombined_squad_val.csv')
squad_test_1 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/train/newcombined_squad_test.csv')

# Short aliases for the same three frames.
train_1 = squad_train_1
val_1 = squad_val_1
test_1 = squad_test_1

for frame_label, frame in (
    ('train_1_df:', train_1),
    ('val_1_df:', val_1),
    ('test_1_df:', test_1),
):
    print(frame_label, frame.shape)

# Only the last expression of a cell is rendered, so the preview has to be the
# final statement; the earlier mid-cell `.head()` calls produced no output.
train_1.head()

In [3]:

# class QGDataset(Dataset):

#     def __init__(self, tokenizer, file_path, max_len_input=512, max_len_output=128):
#         self.tokenizer = tokenizer
#         self.data = pd.read_csv(file_path)
#         self.max_len_input = max_len_input
#         self.max_len_output = max_len_output
#         self.context_column = 'text'
#         self.topic = 'topic'
#         self.question_column = 'question'
#         self.inputs = []
#         self.targets = []
#         self._load_data()

#     def __len__(self):
#         return len(self.inputs)

#     def __getitem__(self, index):
#         source_ids = self.inputs[index]['input_ids'].squeeze()
#         target_ids = self.targets[index]['input_ids'].squeeze()
#         source_mask = self.inputs[index]['attention_mask'].squeeze()
#         target_mask = self.targets[index]['attention_mask'].squeeze()
#         labels = copy.deepcopy(target_ids)
#         labels[labels == 0] = -100
#         return {'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 'labels': labels}

#     def _load_data(self):
#         for idx in tqdm_notebook(range(len(self.data))):

#             context, topic, target = self.data.loc[idx, self.context_column], self.data.loc[idx, self.topic], self.data.loc[idx, self.question_column]
#             # if len(str(answer).split()) >= 8:
#             #     input_text = '<longanswer> %s <context> %s ' % (answer, context)
#             # else:
#             #     input_text = '<answer> %s <context> %s ' % (answer, context)
#             input_text = '<topic> %s <context> %s ' % (topic, context)
#             target = str(target)

#             tokenized_inputs = self.tokenizer.batch_encode_plus(
#                 [input_text],
#                 max_length=self.max_len_input,
#                 padding='max_length',
#                 truncation=True,
#                 return_tensors='pt'
#             )

#             tokenized_targets = self.tokenizer.batch_encode_plus(
#                 [target],
#                 max_length=self.max_len_output,
#                 padding='max_length',
#                 truncation=True,
#                 return_tensors='pt'
#             )

#             self.inputs.append(tokenized_inputs)
#             self.targets.append(tokenized_targets)


In [4]:
class QGDataset(Dataset):
    """Question-generation dataset that pre-tokenizes every row of a CSV file.

    Each CSV row provides a topic (column ``topic``), a context passage (column
    ``text``) and a target question (column ``question``). The model input is
    the string ``f'{topic}<sep>{context}'`` and the target is ``str(question)``.
    Both are padded/truncated to fixed lengths while the dataset is being
    constructed, so ``__getitem__`` only reshapes tensors that already exist.

    Args:
        tokenizer: Object exposing ``batch_encode_plus``. Its ``pad_token_id``
            (when present) decides which target positions are masked in the
            labels; ``0`` is used when the attribute is missing or ``None``.
        file_path: Path of the CSV file to read with ``pandas.read_csv``.
        max_len_input: Fixed token length of every encoded input.
        max_len_output: Fixed token length of every encoded target.
    """

    def __init__(self, tokenizer, file_path, max_len_input=512, max_len_output=128):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(file_path)
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output
        self.context_column = 'text'
        self.topic = 'topic'
        self.question_column = 'question'
        self.inputs = []
        self.targets = []
        self._load_data()

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example.

        Returns:
            dict with ``source_ids``/``source_mask`` (encoded input and its
            attention mask), ``target_ids``/``target_mask`` (encoded target and
            its attention mask) and ``labels``: a copy of ``target_ids`` whose
            padding positions are replaced by ``-100``.
        """
        source = self.inputs[index]
        target = self.targets[index]

        # Drop only the leading batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse a sequence of length 1.
        source_ids = source['input_ids'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_mask = target['attention_mask'].squeeze(0)

        # Mask padding using the tokenizer's own pad id instead of assuming 0.
        labels = target_ids.clone()
        labels[labels == self._pad_token_id()] = -100

        return {
            'source_ids': source_ids,
            'source_mask': source_mask,
            'target_ids': target_ids,
            'target_mask': target_mask,
            'labels': labels,
        }

    def _pad_token_id(self):
        """Return the tokenizer's pad id, falling back to 0 when it has none."""
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        return 0 if pad_id is None else pad_id

    def _encode(self, text, max_length):
        """Tokenize one string into fixed-length tensors with a batch dim of 1."""
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def _load_data(self):
        """Encode every CSV row and append the results to ``inputs``/``targets``."""
        # Iterate the columns positionally so the loop does not depend on the
        # frame carrying a default 0..n-1 index.
        rows = zip(
            self.data[self.context_column],
            self.data[self.topic],
            self.data[self.question_column],
        )
        for context, topic, target in tqdm_notebook(rows, total=len(self.data)):
            input_text = f'{topic}<sep>{context}'
            target = str(target)

            self.inputs.append(self._encode(input_text, self.max_len_input))
            self.targets.append(self._encode(target, self.max_len_output))


In [None]:
# start_time = time.time()
# pl.seed_everything(99)

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# print('Using device:', device)
# print('Loading pre-trained model...')

# model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
# tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
# tokenizer.add_special_tokens(
#         {'additional_special_tokens':  ['<sep>','<space>']}
#     )


In [None]:
# Start the wall-clock timer (read again after training) and seed the run
# before any weights are loaded.
start_time = time.time()
pl.seed_everything(99)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Using device:', device)
print('Loading pre-trained model...')

# Pre-trained T5-small weights plus the matching tokenizer
# (`T5Tokenizer` is the alias this notebook gives to `T5TokenizerFast`).
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")


In [None]:
print('Preparing dataset...')

# Tokenize the phase-1 train and validation CSVs up front; the resulting
# module-level datasets are what T5FineTuner's data loaders read.
train_dataset = QGDataset(tokenizer, '/home/qiyu/Dev/ziqing/T5/train/newcombined_squad_train.csv')
validation_dataset = QGDataset(tokenizer, '/home/qiyu/Dev/ziqing/T5/train/newcombined_squad_val.csv')

for dataset_label, dataset in (
    ('train_dataset: ', train_dataset),
    ('validation_dataset: ', validation_dataset),
):
    print(dataset_label, len(dataset))

print ('Initializing model...')

In [None]:
# Register the two marker strings as additional tokens on the tokenizer.
# NOTE(review): in the cell order shown here the datasets are built before this
# call, so the '<sep>' inside their inputs was tokenized without this entry —
# confirm that this ordering is intended.
SPECIAL_TOKENS = ['<sep>','<space>']
tokenizer.add_tokens(SPECIAL_TOKENS)

In [7]:
# Read the saved training checkpoint; map_location remaps its tensors onto `device`.
# The cell that consumes it reads the 'model_state_dict' entry.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint_path = "/home/qiyu/Dev/ziqing/T5/checkpoint_epoch_69.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)

In [None]:
# Shrink the embedding matrix to the tokenizer size minus the two added tokens
# so its shape lines up with the tensors stored in the checkpoint.
model.resize_token_embeddings(len(tokenizer)-2)

model.load_state_dict(checkpoint['model_state_dict'])

# Grow the embeddings back to the full tokenizer size. Without this the ids of
# the two added tokens fall outside the embedding matrix, and the model saved
# later would not match the tokenizer saved next to it.
model.resize_token_embeddings(len(tokenizer))

model.to(device)

print("Model loaded and ready for fine-tuning.")

In [9]:
class T5FineTuner(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model on QGDataset-style batches.

    Batches are dicts carrying ``source_ids``, ``source_mask`` and ``labels``;
    the loss is whatever the wrapped model returns in ``outputs.loss``.

    Args:
        model: The seq2seq model to train; called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        tokenizer: Tokenizer kept alongside the model (not used by the training
            logic itself).
        train_data: Optional training dataset. When ``None`` the module-level
            ``train_dataset`` is used, as in the original notebook.
        val_data: Optional validation dataset. When ``None`` the module-level
            ``validation_dataset`` is used.
        batch_size: Batch size for both data loaders.
        learning_rate: Learning rate handed to AdamW.
        num_workers: Worker processes for both data loaders.
    """

    def __init__(self, model, tokenizer, train_data=None, val_data=None,
                 batch_size=64, learning_rate=0.001, num_workers=1):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_workers = num_workers

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; only the labels are passed for the decoder side."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def _batch_loss(self, batch):
        """Compute the model loss for one batch (shared by train and validation)."""
        outputs = self.forward(
            input_ids=batch['source_ids'],
            attention_mask=batch['source_mask'],
            labels=batch['labels'],
        )
        return outputs.loss

    def training_step(self, batch, batch_idx):
        """Return the training loss and log it as ``train_loss``."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the validation loss and log it as ``val_loss``."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def train_dataloader(self):
        """Build the training loader; examples are reshuffled every epoch."""
        dataset = train_dataset if self.train_data is None else self.train_data
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,  # the original loader iterated in a fixed order
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Build the validation loader (fixed order)."""
        dataset = validation_dataset if self.val_data is None else self.val_data
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def configure_optimizers(self):
        """Return a single AdamW optimizer over all parameters of this module."""
        return AdamW(self.parameters(), lr=self.learning_rate, eps=1e-08)

In [10]:
# Wrap the raw seq2seq model exactly once. Re-running this cell must not wrap
# the wrapper, because the save cell calls `model.model.save_pretrained(...)`
# and that only works when `model.model` is the underlying seq2seq model.
if not isinstance(model, T5FineTuner):
    model = T5FineTuner(model, tokenizer)

In [11]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping


In [None]:
# Train for at most 50 epochs, with an early-stopping callback watching "val_loss".
# (gradient_clip_val=1.0 and auto_lr_find=True were tried earlier and remain disabled.)
early_stop_cb = EarlyStopping(monitor="val_loss")
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stop_cb])

In [13]:
# Model and tokenizer are written to the same directory.
save_model_path = save_tokenizer_path = '/home/qiyu/Dev/ziqing/T5/T5_Q8_newcombinedsquad_once'

In [None]:
print('Fine tuning...')
trainer.fit(model)

print('Saving model...')
# exist_ok=True replaces the separate exists()-then-makedirs() check: it is a
# single call and does not fail if the directory appears in between.
os.makedirs(save_model_path, exist_ok=True)
os.makedirs(save_tokenizer_path, exist_ok=True)

# `model` is the T5FineTuner wrapper; the underlying seq2seq model is `model.model`.
model.model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_tokenizer_path)

# Despite its name, end_time holds the elapsed seconds since start_time.
end_time = time.time() - start_time
print('Total time: %s hours' % (end_time / 60 / 60))
print('All done.')

In [None]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): none of these constants is referenced by the cells that follow
# in the visible notebook (the trainer and data loaders use literal values).
SPECIAL_TOKENS = ['<sep>','<space>']
MASKING_CHANCE = 0.3 #30% chance to replace the answer with '[MASK]'

SOURCE_MAX_TOKEN_LEN = 200
TARGET_MAX_TOKEN_LEN = 45

N_EPOCHS = 50
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
DF_TAKE_PERCENTAGE = 1

# --- Phase-2 data ------------------------------------------------------------
squad_train_2 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/previous_squad/squad_train_2.csv')
squad_val_2 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/previous_squad/squad_val_2.csv')
squad_test_2 = pd.read_csv('/home/qiyu/Dev/ziqing/T5/previous_squad/squad_test_2.csv')

# Short aliases for the same three frames.
train_2 = squad_train_2
val_2 = squad_val_2
test_2 = squad_test_2

for frame_label, frame in (
    ('train_2_df:', train_2),
    ('val_2_df:', val_2),
    ('test_2_df:', test_2),
):
    print(frame_label, frame.shape)

# Only the last expression of a cell is rendered, so the preview has to be the
# final statement; the earlier mid-cell `.head()` calls produced no output.
train_2.head()

In [None]:
print('Preparing dataset...')

# Tokenize the phase-2 train and validation CSVs and rebind the module-level
# datasets that T5FineTuner's data loaders read.
# NOTE(review): `tokenizer` here is whatever is bound when this cell runs; the
# phase-2 tokenizer is only loaded in the following cell — confirm the order.
train_dataset = QGDataset(tokenizer, '/home/qiyu/Dev/ziqing/T5/previous_squad/squad_train_2.csv')
validation_dataset = QGDataset(tokenizer, '/home/qiyu/Dev/ziqing/T5/previous_squad/squad_val_2.csv')

for dataset_label, dataset in (
    ('train_dataset: ', train_dataset),
    ('validation_dataset: ', validation_dataset),
):
    print(dataset_label, len(dataset))

print ('Initializing model...')

In [None]:
# Restart the wall-clock timer and re-seed the run for the second phase.
start_time = time.time()
pl.seed_everything(99)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using device:', device)
print('Loading pre-trained model...')

# Continue from the model and tokenizer saved in the phase-1 directory.
model = T5ForConditionalGeneration.from_pretrained("/home/qiyu/Dev/ziqing/T5/T5stem_uniquesquad46_phase1")
tokenizer = T5Tokenizer.from_pretrained("/home/qiyu/Dev/ziqing/T5/T5stem_uniquesquad46_phase1")
num_added_tokens = tokenizer.add_special_tokens(
        {'additional_special_tokens':  ['<sep>','<space>']}
    )

# Every id the tokenizer can emit needs a row in the embedding matrix; grow it
# when the tokenizer is larger than the loaded model's embeddings. Nothing is
# changed when the model already has enough rows.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the cell's displayed value: the return value of add_special_tokens.
num_added_tokens


In [None]:
# Wrap the raw seq2seq model exactly once. Re-running this cell must not wrap
# the wrapper, because the save cell calls `model.model.save_pretrained(...)`
# and that only works when `model.model` is the underlying seq2seq model.
if not isinstance(model, T5FineTuner):
    model = T5FineTuner(model, tokenizer)

In [None]:
# Fresh trainer for the second phase: at most 50 epochs, with an
# early-stopping callback watching "val_loss".
# (gradient_clip_val=1.0 and auto_lr_find=True were tried earlier and remain disabled.)
early_stop_cb = EarlyStopping(monitor="val_loss")
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stop_cb])

In [None]:
# Model and tokenizer are written to the same directory.
save_model_path = save_tokenizer_path = '/home/qiyu/Dev/ziqing/T5/T5stem_uniquesquad46_phase2'

In [None]:
print('Fine tuning...')
trainer.fit(model)

print('Saving model...')
# exist_ok=True replaces the separate exists()-then-makedirs() check: it is a
# single call and does not fail if the directory appears in between.
os.makedirs(save_model_path, exist_ok=True)
os.makedirs(save_tokenizer_path, exist_ok=True)

# `model` is the T5FineTuner wrapper; the underlying seq2seq model is `model.model`.
model.model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_tokenizer_path)

# Despite its name, end_time holds the elapsed seconds since start_time.
end_time = time.time() - start_time
print('Total time: %s hours' % (end_time / 60 / 60))
print('All done.')