In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, TransfoXLModel, TransfoXLTokenizer, TransfoXLLMHeadModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [2]:
df = pd.read_csv('../csci-544-project/data/news_summary.csv', encoding='latin-1')
df = df[['text', 'ctext']]
df.columns = ['summary', 'text']
df = df.dropna()
df.head()

Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [3]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
df_train.shape, df_test.shape

((3956, 2), (440, 2))

In [4]:
class NewsSummaryDataset(Dataset):
    def __init__(self, 
        data: pd.DataFrame, 
        tokenizer: TransfoXLTokenizer, 
        text_max_token_len: int = 512, 
        summary_max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        
        text = data_row['text']
        summary = data_row['summary']
        
        text_encoding = tokenizer(
            text, 
            max_length=self.text_max_token_len, 
            padding='max_length', 
            truncation=True, 
            return_attention_mask=True, 
            add_special_tokens=True, 
            return_tensors='pt'
        )
        
        summary_encoding = tokenizer(
            summary, 
            max_length=self.summary_max_token_len, 
            padding='max_length', 
            truncation=True, 
            return_attention_mask=True, 
            add_special_tokens=True, 
            return_tensors='pt'
        )
        
        labels = summary_encoding["input_ids"]
        labels[labels==0] = -100
        
        return dict(
            text=text, 
            summary=summary, 
            text_input_ids=text_encoding['input_ids'].flatten(), 
            text_attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [5]:
class NewsSummaryDataModule(pl.LightningDataModule):
    def __init__(
        self,
        df_train: pd.DataFrame,
        df_test: pd.DataFrame,
        tokenizer: TransfoXLTokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128,
    ):
        super().__init__()
        self.df_train = df_train
        self.df_test = df_test
        
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        # self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        
    def setup(self, stage=None):
        self.train_dataset = NewsSummaryDataset(
            self.df_train,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        
        self.test_dataset = NewsSummaryDataset(
            self.df_test,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=-1
        )
    
    def valid_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=-1
        )

In [6]:
MODEL_NAME = 'transfo-xl-wt103'

tokenizer = TransfoXLTokenizer.from_pretrained(MODEL_NAME)

In [7]:
text_token_counts, summary_token_counts = [], []

for _, row in df_train.iterrows():
    text_token_count = len(tokenizer.encode(row['text']))
    text_token_counts.append(text_token_count)
    
    summary_token_count = len(tokenizer.encode(row['summary']))
    summary_token_counts.append(summary_token_count)

In [8]:
N_EPOCHS = 3
BATCH_SIZE = 4

data_module = NewsSummaryDataModule(df_train, df_test, tokenizer, batch_size=BATCH_SIZE)

In [9]:
class NewsSummaryModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = TransfoXLLMHeadModel.from_pretrained(MODEL_NAME, return_dict=True)
        
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask = attention_mask,
            labels = labels
        )
        
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']
        
        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )
        
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']
        
        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )
        
        self.log('validation_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['text_input_ids']
        attention_mask = batch['text_attention_mask']
        labels = batch['labels']
        labels_attention_mask = batch['labels_attention_mask']
        
        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )
        
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

In [10]:
model = NewsSummaryModel()

In [11]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [12]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger = TensorBoardLogger('lightning_logs', name='transformerXL-news-summary')

trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [13]:
trainer.fit(model, data_module)

  rank_zero_warn(f"you defined a {step_name} but have no {loader_name}. Skipping {stage} loop")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                 | Params
-----------------------------------------------
0 | model | TransfoXLLMHeadModel | 285 M 
-----------------------------------------------
285 M     Trainable params
0         Non-trainable params
285 M     Total params
1,140.821 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: -1it [00:00, ?it/s]

TypeError: forward() got an unexpected keyword argument 'attention_mask'