# Installs

In [1]:
!pip install pytorch_lightning
!pip install transformers
!pip install sentencepiece



# Imports

In [2]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import os
import time
import random
import argparse
import glob
import json
import logging
import re
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics
from itertools import chain
from string import punctuation
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Seed

In [3]:
# Set random seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Load data

In [4]:
from google.colab import drive
drive.mount('/content/drive')
from os.path import join as pjoin
base_dir = 'drive/MyDrive/-CS 505/Final Project'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data_dir = pjoin(base_dir, 'data/2017_English_final/GOLD/Subtask_A/')
# data_dir = '../data/2017_English_final/GOLD/Subtask_A/'

train_files = []
val_files = []
test_files = []

for root, dirs, files in os.walk(data_dir):
    for file_name in files:
        if 'train' in file_name and '.txt' in file_name:
            train_files.append(os.path.join(data_dir, file_name))
        if 'dev' in file_name and '.txt' in file_name:
            val_files.append(os.path.join(data_dir, file_name))
        if 'test' in file_name and '.txt' in file_name:
            test_files.append(os.path.join(data_dir, file_name))
        
train_data = []
train_labels = []
val_data = []
val_labels = []

sentiment_to_label = {'positive': 2, 'neutral': 1, 'negative': 0}

for file_path in train_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for l in lines:
            entries = l.split('\t')
            train_data.append(entries[2])
            train_labels.append(sentiment_to_label[entries[1]])
            
    
for file_path in val_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for l in lines:
            entries = l.split('\t')
            val_data.append(entries[2])
            val_labels.append(sentiment_to_label[entries[1]])
            

train_data = np.array(train_data)
train_labels = np.array(train_labels)
val_data = np.array(val_data)
val_labels = np.array(val_labels)
len(train_data)

16173

In [6]:
train_pos = []
train_neu = []
train_neg = []

for i in range(len(train_data)):
    if (train_labels[i] == 0):
        train_neg.append(train_data[i])
    elif (train_labels[i] == 1):
        train_neu.append(train_data[i])
    elif (train_labels[i] == 2):
        train_pos.append(train_data[i])
        
val_pos = []
val_neu = []
val_neg = []

for i in range(len(val_data)):
    if (val_labels[i] == 0):
        val_neg.append(val_data[i])
    elif (val_labels[i] == 1):
        val_neu.append(val_data[i])
    elif (val_labels[i] == 2):
        val_pos.append(val_data[i])

# Model for fine tuning

In [7]:
from transformers.utils.dummy_pt_objects import AutoModelForTableQuestionAnswering
from transformers.models import auto
class MT5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(MT5FineTuner, self).__init__()
    self.hparams.update(vars(hparams))
    
    self.model = MT5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
  def is_logger(self):
    return self.trainer.global_rank <= 0
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [8]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

# Parameters

In [9]:
args_dict = dict(
    data_dir=data_dir, # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path='google/mt5-base',
    tokenizer_name_or_path='google/mt5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=32,
    eval_batch_size=32,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

# Tokenize data

In [10]:
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')

In [11]:
ids_neg = tokenizer.encode('negative </s>')
ids_neu = tokenizer.encode('neutral </s>')
ids_pos = tokenizer.encode('positive </s>')
len(ids_neg), len(ids_pos), len(ids_neu)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


(3, 2, 2)

In [12]:
class TweetDataset(Dataset):
  def __init__(self, tokenizer, pos, neu, neg, max_len=512):
    self.pos_files = pos
    self.neu_files = neu
    self.neg_files = neg
    
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()
  
  def __len__(self):
    return len(self.inputs)
  
  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
  def _build(self):
    self._buil_examples_from_files(self.pos_files, 'positive')
    self._buil_examples_from_files(self.neu_files, 'neutral')
    self._buil_examples_from_files(self.neg_files, 'negative')
  
  def _buil_examples_from_files(self, lines, sentiment):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

    for text in lines:
      
      line = text.strip()
      line = REPLACE_NO_SPACE.sub("", line) 
      line = REPLACE_WITH_SPACE.sub("", line)
      line = line + ' </s>'

      target = sentiment + " </s>"

       # tokenize inputs
      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [line], max_length=self.max_len, padding='max_length', return_tensors="pt"
      )
       # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=2, padding='max_length', return_tensors="pt"
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [13]:
dataset = TweetDataset(tokenizer, val_pos, val_neu, val_neg,  max_len=512)
len(dataset)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


5653

In [14]:
def get_dataset(tokenizer, type_path, args):
    if type_path == "val":
        return TweetDataset(tokenizer, val_pos, val_neu, val_neg, max_len=512)
    if type_path == "train":
        return TweetDataset(tokenizer, train_pos, train_neu, train_neg, max_len=512)

# Training parameters

In [15]:
args_dict.update({'output_dir': 'mt5_tweet_sentiment', 'num_train_epochs':2})
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    amp_backend='apex',
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    enable_checkpointing=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

# Train model

In [16]:
#### Create model
model = MT5FineTuner(args)

In [17]:
trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
trainer.fit(model)

Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                        | Params
------------------------------------------------------
0 | model | MT5ForConditionalGeneration | 582 M 
------------------------------------------------------
582 M     Trainable params
0         Non-trainable params
582 M     Total params
2,329.605 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  cpuset_checked))


Training: 0it [00:00, ?it/s]

RuntimeError: ignored

In [None]:
model.model.save_pretrained('mt5_base_tweet_sentiment')

# Evaluate