In [None]:
# Mount our Google Drive folder when running on Google Colab.
# On any other host (e.g. JupyterHub) the google.colab package does not exist,
# so the mount is skipped instead of aborting a "Run All" on the first cell.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available - skipping Google Drive mount")

if drive is not None:
    drive.mount('/content/drive')

# Installation Steps

In [None]:
#Installing PyTorch
!pip3 install --quiet torch torchvision torchaudio

#Install HuggingFace Transformer
!pip install --quiet transformers

#Install tqdm for progress bars
!pip install --quiet tqdm

#Essential for automodels where models are not directly from huggingface pre-trained libs
!pip install --quiet sentencepiece

#Install pytorch lightning
!pip install --quiet git+https://github.com/PyTorchLightning/pytorch-lightning

#Install tqdm for progress bar
!pip install --quiet tqdm

#Install pandas
!pip install --quiet pandas

#Install sklearn
!pip install --quiet sklearn

#Install termcolor
!pip install --quiet termcolor

#Install tensorflow
!pip install --quiet tensorflow

## Specifically for JupyterHub

In [None]:
# for JupyterHub
# Installs pinned torch / torchvision builds carrying the "+cu111" local version tag
# (plus torchaudio 0.9.0); -f adds the PyTorch wheel index where those builds are hosted.
!pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html


Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [None]:
# Pin pytorch-lightning to release 1.1.3.
# NOTE(review): the following cell upgrades pytorch-lightning again, which overrides
# this pin - confirm whether this cell is still needed.
!pip install --quiet pytorch-lightning==1.1.3

In [None]:
# Upgrade pytorch-lightning to the newest available release, then upgrade pip itself.
!pip install --quiet --upgrade pytorch-lightning
!pip install --quiet --upgrade pip


# Verify Installations

In [None]:
# Install python3 through apt (requires sudo rights on the host).
# NOTE(review): the kernel already runs Python, so this may be unnecessary - confirm.
!sudo apt install python3

In [None]:
#To test if HuggingFace is properly installed
# Runs a one-off sentiment-analysis pipeline in a separate python process and prints its result.
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# Load T5X Model

Initialize and retrieve model for training

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, AdamW

import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from termcolor import colored
from tqdm import tqdm


# Hugging Face model id of the pretrained checkpoint that will be fine-tuned.
checkpoint = "google/t5-v1_1-large" #"google/t5-v1_1-xl"  #"google/t5-v1_1-xl" is too huge for google colab to handle

# Load the seq2seq model and its tokenizer from that same checkpoint id.
# Both names are module-level globals that later cells rely on.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


  from .autonotebook import tqdm as notebook_tqdm


# Further Data Preprocessing

## Fetch data from filtered csvs

In [None]:
# Locations of the filtered train / dev / test CSV files, relative to the working directory.
trainPath = "./filtered_data/train.csv"
devPath = "./filtered_data/dev.csv"
testPath = "./filtered_data/test.csv"

# For Google Colab
# trainPath = "./drive/MyDrive/filtered_data/train.csv"
# devPath = "./drive/MyDrive/filtered_data/dev.csv"
# testPath = "./drive/MyDrive/filtered_data/test.csv"

# Read each split into its own DataFrame; later cells modify these frames in place.
dfTrain = pd.read_csv(trainPath)
dfDev = pd.read_csv(devPath)
dfTest = pd.read_csv(testPath)

# Preview the first rows of the training split.
dfTrain.head()

Unnamed: 0,claim,justification,questions
0,"""Unemployment is low because everyone has two ...","Ocasio-Cortez said, ""Unemployment is low becau...",['Can low unemployment rates be attributed to ...
1,Says the impeachment of President Donald Trump...,"Gigot said if Trump is impeached, this would b...",['Have any presidents who have been impeached ...
2,"""When I withdrew in June of 2008, polls were s...","Clinton said, ""When I withdrew in June of 2008...",['Did Obama lose this level of potential suppo...
3,Text from someone “connected” to Raleigh polic...,A text message circulating through the Raleigh...,['Did this information come from the police of...
4,"“Some states, like Montana and Nebraska, are g...","Cuomo tweeted, ""Some states, like Montana and ...","['Are some states getting $300,000 in federal ..."


## Optional code to verify dataframe

In [None]:
#Questions are of type str
# i.e. each cell holds the textual repr of a Python list, not an actual list object,
# which is why the next section strips the brackets/quotes by string slicing.
print(type(dfTrain.loc[0, 'questions']))
dfTrain.loc[0, 'questions']

<class 'str'>


"['Can low unemployment rates be attributed to everyone having 2 jobs?', 'Is unemployment even currently low because of factors stated by Ocasio Cortez?', 'Can low unemployment be attributed to long work hours?', 'Do most people in this country work two jobs?', 'Do most people in this country work up to 80 hours a week?', 'Have rates for either of these factors reached a historic high?', 'Do government organization use either of these factors in evaluating unemployment?']"

## Preprocess Question strings

In [None]:
# Remove starting [' and ending '] and add custom divider between sub-questions
def preprocessQuestions(df, divider, toReplace):
  """Flatten stringified question lists into plain text, in place.

  Each entry is expected to look like "['q1', 'q2']". The first two and last
  two characters are dropped, and every occurrence of `toReplace` (the
  separator between quoted items) is replaced by `divider`.

  Args:
    df: pandas Series of strings; it is modified in place.
    divider: Text inserted between sub-questions.
    toReplace: Separator text to replace, e.g. "', '".

  Returns:
    None. The Series passed in is mutated.
  """
  for pos in tqdm(range(len(df))):
    # Positional access (.iloc) so the function also works when the Series
    # index is not the default 0..n-1 range (e.g. after filtering rows).
    rawQuestions = df.iloc[pos]
    df.iloc[pos] = rawQuestions[2:-2].replace(toReplace, divider)
    

# Sub-questions are joined with a single space; "', '" is the separator between
# quoted items in the stringified list.
qDivider = " "
toReplace = "', '"

# Apply the clean-up to the "questions" column of every split.
# NOTE(review): the function mutates the Series returned by .loc; whether that writes
# through to the DataFrame depends on the pandas version / copy-on-write mode - confirm.
print("\nProcessing train df questions")
preprocessQuestions(dfTrain.loc[ : , "questions"], qDivider, toReplace)
print("\nProcessing dev df questions")
preprocessQuestions(dfDev.loc[ : , "questions"], qDivider, toReplace)
print("\nProcessing test df questions")
preprocessQuestions(dfTest.loc[ : , "questions"], qDivider, toReplace)



Processing train df questions


100%|██████████| 793/793 [00:00<00:00, 11830.87it/s]



Processing dev df questions


100%|██████████| 197/197 [00:00<00:00, 12751.79it/s]



Processing test df questions


100%|██████████| 200/200 [00:00<00:00, 12886.52it/s]


In [None]:
# Prepend prefix to claims
def addPrefix(df, prefix):
  """Prepend `prefix` to every string of a Series, in place.

  Note that the operation is not idempotent: running it twice on the same
  Series prepends the prefix twice.

  Args:
    df: pandas Series of strings; it is modified in place.
    prefix: Text placed in front of every entry.

  Returns:
    None. The Series passed in is mutated.
  """
  for pos in tqdm(range(len(df))):
    # Positional access (.iloc) so a non-default index does not break the loop.
    df.iloc[pos] = prefix + df.iloc[pos]
    
# Task prefix placed in front of every claim.
prefix = "Generate Questions: "

# Applied to the "claim" column of each split (the log messages below say "questions",
# but it is the claim text that gets the prefix).
# Re-running this cell prepends the prefix a second time.
print("\nAdding prefix for train df questions")
addPrefix(dfTrain.loc[ : , "claim"], prefix)
print("\nAdding prefix for dev df questions")
addPrefix(dfDev.loc[ : , "claim"], prefix)
print("\nAdding prefix for test df questions")
addPrefix(dfTest.loc[ : , "claim"], prefix)



Adding prefix for train df questions


100%|██████████| 793/793 [00:00<00:00, 11959.34it/s]



Adding prefix for dev df questions


100%|██████████| 197/197 [00:00<00:00, 13394.90it/s]



Adding prefix for test df questions


100%|██████████| 200/200 [00:00<00:00, 13713.82it/s]


In [None]:
# Prepend prefix to justifications
# Marks where the justification text starts once it is paired with the claim.
# Re-running this cell prepends the prefix a second time.
jPrefix = "Justification: "

print("\nAdding prefix for train df justification")
addPrefix(dfTrain.loc[ : , "justification"], jPrefix)
print("\nAdding prefix for dev df justification")
addPrefix(dfDev.loc[ : , "justification"], jPrefix)
print("\nAdding prefix for test df justification")
addPrefix(dfTest.loc[ : , "justification"], jPrefix)


Adding prefix for train df justification


100%|██████████| 793/793 [00:00<00:00, 11714.82it/s]



Adding prefix for dev df justification


100%|██████████| 197/197 [00:00<00:00, 13181.85it/s]



Adding prefix for test df justification


100%|██████████| 200/200 [00:00<00:00, 13063.93it/s]


In [None]:
# Check dataset
# Claims and justifications should now carry their prefixes, and questions should be
# plain text without the surrounding list brackets/quotes.
dfTrain.head()

Unnamed: 0,claim,justification,questions
0,"Generate Questions: ""Unemployment is low becau...","Justification: Ocasio-Cortez said, ""Unemployme...",Can low unemployment rates be attributed to ev...
1,Generate Questions: Says the impeachment of Pr...,Justification: Gigot said if Trump is impeache...,Have any presidents who have been impeached in...
2,"Generate Questions: ""When I withdrew in June o...","Justification: Clinton said, ""When I withdrew ...",Did Obama lose this level of potential support...
3,Generate Questions: Text from someone “connect...,Justification: A text message circulating thro...,Did this information come from the police of t...
4,"Generate Questions: “Some states, like Montana...","Justification: Cuomo tweeted, ""Some states, li...","Are some states getting $300,000 in federal st..."


## Create Torch Dataset Class for our custom dataframe

In [None]:
class ClaimDecompDataset(Dataset):
  """Torch dataset that tokenizes claim/justification pairs and their target questions.

  Each item is built from one row of `data`, which must provide the columns
  "claim", "justification" and "questions".

  Args:
    data: DataFrame holding the three text columns.
    tokenizer: Callable tokenizer; it is stored on the instance and used for
      every item (the notebook-level `tokenizer` global is no longer consulted).
    source_max_tok_len: Padded/truncated length of the encoder input.
    target_max_tok_len: Padded/truncated length of the labels.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "AutoTokenizer",
      source_max_tok_len: int = 850,
      target_max_tok_len: int = 500
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_tok_len
    self.target_max_token_len = target_max_tok_len

  def __len__(self):
    """Return the number of rows in the wrapped DataFrame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Tokenize the row at position `index`.

    Returns:
      dict with the raw "claim", "justification" and "questions" strings plus
      flattened "input_ids", "attention_mask" and "labels" tensors.
    """
    sample_row = self.data.iloc[index]

    # Claim and justification are encoded as a pair; only the justification
    # (second segment) may be truncated to fit the source length.
    source_encoding = self.tokenizer(
        sample_row["claim"],
        sample_row["justification"],
        max_length = self.source_max_token_len,
        padding="max_length",
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    target_encoding = self.tokenizer(
        sample_row["questions"],
        max_length = self.target_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Take the padding id from the tokenizer; fall back to 0 (the value that
    # used to be hard-coded) when the tokenizer does not define one.
    pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
    if pad_token_id is None:
      pad_token_id = 0

    # Padding positions are replaced by -100 in the labels.
    labels = target_encoding["input_ids"]
    labels[labels == pad_token_id] = -100

    return dict(
      claim = sample_row["claim"],
      justification = sample_row["justification"],
      questions = sample_row["questions"],
      input_ids = source_encoding["input_ids"].flatten(),
      attention_mask = source_encoding["attention_mask"].flatten(),
      labels = labels.flatten()
    )

# Fine-tuning our T5X Model

### Initialize Dataset for Alternative 1

In [None]:
# Wrap each split in the tokenizing dataset, using the class's default max token lengths.
train_dataset = ClaimDecompDataset(dfTrain, tokenizer)
dev_dataset = ClaimDecompDataset(dfDev, tokenizer)
test_dataset = ClaimDecompDataset(dfTest, tokenizer)

# Show the (rows, columns) shape of the dev DataFrame as a quick sanity check.
dev_dataset.data.shape

The following cell contains code to observe what fields each data entry in the dataset contains

In [None]:
#Preview data in custom constructed dataset
# Only the first item is shown: the loop breaks after one iteration.
for data in dev_dataset:
  print(data["claim"])
  print("\n")
  print(data["justification"])
  print("\n")
  print(data["questions"])
  print("\n")
  # First ten token ids of the encoder input and of the labels.
  print(data["input_ids"][:10])
  print(data["labels"][:10])
  print("\n")
  # NOTE(review): the dataset replaces padding ids with -100 in the labels; confirm the
  # tokenizer can decode those ids.
  print(tokenizer.decode(data["labels"].squeeze()))
  break
  

### Alternative 1

Using HuggingFace Transformer Trainer classes

In [None]:
def compute_metrics(p):
    """Compute accuracy, recall and F1 from a (predictions, labels) pair.

    Works for sequence-to-sequence outputs as well as plain classification:
    the predicted class is the argmax over the LAST axis of the logits, both
    arrays are flattened, and positions whose label is -100 are ignored.

    Args:
        p: Pair `(predictions, labels)`. `predictions` is an array of logits,
            or a tuple whose first element is that array. `labels` is an array
            of integer class / token ids, with -100 marking ignored positions.

    Returns:
        dict with the keys "accuracy", "recall" and "f1".
    """
    pred, labels = p

    # Some models return several arrays; the logits come first.
    if isinstance(pred, tuple):
        pred = pred[0]

    pred = np.argmax(np.asarray(pred), axis=-1).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # Drop ignored positions so they do not count as errors.
    keep = labels != -100
    pred = pred[keep]
    labels = labels[keep]

    # Binary averaging (the previous behaviour) is only valid for 0/1 labels;
    # token ids are multi-class and need an explicit averaging mode.
    average = "binary" if set(np.unique(labels).tolist()) <= {0, 1} else "macro"

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average, zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average, zero_division=0)

    return {"accuracy": accuracy, "recall": recall, "f1": f1}

In [None]:
# Training configuration for the HuggingFace Trainer: evaluate every 100 steps, batch size 1
# for both training and evaluation, 2 epochs, fixed seed, and reload the best checkpoint
# once training finishes. Outputs are written to ./output.
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    seed=0,
    load_best_model_at_end=True,
)

# Trains the globally loaded `model` on the train split and evaluates on the dev split.
# NOTE(review): the name `trainer` is re-bound to a pytorch-lightning Trainer in the
# Alternative 2 section further down.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before training starts.
import torch
torch.cuda.empty_cache()

In [None]:
# Run the HuggingFace Trainer fine-tuning loop configured above.
trainer.train()

In [None]:
# Persist the fine-tuned model under output/claimDecompModelV1.
trainer.save_model("output/claimDecompModelV1")

In [None]:
# Reload the model that was just saved, to check it can be restored from disk.
modelz = AutoModelForSeq2SeqLM.from_pretrained("output/claimDecompModelV1")

### Alternative 2

Creating our own lightning module

#### Building our lightning module

In [None]:
class ClaimDecompDataModule(pl.LightningDataModule):
  """Lightning data module that serves ClaimDecompDataset loaders.

  Args:
    train_df: DataFrame used for the training loader.
    test_df: DataFrame used for BOTH the validation and the test loader.
    tokenizer: Tokenizer forwarded to ClaimDecompDataset.
    batch_size: Batch size of the training loader; the validation and test
      loaders always use a batch size of 1.
    source_max_token_len: Max token length of the encoder input.
    target_max_token_len: Max token length of the labels.
    num_workers: Worker processes per DataLoader. Defaults to 4, the value
      that used to be hard-coded in every loader.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: AutoTokenizer,
      batch_size: int = 4,
      source_max_token_len: int = 850,
      target_max_token_len: int = 500,
      num_workers: int = 4
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
    self.num_workers = num_workers

  def _build_dataset(self, df: pd.DataFrame):
    """Wrap `df` in a ClaimDecompDataset using this module's tokenizer and lengths."""
    return ClaimDecompDataset(
        df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

  def _eval_dataloader(self):
    """Loader over the evaluation dataset: batch size 1, no shuffling."""
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = self.num_workers
    )

  def setup(self, stage=None):
    """Create the training and evaluation datasets; `stage` is accepted but not used."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers = self.num_workers
    )

  def val_dataloader(self):
    """Validation loader; it reads the same dataset as the test loader."""
    return self._eval_dataloader()

  def test_dataloader(self):
    """Test loader; it reads the same dataset as the validation loader."""
    return self._eval_dataloader()


#### Establish key parameters

In [None]:
# Training batch size and number of epochs for the Lightning run.
BATCH_SIZE = 1
N_EPOCHS = 6

# The dev split is passed as the module's `test_df`, so it backs both the validation
# and the test dataloaders of the data module.
data_module = ClaimDecompDataModule(dfTrain, dfDev, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

#### Build custom model as a PyTorch Lightning module

In [None]:
class ClaimDecompModel(pl.LightningModule):
  """Lightning wrapper around a pretrained seq2seq model for question generation.

  Args:
    model_name: Model id or path given to AutoModelForSeq2SeqLM.from_pretrained.
      Defaults to the checkpoint that used to be hard-coded.
    learning_rate: Learning rate of the AdamW optimizer. Defaults to the value
      that used to be hard-coded.
  """

  def __init__(self, model_name: str = "google/t5-v1_1-large", learning_rate: float = 0.0001):
    super().__init__()
    self.learning_rate = learning_rate
    self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  def forward(self, input_ids, attention_mask, labels = None):
    """Run the wrapped model and return `(loss, logits)`."""
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels
    )

    return output.loss, output.logits

  def _shared_step(self, batch, metric_name):
    """Compute the loss for one batch and log it under `metric_name`."""
    loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
    self.log(metric_name, loss, prog_bar = True, logger = True)
    return loss

  def training_step(self, batch, batch_idx):
    """Return the training loss of `batch`, logged as "train_loss"."""
    return self._shared_step(batch, "train_loss")

  def validation_step(self, batch, batch_idx):
    """Return the validation loss of `batch`, logged as "val_loss"."""
    return self._shared_step(batch, "val_loss")

  def test_step(self, batch, batch_idx):
    """Return the test loss of `batch`, logged as "test_loss"."""
    return self._shared_step(batch, "test_loss")

  def configure_optimizers(self):
    """Build the AdamW optimizer over all parameters.

    torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    weight_decay are set explicitly to the defaults of the deprecated class so
    the optimisation settings stay the same.
    """
    return torch.optim.AdamW(
        self.parameters(),
        lr=self.learning_rate,
        eps=1e-6,
        weight_decay=0.0
    )


In [None]:
# Instantiate the Lightning module; its constructor loads the pretrained weights.
customModel = ClaimDecompModel()

## Training our T5X Model

### Create checkpoint callback to save fine-tuned models

In [None]:
# Keep only the single best checkpoint (lowest "val_loss", the metric logged in
# validation_step) and write it to checkpoints/ under the name "best-checkpoint".
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

### Create trainer using pytorch lightning

In [None]:
# Lightning trainer with checkpointing and a progress bar that refreshes every step.
# Requests 2 GPU devices; adjust `devices` to the hardware actually available.
# NOTE(review): this re-binds `trainer`, the name used for the HuggingFace Trainer in Alternative 1.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback, TQDMProgressBar(refresh_rate=1)],
    max_epochs = N_EPOCHS,
    accelerator='gpu', 
    devices=2
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Load tensorboard to view training stats

In [None]:
# Delete any existing lightning_logs directory (the folder TensorBoard is pointed at below).
# This is destructive: logs of earlier runs are lost.
!rm -rf lightning_logs

In [None]:
# Print the metadata of the installed tensorflow package to confirm it is available.
!pip show tensorflow


In [None]:
# Set the TENSORBOARD_BINARY environment variable to the tensorboard executable to use.
# NOTE(review): the path below looks like a placeholder - replace it with the real
# location of the tensorboard executable in your environment.
import os
os.environ['TENSORBOARD_BINARY'] = '/path/to/envs/my_env/bin/tensorboard'

In [None]:
# Load the TensorBoard notebook extension and open it on the Lightning log directory.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

### Start training

In [None]:
# Fine-tune the Lightning module using the loaders provided by the data module.
trainer.fit(customModel, data_module)

## Load Fine-Tuned Model for Prediction

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Restore a previously fine-tuned model from a Lightning checkpoint file.
# NOTE(review): this file name differs from the "best-checkpoint" name written by the
# checkpoint callback above - confirm the file exists before running.
trainedModel = ClaimDecompModel.load_from_checkpoint("checkpoints/V7L_B1_E12_SM850_TM500.ckpt")
# trainedModel = ClaimDecompModel.load_from_checkpoint("drive/MyDrive/past_checkpoints/V7L_B1_E12_SM850_TM500.ckpt")

# Move the wrapped model to the chosen device and freeze the module for inference.
trainedModel.model.to(device)
trainedModel.freeze()

## Evaluate Fine-Tuned Model

In [None]:
def compute_metrics(model, tokenizer, evalDf, sourceMaxLen, targetMaxLen, p):
    """Compute accuracy, recall and F1 from a (predictions, labels) pair.

    NOTE: this definition replaces the one-argument `compute_metrics` defined
    earlier in the notebook once this cell has been run.

    The predicted class is the argmax over the LAST axis of the logits, both
    arrays are flattened, and positions whose label is -100 are ignored.

    Args:
        model, tokenizer, evalDf, sourceMaxLen, targetMaxLen: Accepted for
            backward compatibility only. They are not needed to compute the
            metrics; the dataset previously built from them was never used.
        p: Pair `(predictions, labels)`. `predictions` is an array of logits,
            or a tuple whose first element is that array. `labels` is an array
            of integer class / token ids, with -100 marking ignored positions.

    Returns:
        dict with the keys "accuracy", "recall" and "f1".
    """
    pred, labels = p

    # Some models return several arrays; the logits come first.
    if isinstance(pred, tuple):
        pred = pred[0]

    pred = np.argmax(np.asarray(pred), axis=-1).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # Drop ignored positions so they do not count as errors.
    keep = labels != -100
    pred = pred[keep]
    labels = labels[keep]

    # Binary averaging (the previous behaviour) is only valid for 0/1 labels;
    # token ids are multi-class and need an explicit averaging mode.
    average = "binary" if set(np.unique(labels).tolist()) <= {0, 1} else "macro"

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average, zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average, zero_division=0)

    return {"accuracy": accuracy, "recall": recall, "f1": f1}

# Check T5X Model Config Parameters

In [None]:
 # Display the configuration object of the wrapped fine-tuned model.
 trainedModel.model.config

T5Config {
  "_name_or_path": "google/t5-v1_1-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.23.1",
  "use_cache": true,
  "vocab_size": 32128
}

# Running the T5X for question generation

In [None]:
def predict(model, claim, justification):
  """Generate questions with the fine-tuned Lightning module and print them.

  Relies on the notebook-level `tokenizer` for encoding and decoding.

  Args:
    model: Object whose `.model` attribute is the seq2seq model to generate with.
    claim: Claim text (first segment of the encoded pair).
    justification: Justification text (second segment; the only one truncated).

  Returns:
    None. The first decoded sequence is printed with a "Response: " prefix.
  """
  claim_encoding = tokenizer(
      claim,
      justification,
      max_length = 850,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )
  # Put the inputs on the device the wrapped model actually lives on, instead
  # of a global `device` that may not match where the model was placed.
  claim_encoding = claim_encoding.to(model.model.device)

  prediction = model.model.generate(
      input_ids=claim_encoding["input_ids"],
      attention_mask=claim_encoding["attention_mask"],
      num_beams=1,
      max_length=800,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )

  decoded_res = [
      tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for pred in prediction
  ]
  print("Response: " + decoded_res[0])


def predictOrg(model, claim, justification):
  """Generate questions with a plain (non-Lightning) seq2seq model and print them.

  Relies on the notebook-level `tokenizer` for encoding and decoding.

  Args:
    model: Seq2seq model exposing `.generate` and `.device`.
    claim: Claim text (first segment of the encoded pair).
    justification: Justification text (second segment; the only one truncated).

  Returns:
    None. The first decoded sequence is printed with a "Response: " prefix.
  """
  claim_encoding = tokenizer(
      claim,
      justification,
      max_length = 850,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  # Put the inputs on the model's own device. Sending them to a global `device`
  # fails with a device mismatch when the model was never moved there
  # (e.g. a freshly loaded model that is still on the CPU).
  claim_encoding = claim_encoding.to(model.device)

  prediction = model.generate(
      input_ids=claim_encoding["input_ids"],
      attention_mask=claim_encoding["attention_mask"],
      num_beams=1,
      max_length=800,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )

  decoded_res = [
      tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for pred in prediction
  ]

  print("Response: " + decoded_res[0])

### Evaluating Our Fine-Tuned Model

In [None]:
claim = "Generate Questions: A Facebook post stated on January 31, 2021: Nancy Pelosi bought $1.25 million in Tesla stock the day before Joe Biden signed an order “for all federal vehicles” to be electric."

justification = """Justification: An image shared on Facebook claims that Nancy Pelosi bought $1.25 million in Tesla stock the day before Biden
signed an order for all federal vehicles to be electric, implying that she sought to profit from inside information about new
government policies. The House speaker did report transactions involving Tesla stock, but the post misrepresented the purchases
and Biden’s policies to create the false impression that the transactions represented improper insider trading in Tesla shares. The
statement contains an element of truth, but ignoring critical facts would give a different impression."""

c = "Generate Questions: The Arizona Forensic Audit of Maricopa County identified numerous anomalies, fraud, and Election law violations that are determinative, including 17,322 duplicate absentee ballot envelopes, which surged after the Election."

c2 = "Generate Questions: Donald Trump had a cup of coffee this morning and found that he was invited to the white house for the next election"

c3 = "Generate Questions: Putin has ordered the Russian invasion of Ukraine"

c5 = "Generate Questions: Boris Johnson just got elected as the new prime minister of the UK"

c5a = """Generate Questions: In a 2014 case involving a man convicted of abusing two underage girls, Judge Roy Moore disagreed and wrote the dissenting opinion"""
j5a = """Justification: An ad from the Jones campaign said that Moore disagreed and dissented in a case involving the abuse of two underage girls. There is no question that Moore dissented, but the ad provides no context for what Moore disagreed with. The glaring lack of detail leaves reasonable viewers with the impression that Moore disagreed with the conviction.
That’s not what he said. He disagreed with the court’s decision not to consider a legal question of admissible evidence. Several independent law professors told us that Moore raised a valid point that was legally separate from the underlying conviction in the case.
The ad leaves out critical context that gives a highly misleading impression. We rate this claim Mostly False."""

## Let's try generating questions from song lyrics
c6 = """Generate Questions: Welcome to Wonderland, I'll be your guide
Holding your hand under sapphire skies
Let's go exploring or we could just go for a walk
Welcome to Wonderland, where should we go
There's a tea party along down the road
Make an appearance and maybe they'll sing us a song"""


## More about otters
c7 = "Generate Questions: The otter population has been rebounding since the 1970s when Singapore started cleaning up its waterways. Their numbers are rising partially because otters face no significant danger from other predators and only encounter crocodiles in a wetlands mangrove in the city’s north-west."

# Alternative inputs, kept commented out; uncomment one to try it.
# predict(trainedModel, claim, justification)
# predict(trainedModel, c5a, j5a)

# A claim with no accompanying justification.
c4 = "Generate Questions: Our CS3103 professor got COVID positive and conducted lecture over zoom instead of in-person classes"

# Claim / justification pair used for the prediction below; both carry the same prefixes
# that were added to the training data.
politicalClaim = """Generate Questions: Says Sen. Jon Tester "believes that we should have a national registration so that the people of Montana should have to ask permission before they purchase a gun, ask permission from the federal government." """
polJustification = """Justification: Rosendale said Tester "believes that we should have a national registration so that the people of Montana should have to ask permission before they purchase a gun, ask permission from the federal government."
Tester’s record shows opposition to a federal list of lawfully-owned guns and gun owners. To claim he supports this is absurd.
We rate this Pants on Fire."""

# Generate and print questions with the fine-tuned model.
predict(trainedModel, politicalClaim, polJustification)
# predict(trainedModel, c4, "")



Response: Did Tester say that he should have a national registration before purchasing a gun? Does Tester believe that he should be allowed to buy a gun in Montana without any restrictions? Has Tester said that he should have a national registration before buying a gun?


### Evaluating with the original model

In [None]:
# Model id of the original (not fine-tuned) checkpoint used as a baseline.
version = "google/t5-v1_1-small" #"google/t5-v1_1-large" #"google/t5-v1_1-xl" is too huge for google colab to handle

ogmodel = AutoModelForSeq2SeqLM.from_pretrained(version)
# NOTE(review): predictOrg encodes/decodes with the global `tokenizer`, so `ogtokenizer`
# is loaded here but not used by the call below - confirm this is intended.
ogtokenizer = AutoTokenizer.from_pretrained(version)

# Generate questions with the baseline model for comparison with the fine-tuned output.
predictOrg(ogmodel, claim, justification)