In [1]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient() 

personal_key_for_api = user_secrets.get_secret("wandb")

! wandb login $personal_key_for_api

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
!pip install peft
!pip install rouge_score
!pip install bert_score
!pip install textstat

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6dd87f5ff7928d0d1ede87a47083d96c2ecf7181e3b0642572b6d90b326299e8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Download

In [3]:
import sys
import os
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import gensim
import json
import numpy as np
import pandas as pd
import sklearn
import scipy
from datasets import load_dataset, concatenate_datasets
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, TrainerCallback
from accelerate import Accelerator
from accelerate.data_loader import DataLoader
import peft
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from peft import PeftModel, PeftConfig
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from textstat import flesch_kincaid_grade, dale_chall_readability_score, coleman_liau_index

# Color scheme for comments (NEED TO INSTALL "Colorful Comments" vscode extension for this to work):
# 0) # Regular comments
# 1) #* Completed functions
# 2) #! Incomplete, needs to be done
# 3) #? Doubtful, needs to be checked
# 4) #^ Important points to note
# 5) #& Alternative options that could be tested
# 6) #~ Explaining ideas or concepts
# 7) #TODO todo tasks
# 8) #// redundant info


# Converting to GPU if available
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)


# The following class is for the purpose of printing the predicted summary and loss after each epoch (which is not available in the Trainer class by default)
class PrintSummaryCallback(TrainerCallback):
    def __init__(self, tokenizer, max_output_length, eval_dataloader):
        self.tokenizer = tokenizer
        self.max_output_length = max_output_length
        self.eval_dataloader = eval_dataloader

    def on_step_end(self, args, state, control, model=None, **kwargs):
        if self.eval_dataloader is not None:
            # Print predicted summary
            batch = next(iter(self.eval_dataloader))  # Get the first batch from the dataloader
            input_ids = batch["input_ids"].to(device)
            if model is not None and hasattr(model, 'generate'):
                outputs = model.generate(input_ids, max_length=self.max_output_length)
                summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(f"Predicted Summary: {summary}")
            elif model is None:
                print("Model is not provided.")
            else:
                print("Model does not have a generate method.")

            # Print loss
            if hasattr(state, 'log_history') and len(state.log_history) > 0 and 'loss' in state.log_history[-1]:
                train_loss = state.log_history[-1]["loss"]
                print(f"Iteration {state.global_step} - Train Loss: {train_loss}")
        else:
            print("Evaluation dataloader is not provided.")

class EvaluateSummaryCallback(TrainerCallback):
    def __init__(self, tokenizer,compute_metrics, max_output_length, eval_dataloader):
        self.tokenizer = tokenizer
        self.compute_metrics = compute_metrics
        self.max_output_length = max_output_length
        self.eval_dataloader = eval_dataloader

    def on_step_end(self, args, state, control, model=None, **kwargs):
        # Check if compute_metrics is available
        if self.compute_metrics is None:
            print("No compute_metrics function provided.")
            return

        # Evaluate the model the evaluation dataloader
        if self.eval_dataloader is not None and model is not None and hasattr(model, 'generate'):
            predictions = []
            labels = []
            
            # Get the first batch from the dataloader
            batch = next(iter(self.eval_dataloader))
            input_ids = batch["input_ids"].to(device)
            batch_predictions = model.generate(input_ids, max_length=self.max_output_length)

            # Pad the batch_predictions to max_output_length
            if batch_predictions.size(1) < self.max_output_length:
                batch_predictions = F.pad(batch_predictions, (0, self.max_output_length - batch_predictions.size(1)))

            # Decode the tensor to strings
            batch_predictions = [self.tokenizer.decode(g, skip_special_tokens=True) for g in batch_predictions]
            labels = [self.tokenizer.decode(l, skip_special_tokens=True) for l in batch["labels"]]
            
            predictions.extend(batch_predictions)
            labels.extend(labels)

            print(f"Number of predictions: {len(predictions)}")
            print(f"Number of labels: {len(labels)}")

            prediction_output = {"predictions": predictions, "label_ids": labels}
            metrics = self.compute_metrics(prediction_output)

            # Print the evaluation metrics
            print("Evaluation Metrics:")
            for metric_name, metric_value in metrics.items():
                print(f"{metric_name}: {metric_value}")


def compute_metrics(eval_pred):
    # Extracting predictions and labels
    generated_summaries = eval_pred["predictions"]
    reference_summaries = eval_pred["label_ids"]

    # ROUGE scores computation
    def compute_rouge_scores(hypotheses, references):
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []

        for hyp, ref in zip(hypotheses, references):
            scores = scorer.score(hyp, ref)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rouge2_scores.append(scores['rouge2'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)

        return {
            'rouge1': np.mean(rouge1_scores),
            'rouge2': np.mean(rouge2_scores),
            'rougeL': np.mean(rougeL_scores)
        }

    rouge_scores = compute_rouge_scores(generated_summaries, reference_summaries)

    # BERTScore computation
    def compute_bert_scores(hypotheses, references):
        P, R, F1 = bert_score(hypotheses, references, lang='en')
        return {
            'bert_P': P.mean().item(),
            'bert_R': R.mean().item(),
            'bert_F1': F1.mean().item()
        }

    bert_scores = compute_bert_scores(generated_summaries, reference_summaries)

    # Readability scores computation (FKGL, DCRS, CLI, LENS)
    def compute_readability_scores(hypotheses):
        # Compute FKGL, DCRS, CLI, LENS here
        # Example code for FKGL (You may need to install the 'textstat' library for FKGL and DCRS):
        # from textstat import flesch_kincaid_grade, dale_chall_readability_score, coleman_liau_index
        # fkgl_scores = [flesch_kincaid_grade(summary) for summary in hypotheses]
        # dcrs_scores = [dale_chall_readability_score(summary) for summary in hypotheses]
        # cli_scores = [coleman_liau_index(summary) for summary in hypotheses]
        # lens_scores = []  # Implement LENS computation
        fkgl_scores = []
        dcrs_scores = []
        cli_scores = []
        lens_scores = []  # Placeholder for LENS
        return {
            'fkgl': np.mean(fkgl_scores),
            'dcrs': np.mean(dcrs_scores),
            'cli': np.mean(cli_scores),
            'lens': np.mean(lens_scores)
        }

    readability_scores = compute_readability_scores(generated_summaries)

    # Factuality scores computation (AlignScore, SummaC)
    def compute_factuality_scores(hypotheses):
        # Compute AlignScore and SummaC here
        # Example code for AlignScore and SummaC:
        # align_scores = []  # Implement AlignScore computation
        # summac_scores = []  # Implement SummaC computation
        align_scores = []  # Placeholder for AlignScore
        summac_scores = []  # Placeholder for SummaC
        return {
            'align_score': np.mean(align_scores),
            'summac': np.mean(summac_scores)
        }

    factuality_scores = compute_factuality_scores(generated_summaries)

    return {
        'rouge': rouge_scores,
        'bert_score': bert_scores,
        'readability': readability_scores,
        'factuality': factuality_scores
    }

def preprocess_data(examples, tokenizer, max_input_length, max_output_length):
    # Tokenize input articles
    inputs = tokenizer(examples["article"], 
                       max_length=max_input_length, 
                       padding="max_length", 
                       truncation=True, 
                       return_tensors="pt")

    # Tokenize target lay summaries
    targets = tokenizer(examples["lay_summary"], 
                        max_length=max_output_length, 
                        padding="max_length", 
                        truncation=True, 
                        return_tensors="pt")

    # Assign labels to model inputs
    inputs["labels"] = targets.input_ids

    return inputs

def collate_fn(batch):
    inputs = [item["input_ids"] for item in batch]
    targets = [item["labels"] for item in batch]

    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)

    return {'input_ids': inputs, 'labels': targets}

def train(data_dir, model_save_path, model_name="google/flan-t5-base", max_input_length=512, max_output_length=128, num_epochs=6, lora_rank=8, lora_alpha=32, lora_dropout=0.1, train_batch_size=8, eval_batch_size=8, learning_rate=1e-4, weight_decay=0.01, logging_steps=10, eval_steps=100, save_steps=100):
    
    # load datasets from jsonl files and ignore the '\n' characters in the "article" attribute
    elife_dataset = load_dataset("json", data_files={
        "train": os.path.join(data_dir, "eLife_train.jsonl"),
        "validation": os.path.join(data_dir, "eLife_val.jsonl")
    })

    plos_dataset = load_dataset("json", data_files={
        "train": os.path.join(data_dir, "PLOS_train.jsonl"),
        "validation": os.path.join(data_dir, "PLOS_val.jsonl")
    })

    # Concatenate the datasets into a single dataset with both train and validation splits
    datasets_train = concatenate_datasets([elife_dataset["train"], plos_dataset["train"]])
    datasets_val = concatenate_datasets([elife_dataset["validation"], plos_dataset["validation"]])
    datasets = {"train": datasets_train, "validation": datasets_val}

    print("Loaded datasets")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model = model.to(device)  # Move model to GPU/CPU

    print("Loaded tokenizer and model")

    # Preprocess data
    datasets = {
    "train": datasets["train"].map(
        lambda x: preprocess_data(x, tokenizer, max_input_length, max_output_length), 
        batched=True),
    "validation": datasets["validation"].map(
        lambda x: preprocess_data(x, tokenizer, max_input_length, max_output_length), 
        batched=True),
    }

    datasets = {
    "train": datasets["train"].with_format("torch").map(
        lambda x: {k: torch.tensor(v).to(device) if isinstance(v, np.ndarray) else v for k, v in x.items()},
        batched=True),
    "validation": datasets["validation"].with_format("torch").map(
        lambda x: {k: torch.tensor(v).to(device) if isinstance(v, np.ndarray) else v for k, v in x.items()},
        batched=True),
    }

    print("Preprocessed data")

    # Set up PEFT configuration
    peft_config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        target_modules=["q", "v"],  # Adjust target modules according to your base model
        lora_dropout=lora_dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    # Prepare model for PEFT
    model = prepare_model_for_kbit_training(model)

    print("Prepared model for PEFT")

    # Set up accelerator
    accelerator = Accelerator()

    # Prepare dataloaders
    train_dataloader = DataLoader(datasets["train"], batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
    eval_dataloader = DataLoader(datasets["validation"], batch_size=eval_batch_size, collate_fn=collate_fn)
    
    for batch in train_dataloader:
        print(batch.keys())
        break

    print("Prepared dataloaders")
    
    # Prepare optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Prepare model, optimizer, and dataloaders for accelerator
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    print("Prepared model, optimizer, and dataloaders for accelerator")

    # Set up PEFT trainer
    peft_model = get_peft_model(model.to(device), peft_config)

    trainer = Trainer(
        model=peft_model.to(device),
        args=TrainingArguments(
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            learning_rate=learning_rate,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            logging_steps=logging_steps,
            evaluation_strategy="steps",
            eval_steps=eval_steps,
            save_strategy="steps",
            save_steps=save_steps,
#             report_to="none",
            output_dir=model_save_path,
        ),
        train_dataset=datasets["train"],
        eval_dataset=datasets["validation"],
        # compute_metrics=compute_metrics, # Should comment this out when submitting, to avoid unnecessary output and save time
        # callbacks=[PrintSummaryCallback(tokenizer, max_output_length, eval_dataloader), EvaluateSummaryCallback(tokenizer, compute_metrics,max_output_length, eval_dataloader)],
    )


    print("Set up PEFT trainer")

    # Train loop
    trainer.train()

    print("Training complete")

    # Save model
    trainer.save_model(model_save_path)

    print("Model saved")

def test(test_dir, model_load_path, predictions_save_path, model_name="google/flan-t5-small", max_input_length=512, max_output_length=128):
    # Load tokenizer and model
    peft_model_path = model_load_path
    config = PeftConfig.from_pretrained(peft_model_path)
    model =  AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(model, peft_model_path)
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

    # Load test datasets
    elife_dataset = load_dataset("json", data_files={
        "test": os.path.join(test_dir, "eLife_test.jsonl")
    })

    plos_dataset = load_dataset("json", data_files={
        "test": os.path.join(test_dir, "PLOS_test.jsonl")
    })

    # Find the number of examples in each dataset
    num_elife = len(elife_dataset["test"])
    num_plos = len(plos_dataset["test"])

    # Concatenate the datasets into a combined test dataset
    test_dataset = concatenate_datasets([elife_dataset["test"], plos_dataset["test"]])

    # Generate summaries
    summaries = []
    for example in test_dataset:
        inputs = tokenizer(example["article"], return_tensors="pt", truncation=True, max_length=max_input_length)
        summary_ids = model.generate(**inputs, max_length=max_output_length)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Save summaries
    with open(os.path.join(predictions_save_path, "elife.txt"), "w") as f:
        f.write("\n".join(summaries[:num_elife]))
    with open(os.path.join(predictions_save_path, "plos.txt"), "w") as f:
        f.write("\n".join(summaries[num_elife:]))
            
# "train" or "test" the model
if __name__ == '__main__':
    if device == "cuda":
        torch.cuda.empty_cache()
    print(f"Using {device} device")

    # For purpose of kaggle uncomment the following code block
    data_dir = '/kaggle/input/nlp-a3'
    model_save_path = '/kaggle/working/models'
    
    print("Training model")
    train(data_dir, model_save_path)
    
    
    # For general use (with run_model.sh script) uncomment the following code block
    
    # arg[1] contains whether training or testing
    # If arg[1] is train, arg[2] contains the path to data directory, arg[3] contains the path to save the model
#     if sys.argv[1] == "train":
#         data_dir = sys.argv[2]
#         model_save_path = sys.argv[3]

#         # Train the model
#         print("Training Model...")
#         train(data_dir, model_save_path)

#     # If arg[1] is test, then arg[2] contains path to the testfiles directory, arg[3] contains path to load model from, arg[4] contains path to save the predictions
#     if sys.argv[1] == "test":
#         print(sys.argv)
#         test_dir = sys.argv[2]
#         model_load_path = sys.argv[3]
#         predictions_save_path = sys.argv[4]

#         # Test the model
#         print("Testing Model...")
#         test(test_dir, model_load_path, predictions_save_path)

2024-04-25 20:13:36.553803: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 20:13:36.553927: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 20:13:36.682718: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using cuda device
Training model


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Loaded datasets


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded tokenizer and model


Map:   0%|          | 0/29119 [00:00<?, ? examples/s]

Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/29119 [00:00<?, ? examples/s]

Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Preprocessed data
Prepared model for PEFT
dict_keys(['input_ids', 'labels'])
Prepared dataloaders
Prepared model, optimizer, and dataloaders for accelerator


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Set up PEFT trainer


[34m[1mwandb[0m: Currently logged in as: [33mcs5210607[0m ([33mcs52106007[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240425_202135-zoys8ujr[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mapricot-sky-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cs52106007/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cs52106007/huggingface/runs/zoys8ujr[0m


Step,Training Loss,Validation Loss
100,2.5809,2.369788
200,2.625,2.334898
300,2.4108,2.315665
400,2.4428,2.301
500,2.4845,2.280922
600,2.4259,2.25586
700,2.4522,2.247963
800,2.3644,2.239647
900,2.4478,2.236814
1000,2.312,2.225851


Training complete
Model saved
