# GW -Bert
#### training time ~25 hrs

Importing dependencies

In [17]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from torch.utils.data import Dataset, DataLoader
import argparse
import json
import torch
from tqdm import tqdm
import pandas as pd
import json
import random
import argparse
from sklearn.metrics import precision_recall_fscore_support, classification_report, f1_score

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification, BigBirdConfig
import torch
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn import metrics

In [9]:
# Data helper functions
def get_dataset_splits(
    train_fn="/content/train.jsonl",
    dev_fn="/content/dev.jsonl",
    test_fn="/content/test.jsonl",
):
    """Load the train, validation and test splits from JSON Lines files.

    Every non-blank line of each file must be a JSON object that has a
    "text" key and a "label" key.

    Args:
        train_fn: Path to the training split.
        dev_fn: Path to the validation split.
        test_fn: Path to the test split.

    Returns:
        A 6-tuple ``(X_train, y_train, X_validation, y_validation, X_test,
        y_test)`` where each ``X_*`` is the list of "text" values and each
        ``y_*`` the list of "label" values, in file order.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the "text" or "label" key.
    """

    def load_data(file_path):
        texts, labels = [], []
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                item = json.loads(line)
                texts.append(item["text"])
                labels.append(item["label"])
        return texts, labels

    X_train, y_train = load_data(train_fn)
    X_validation, y_validation = load_data(dev_fn)
    X_test, y_test = load_data(test_fn)

    return X_train, y_train, X_validation, y_validation, X_test, y_test

def get_cv_splits():
    """Yield stratified 5-fold cross-validation splits over all the data.

    The train, validation and test splits returned by
    ``get_dataset_splits()`` are pooled, then re-split with a shuffled
    ``StratifiedKFold`` (5 folds, ``random_state=42``).

    Yields:
        ``(X_train, y_train, X_test, y_test)`` numpy arrays, one tuple per fold.
    """
    # Imported here because the file's import block provides neither name;
    # without these the function fails with NameError on first use.
    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()
    X = np.array(X_train + X_validation + X_test)
    y = np.array(y_train + y_validation + y_test)

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    for train_index, test_index in skf.split(X, y):
        yield X[train_index], y[train_index], X[test_index], y[test_index]

def round_float(number):
    """Format a fraction as a percentage string with one decimal place.

    The value is rounded to three decimals, scaled by 100, and rounded again
    to one decimal. The second rounding removes the floating-point noise the
    multiplication introduces (e.g. ``0.57 * 100 == 56.99999999999999``).

    Args:
        number: The value to format, e.g. a metric in ``[0, 1]``.

    Returns:
        The percentage as a string, e.g. ``"85.7"`` for ``0.8571``.
    """
    return str(round(round(number, 3) * 100, 1))




In [2]:
class SequenceClassificationDataset(Dataset):
    """Dataset of unlabeled text sequences, tokenized one batch at a time.

    NOTE: a later class in this file has the same name but takes
    ``(x, y, tokenizer)``; once that definition runs it rebinds the name.
    """

    def __init__(self, x, tokenizer):
        """Store the raw sequences, the tokenizer and the target device."""
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"
        self.device = target_device
        self.tokenizer = tokenizer
        self.examples = x

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the raw sequence stored at position ``idx``."""
        return self.examples[idx]

    def collate_fn(self, batch):
        """Tokenize a batch of sequences and move the tensors to ``self.device``.

        Sequences are padded to a common length and truncated to at most 512
        tokens. Returns a dict with the single key "model_inputs".
        """
        texts = list(batch)
        encoded = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        return {"model_inputs": encoded.to(self.device)}


In [10]:
def evaluate(gold, predictions):
    """Summarize predictions as a " & "-separated row of percentages.

    Args:
        gold: The true labels.
        predictions: The predicted labels.

    Returns:
        Precision, recall, F1 and accuracy (in that order), each formatted by
        ``round_float`` and joined with " & ".
    """
    scorers = (
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
        metrics.accuracy_score,
    )
    return " & ".join(round_float(scorer(gold, predictions)) for scorer in scorers)

In [11]:
class SequenceClassificationDataset(Dataset):
    """Dataset of (sequence, label) pairs, tokenized one batch at a time."""

    def __init__(self, x, y, tokenizer):
        """Pair each sequence in ``x`` with its label in ``y``."""
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"
        self.device = target_device
        self.tokenizer = tokenizer
        # zip stops at the shorter input, so surplus items in either are dropped.
        self.examples = [pair for pair in zip(x, y)]

    def __len__(self):
        """Return the number of (sequence, label) pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the (sequence, label) tuple stored at position ``idx``."""
        return self.examples[idx]

    def collate_fn(self, batch):
        """Turn a list of (sequence, label) pairs into model-ready tensors.

        Sequences are padded to a common length and truncated to at most 512
        tokens. Both the encoded inputs and the labels are moved to
        ``self.device``.

        Returns:
            A dict with keys "model_inputs" (tokenizer output) and "label"
            (a tensor of the batch labels).
        """
        texts = [pair[0] for pair in batch]
        encoded = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        encoded = encoded.to(self.device)

        label_tensor = torch.tensor([pair[1] for pair in batch])
        label_tensor = label_tensor.to(self.device)

        return {"model_inputs": encoded, "label": label_tensor}


In [12]:
def evaluate_epoch(model, dataset, batch_size=None):
    """Run the model over ``dataset`` and collect labels and predictions.

    Args:
        model: A model with an ``eval()`` method that can be called with the
            batch's "model_inputs" as keyword arguments and returns an object
            with a ``logits`` attribute.
        dataset: A dataset with a ``collate_fn`` producing dicts that hold
            "model_inputs" and "label".
        batch_size: Evaluation batch size. When ``None`` (the default) the
            module-level ``args.batch_size`` is used, as before.

    Returns:
        A tuple ``(targets, outputs, probs)`` of lists: the true labels (as
        floats), the argmax class indices, and the softmax probability of
        class index 1.
    """
    if batch_size is None:
        # Backward-compatible fallback to the script-level argument namespace.
        batch_size = args.batch_size

    # Evaluation mode: disables dropout and similar training-only behavior.
    model.eval()

    targets, outputs, probs = [], [], []
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in loader:
            logits = model(**batch["model_inputs"]).logits

            targets.extend(batch['label'].float().tolist())
            outputs.extend(logits.argmax(dim=1).tolist())
            probs.extend(logits.softmax(dim=1)[:, 1].tolist())

    return targets, outputs, probs


In [13]:
def train_model(trainset, model_name):
    """Fine-tune a pre-trained two-label sequence classifier on ``trainset``.

    Hyper-parameters are read from the module-level ``args`` namespace:
    ``batch_size``, ``num_epochs``, ``gradient_accumulation_steps``,
    ``learning_rate`` and ``adam_epsilon``.

    Args:
        trainset: Dataset with a ``collate_fn`` whose batches are dicts
            holding "model_inputs" and "label".
        model_name: Model name or path passed to ``from_pretrained``.

    Returns:
        The fine-tuned model.
    """
    # Set device (cuda or cpu)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load configuration for the specified pre-trained model
    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = 2  # Assuming it's a binary classification task
    config.gradient_checkpointing = True  # Enables gradient checkpointing for memory efficiency

    # Load or initialize the pre-trained sequence classification model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, config=config, cache_dir="../../transformer_models/").to(device)

    # Set up training parameters
    warmup_steps = 0
    accumulation_steps = args.gradient_accumulation_steps
    train_dataloader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, collate_fn=trainset.collate_fn)
    batches_per_epoch = len(train_dataloader)
    # Ceiling division: a trailing partial accumulation window still yields
    # one optimizer step, so the scheduler must be sized to match.
    updates_per_epoch = (batches_per_epoch + accumulation_steps - 1) // accumulation_steps
    t_total = updates_per_epoch * args.num_epochs

    # Set up optimizer and scheduler
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # NOTE: both groups use weight_decay 0.0, so the split currently has no
    # effect; it is kept so a non-zero decay can be set on the first group.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    # Zero gradients and set up gradient scaler
    model.zero_grad()
    optimizer.zero_grad()
    # Mixed precision only makes sense on CUDA; requesting it on CPU just
    # triggers warnings before PyTorch disables it anyway.
    use_amp = device == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # Training loop
    for epoch in range(args.num_epochs):
        model.train()
        t = tqdm(train_dataloader)
        for i, batch in enumerate(t):
            with torch.cuda.amp.autocast(enabled=use_amp):
                output = model(**batch["model_inputs"], labels=batch['label'])
                # Scale the loss so the accumulated gradient is an average.
                loss = output.loss / accumulation_steps

            # Backward pass
            scaler.scale(loss).backward()

            # Step at the end of each accumulation window, and also on the
            # epoch's last batch so leftover gradients are applied rather
            # than leaking into the next epoch or being discarded.
            is_last_batch = (i + 1) == batches_per_epoch
            if (i + 1) % accumulation_steps == 0 or is_last_batch:
                # Unscale before clipping so the threshold applies to true gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()

    return model


In [None]:
import os
import argparse
import torch
from transformers import AutoTokenizer, AutoConfig
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup

# Assuming you have defined or imported the necessary functions and classes like get_dataset_splits,
# SequenceClassificationDataset, train_model, evaluate_epoch, and evaluate.

def main(args):
    """Train a classifier, evaluate it on dev and test, and print one result row.

    Args:
        args: Namespace providing ``model_name`` here, plus the training
            settings that ``train_model`` and ``evaluate_epoch`` read from
            the module-level ``args``.

    Returns:
        A tuple ``(model, tokenizer)`` with the trained model and the
        tokenizer that was used.
    """
    import warnings

    model_name = args.model_name
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as err:
        # Best-effort fallback is kept, but only for ordinary errors (a bare
        # except would also swallow KeyboardInterrupt), and it is reported
        # because a mismatched tokenizer can silently degrade results.
        warnings.warn(
            f"Could not load a tokenizer for {model_name!r} ({err}); "
            "falling back to 'roberta-base'."
        )
        tokenizer = AutoTokenizer.from_pretrained("roberta-base")

    # Row format: "<model> & <dev metrics> & <test metrics> \\ "
    out_str = os.path.basename(model_name) + " & "

    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()
    trainset = SequenceClassificationDataset(X_train, y_train, tokenizer)
    devset = SequenceClassificationDataset(X_validation, y_validation, tokenizer)
    model = train_model(trainset, model_name)

    # Evaluate dev set
    targets, outputs, probs = evaluate_epoch(model, devset)
    out_str += evaluate(targets, outputs) + " & "

    # Evaluate test set
    testset = SequenceClassificationDataset(X_test, y_test, tokenizer)
    targets, outputs, probs = evaluate_epoch(model, testset)

    out_str += evaluate(targets, outputs) + r" \\ "

    print(out_str)
    return model, tokenizer

if __name__ == "__main__":
    # Assuming you have already mounted Google Drive
    # If not, you can do it using:
    # from google.colab import drive
    # drive.mount('/content/drive')

    parser = argparse.ArgumentParser()
    parser.add_argument('--save_path', type=str, default="/content/drive/MyDrive/envclaim-distilroberta", help='Folder to save the weights')
    # The arguments below are all read by main, train_model or evaluate_epoch;
    # without them the script fails with AttributeError.
    # NOTE(review): the default values are assumptions (common fine-tuning
    # settings; model name guessed from the default --save_path) - confirm
    # them against the intended experiment.
    parser.add_argument('--model_name', type=str, default="distilroberta-base", help='Pre-trained model name or path')
    parser.add_argument('--num_epochs', type=int, default=3, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Batch size for training and evaluation')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help='Batches to accumulate per optimizer step')
    parser.add_argument('--learning_rate', type=float, default=2e-5, help='Initial learning rate')
    parser.add_argument('--adam_epsilon', type=float, default=1e-8, help='Epsilon for the AdamW optimizer')
    parser.add_argument('--do_save', action='store_true', help='Save the model and tokenizer to --save_path')

    # parse_known_args tolerates the extra arguments a notebook kernel puts
    # on the command line (e.g. "-f <kernel.json>"), where parse_args would exit.
    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        print(f"Ignoring unrecognized arguments: {unknown_args}")

    model, tokenizer = main(args)

    if args.do_save:
        # Save to Google Drive
        model.save_pretrained(args.save_path)
        tokenizer.save_pretrained(args.save_path)
