In [1]:
#Installing the required libraries for working with transformers and model acceleration
!pip install transformers[torch]
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
#Importing necessary libraries and frameworks
import pandas as pd
import torch

import os

from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import get_linear_schedule_with_warmup, Trainer, TrainingArguments

import logging

In [3]:
#Raising the threshold of the "transformers" logger to ERROR so that
#informational and warning messages do not clutter the cell outputs
logging.getLogger(name="transformers").setLevel(level=logging.ERROR)

In [4]:
#Column layout of the literature review table, one entry per summarised aspect of a paper
columns = [
    'Paper',
    'Problem Addressed',
    'Methods Used',
    'Key Findings',
    'URL',
]

#Empty dataframe with that layout; rows are appended later
literature_review = pd.DataFrame(data=None, columns=columns)

In [5]:
#Helper that appends the summary of one paper to the literature review dataframe
def add_paper_summary(paper, problem, methods, findings, url):
    """Append a single paper summary as a new row of ``literature_review``.

    The arguments are stored in the order of the module-level ``columns`` list.
    The global ``literature_review`` is rebound to a new dataframe that contains
    the previous rows followed by the new one, with a freshly numbered index.

    Args:
        paper: Title of the paper.
        problem: Problem the paper addresses.
        methods: Methods used in the paper.
        findings: Key findings of the paper.
        url: Link to the paper.
    """
    global literature_review
    row_values = [paper, problem, methods, findings, url]
    new_row = pd.DataFrame([row_values], columns=columns)
    frames = [literature_review, new_row]
    literature_review = pd.concat(frames, ignore_index=True)
    
#Recording the T5 paper as a first example entry; the arguments are given
#positionally in the order paper, problem, methods, findings, url
add_paper_summary(
    "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer",  # paper
    "Investigate the capabilities and limitations of transfer learning in NLP using a unified model architecture.",  # problem
    "Uses the T5 model which treats every NLP problem as a text-to-text task.",  # methods
    "Shows that a single model can perform well across diverse NLP tasks, suggesting the efficacy of transfer learning.",  # findings
    "https://arxiv.org/abs/1910.10683",  # url
)

#Showing the dataframe to confirm that the row was stored
print(literature_review)

                                               Paper  \
0  Exploring the Limits of Transfer Learning with...   

                                   Problem Addressed  \
0  Investigate the capabilities and limitations o...   

                                        Methods Used  \
0  Uses the T5 model which treats every NLP probl...   

                                        Key Findings  \
0  Shows that a single model can perform well acr...   

                                URL  
0  https://arxiv.org/abs/1910.10683  


In [6]:
#Loading the pretrained BERT tokenizer together with the bare BERT encoder;
#.eval() returns the module itself, so evaluation mode is switched on inline
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').eval()

#Tokenizing a sample sentence into PyTorch tensors
text = "Here is some text to encode"
encoded_input = tokenizer(text, return_tensors='pt')

#Forward pass without gradient tracking, since nothing is trained in this cell
with torch.no_grad():
    outputs = model(**encoded_input)

#Keeping the hidden states of the final layer and displaying them
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)

tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3551,  0.0686,  0.6506],
         [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
         [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
         ...,
         [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
         [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
         [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]])


In [7]:
#Loading the stock bert-base-uncased configuration and building the
#masked-language-modeling (MLM) variant of BERT from the same checkpoint
config = BertConfig.from_pretrained("bert-base-uncased")

model = BertForMaskedLM.from_pretrained(
    "bert-base-uncased",
    ignore_mismatched_sizes=True,
    config=config,
)

In [8]:
#Custom dataset that tokenizes raw text on demand for language modeling
class MyDataset(Dataset):
    """Map-style dataset that turns each raw text into tokenizer tensors.

    Args:
        texts: Indexable collection of input strings.
        tokenizer: Callable tokenizer; invoked with ``padding``, ``truncation``
            and ``max_length`` keyword arguments and expected to return a mapping.
        max_length: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return its fields as tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        tensors = {}
        for field_name, field_values in encoding.items():
            tensors[field_name] = torch.tensor(field_values)
        return tensors

#Tokenizer plus a collator that applies masking for the MLM objective
#with a masking probability of 0.15
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

#Placeholder corpus wrapped in the custom dataset (replace with real training texts)
train_texts = ["Insert your training texts here..."]
train_dataset = MyDataset(train_texts, tokenizer, max_length=128)

#Pretrained masked-language-model checkpoint that gets fine-tuned below
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

#Batch size, number of epochs and output locations used by the Trainer
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

#Fine-tuning the model with the Trainer, then writing the result to disk
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("fine-tuned-bert")

***** Running training *****
  Num examples = 1
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 109514298


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in fine-tuned-bert\config.json


{'train_runtime': 71.8992, 'train_samples_per_second': 0.042, 'train_steps_per_second': 0.042, 'train_loss': 6.790072123209636, 'epoch': 3.0}


Model weights saved in fine-tuned-bert\pytorch_model.bin


In [9]:
#Loading the evaluation results from the csv file into the dataframe df
df = pd.read_csv(filepath_or_buffer='eval_results.csv')

#Listing the column names that are available in the dataset
print(df.columns)

Index(['runname', 'steps', 'agg_score', 'commonsense_qa/acc',
       'commonsense_qa/acc_norm', 'hellaswag/acc', 'hellaswag/acc_norm',
       'openbookqa/acc', 'openbookqa/acc_norm', 'piqa/acc', 'piqa/acc_norm',
       'siqa/acc', 'siqa/acc_norm', 'winogrande/acc', 'winogrande/acc_norm',
       'sciq/acc', 'sciq/acc_norm', 'arc/acc', 'arc/acc_norm', 'mmlu/acc',
       'mmlu/acc_norm'],
      dtype='object')


In [10]:
#Names of the dataframe columns used by the model:
#  input_column - column that contains the input text
#  label_column - column that contains the target values or labels
input_column, label_column = 'runname', 'agg_score'

In [11]:
#Splitting the data into training (90%) and testing (10%) datasets.
#A fixed random_state makes the split identical on every run, so the
#results below can be reproduced after restarting the kernel.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
#Defining another custom dataset class that pairs tokenized text with a float target
class TextDataset(Dataset):
    """Dataset yielding token ids, attention mask and a float target per dataframe row.

    Args:
        dataframe: Frame holding one sample per row; rows are accessed by position.
        tokenizer: Callable tokenizer whose result provides ``input_ids`` and
            ``attention_mask``.
        max_len: Length every sequence is padded or truncated to.
        text_column: Name of the column with the input text. When ``None`` the
            notebook-level ``input_column`` is looked up at access time.
        target_column: Name of the column with the target value. When ``None`` the
            notebook-level ``label_column`` is looked up at access time.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, text_column=None, target_column=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_column = text_column
        self.target_column = target_column

    def __getitem__(self, index):
        """Return ``{'ids', 'mask', 'targets'}`` tensors for the row at position ``index``."""
        # Explicit column names win; otherwise fall back to the notebook globals,
        # resolved lazily so later changes to them are still honoured.
        text_column = input_column if self.text_column is None else self.text_column
        target_column = label_column if self.target_column is None else self.target_column

        # Fetch the row once instead of once per column
        row = self.data.iloc[index]

        # Collapse runs of whitespace into single spaces before tokenizing
        text = " ".join(str(row[text_column]).split())

        # Calling the tokenizer directly replaces the legacy encode_plus method
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(row[target_column], dtype=torch.float),
        }

    def __len__(self):
        """Return the number of rows captured when the dataset was created."""
        return self.len

#Initializing the datasets for the training and testing sets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
training_set = TextDataset(train_df, tokenizer)
testing_set = TextDataset(test_df, tokenizer)

#Setting parameters for DataLoader; only the training data is shuffled so that
#evaluation visits the test rows in the same order on every run
train_params = {'batch_size': 16, 'shuffle': True}
test_params = {'batch_size': 16, 'shuffle': False}

#Creating DataLoader for training and testing
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#Setting up a BERT sequence model with a single output (num_labels = 1),
#moving it to the GPU when one is available and creating its optimizer
num_labels = 1
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

#Training the sequence model for one epoch with a mean squared error loss
def train(epoch):
    """Run one pass over ``training_loader`` and update ``model`` after every batch.

    Uses the notebook-level ``model``, ``training_loader``, ``optimizer`` and
    ``device``. The loss of every 500th batch (starting with the first) is printed.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    model.train()
    for step, batch in enumerate(training_loader):
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=ids, attention_mask=mask)

        # squeeze(-1) removes only the trailing label axis. A bare squeeze() would
        # also drop the batch axis of a single-sample batch, leaving a 0-d tensor
        # whose shape no longer matches the targets.
        predictions = outputs.logits.squeeze(-1)
        loss = torch.nn.functional.mse_loss(predictions, targets)
        loss.backward()
        optimizer.step()

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

#Running three training epochs; each call makes one full pass over training_loader
for epoch in range(3):
    train(epoch)

def validate():
    """Collect targets and model predictions for every sample of ``testing_loader``.

    Uses the notebook-level ``model``, ``testing_loader`` and ``device``. The model
    is put into evaluation mode and gradients are not tracked.

    Returns:
        A ``(fin_targets, fin_outputs)`` tuple of lists holding one target value
        and one predicted value per sample, in the order the loader yields them.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            targets = batch['targets'].to(device)
            outputs = model(input_ids=ids, attention_mask=mask)

            # squeeze(-1) keeps the batch axis even for a single-sample batch. A bare
            # squeeze() would yield a 0-d array there, which list.extend cannot iterate.
            predictions = outputs.logits.squeeze(-1)

            fin_targets.extend(targets.cpu().detach().numpy())
            fin_outputs.extend(predictions.cpu().detach().numpy())
    return fin_targets, fin_outputs

#Running the model over the test set to obtain the true values and the predictions
targets, outputs = validate()

#Mean squared error between the predictions and the true values, reported as a plain number
val_loss = torch.nn.functional.mse_loss(
    input=torch.tensor(outputs),
    target=torch.tensor(targets),
)
print("Validation Loss:", val_loss.item())

loading file vocab.txt from cache at C:\Users\ad22acb/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\ad22acb/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\ad22acb/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer

Epoch: 0, Loss:  0.14177370071411133
