<a href="https://colab.research.google.com/github/DwarakaVelasiri/Dwaraka-PROJECT_GUTENBERG_GOTHIC_FICTION_TEXT_GENERATION_gpt2/blob/main/Encoder_Only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
import os
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import huggingface_hub

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# SECURITY: a Hugging Face access token must never be committed to source.
# Read it from the HF_TOKEN environment variable instead (in Colab, set it
# via the Secrets panel or `os.environ` in a private cell). Any token that
# was ever committed to this file should be treated as compromised and
# revoked in the Hugging Face account settings.
auth_token = os.environ.get("HF_TOKEN")

# Load bert-base-uncased with a 2-label sequence-classification head.
# auth_token may be None here.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, use_auth_token=auth_token)

# Log in to the Hugging Face Hub; the save step at the end of the script
# uploads with push_to_hub=True. When no token is supplied,
# huggingface_hub.login() is documented to prompt for one interactively.
huggingface_hub.login(token=auth_token)

# Locations of the CoLA dataset splits
train_file = '/content/sample_data/in_domain_train.tsv'
dev_file = '/content/sample_data/in_domain_dev.tsv'
test_file = '/content/sample_data/out_of_domain_dev.tsv'

def load_cola_data(file, skip_header=True):
    """Read sentences and integer labels from a tab-separated CoLA file.

    Each data line is stripped of surrounding whitespace and split on tabs;
    the label is taken from column 1 and the sentence from column 3
    (zero-based).

    Args:
        file: Path of the .tsv file to read.
        skip_header: When True (the default, matching the previous
            behaviour) the first line of the file is discarded.
            NOTE(review): the publicly distributed raw CoLA .tsv files
            appear to ship without a header row -- confirm for these files,
            and pass ``skip_header=False`` if so, otherwise the first
            example is silently dropped.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists: the sentence
        strings and their labels as ``int``.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
        ValueError: If the label column is not an integer.
    """
    sentences = []
    labels = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if skip_header and line_number == 0:
                continue
            stripped = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no example;
            # skip them rather than failing on the column lookup below.
            if not stripped:
                continue
            parts = stripped.split('\t')
            sentences.append(parts[3])
            labels.append(int(parts[1]))
    return sentences, labels

# Read the three CoLA splits into parallel sentence / label lists
train_sentences, train_labels = load_cola_data(train_file)
dev_sentences, dev_labels = load_cola_data(dev_file)
test_sentences, test_labels = load_cola_data(test_file)

# Tokenize the input sentences
train_tokens = [tokenizer.tokenize(sentence) for sentence in train_sentences]
dev_tokens = [tokenizer.tokenize(sentence) for sentence in dev_sentences]
test_tokens = [tokenizer.tokenize(sentence) for sentence in test_sentences]

# Prepare the input sequences.
# One shared set of encoding options keeps the three splits consistent.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states explicitly the truncation to max_length that the
# tokenizer otherwise applies implicitly while emitting a warning.
encode_kwargs = dict(
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
)
train_inputs = [tokenizer.encode_plus(tokenized_sentence, **encode_kwargs) for tokenized_sentence in train_tokens]
dev_inputs = [tokenizer.encode_plus(tokenized_sentence, **encode_kwargs) for tokenized_sentence in dev_tokens]
test_inputs = [tokenizer.encode_plus(tokenized_sentence, **encode_kwargs) for tokenized_sentence in test_tokens]

def _field_tensor(encodings, key):
    """Collect ``encoding[key]`` from every encoding into one tensor."""
    return torch.tensor([encoding[key] for encoding in encodings])


# Convert the encoded splits and their labels to PyTorch tensors
train_input_ids = _field_tensor(train_inputs, 'input_ids')
train_attention_masks = _field_tensor(train_inputs, 'attention_mask')
train_labels = torch.tensor(train_labels)

dev_input_ids = _field_tensor(dev_inputs, 'input_ids')
dev_attention_masks = _field_tensor(dev_inputs, 'attention_mask')
dev_labels = torch.tensor(dev_labels)

test_input_ids = _field_tensor(test_inputs, 'input_ids')
test_attention_masks = _field_tensor(test_inputs, 'attention_mask')
test_labels = torch.tensor(test_labels)

# Bundle each split as (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Batches of 32; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=32)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Freeze every parameter of the underlying BERT model (model.base_model).
# Only parameters that live outside base_model stay trainable -- for
# BertForSequenceClassification that should be the classification head
# (NOTE(review): confirm if the model class changes).
for param in model.base_model.parameters():
    param.requires_grad = False

# Define the optimizer and learning rate schedule.
# NUM_EPOCHS must equal the number of passes made by the training loop so
# that the linear schedule reaches zero on the final optimizer step.
NUM_EPOCHS = 3
# Hand the optimizer only the parameters that can actually receive gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-5)
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def accuracy(logits, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        logits: 2-D array of per-class scores, one row per example.
        labels: Array of class indices; its first dimension is used as the
            number of examples.

    Returns:
        Number of matching predictions divided by ``labels.shape[0]``.
    """
    predicted = np.argmax(logits, axis=1)
    n_correct = np.equal(predicted, labels).sum()
    return n_correct / labels.shape[0]

# Train the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(3):
    train_loss = 0
    model.train()

    for batch in train_dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # The schedule is stepped once per batch, not once per epoch
        scheduler.step()

    avg_train_loss = train_loss / len(train_dataloader)
    print('Average training loss: {}'.format(avg_train_loss))

    # Evaluate the model on the validation set
    model.eval()
    eval_loss, eval_correct = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in dev_dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        logits = outputs.logits
        loss = outputs.loss
        eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()
        batch_examples = label_ids.shape[0]
        # accuracy() returns the fraction correct within this batch. Weight
        # it by the batch size so the running total counts correct examples;
        # summing raw fractions and dividing by the example count (as was
        # done before) under-reports accuracy by roughly the batch size.
        eval_correct += accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples
        nb_eval_steps += 1

    avg_eval_loss = eval_loss / len(dev_dataloader)
    eval_accuracy = eval_correct / nb_eval_examples
    print('Average validation loss: {}'.format(avg_eval_loss))
    print('Validation accuracy: {}'.format(eval_accuracy))

# Evaluate the model on the test set
test_loss, test_correct = 0, 0
nb_test_steps, nb_test_examples = 0, 0
model.eval()

for batch in test_dataloader:
    batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

    logits = outputs.logits
    loss = outputs.loss
    test_loss += loss.item()

    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    batch_examples = label_ids.shape[0]
    # accuracy() yields a per-batch fraction; scale by the batch size so the
    # running total is a count of correct examples across the whole split.
    test_correct += accuracy(logits, label_ids) * batch_examples
    nb_test_examples += batch_examples
    nb_test_steps += 1

avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = test_correct / nb_test_examples
print('Average test loss: {}'.format(avg_test_loss))
# Previously the test accuracy was accumulated but never reported.
print('Test accuracy: {}'.format(test_accuracy))

# Write the fine-tuned model and the tokenizer files under output_dir and
# upload both to the Hugging Face Hub (push_to_hub=True).
output_dir = "Dwaraka/Sentence_Classification_CoLA_BERT_base_uncased_Encoder_Only_Model"
model.save_pretrained(save_directory=output_dir, push_to_hub=True)
tokenizer.save_pretrained(save_directory=output_dir, push_to_hub=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Average training loss: 0.6238514449614198
Average validation loss: 0.6215987521059373
Validation accuracy: 0.022253530689842475
Average training loss: 0.6113153907345302
Average validation loss: 0.6227778056088615
Validation accuracy: 0.022253530689842475
Average training loss: 0.6095231347786847
Average validation loss: 0.6216687717858482
Validation accuracy: 0.022253530689842475
Average test loss: 0.6298016449984383


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

('Dwaraka/Sentence_Classification_CoLA_BERT_base_uncased_Encoder_Only_Model/tokenizer_config.json',
 'Dwaraka/Sentence_Classification_CoLA_BERT_base_uncased_Encoder_Only_Model/special_tokens_map.json',
 'Dwaraka/Sentence_Classification_CoLA_BERT_base_uncased_Encoder_Only_Model/vocab.txt',
 'Dwaraka/Sentence_Classification_CoLA_BERT_base_uncased_Encoder_Only_Model/added_tokens.json')