<a href="https://colab.research.google.com/github/AQEEL-AWAN2362/NLP-Tutorial/blob/main/BERT_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade huggingface transformers datasets fsspec



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import pipeline

In [None]:
# Load the tokenizer and a 2-label sequence-classification model from the same
# pretrained checkpoint. The classification head is not part of the checkpoint
# and is freshly initialised (see the warning printed below this cell).
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Download (or load from cache) the IMDB dataset and display its splits.
dataset = load_dataset(path="imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Take small subsets so the fine-tuning run finishes quickly.
# The IMDB splits are stored grouped by label, so taking the first N rows
# without shuffling yields examples of a single class only. Shuffle first,
# with a fixed seed so the subsets are reproducible across runs.
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(500))
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [None]:
# # tokenize function
# def tokenize(batch):
#   max_length = 256
#   return tokenizer(batch["text"], padding='max_length', truncation=True, max_length = max_length)
# train_dataset= train_dataset.map(tokenize, batched=True, remove_columns=["text"])
# test_dataset= test_dataset.map(tokenize, batched=True, remove_columns=["text"]) # remove raw text for memory
# test_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 500
})

In [None]:
# tokenize function
def tokenize(batch, max_length=256):
  """Tokenize a batch of raw texts to a fixed sequence length.

  Args:
    batch: Mapping with a "text" entry holding the strings to encode
      (this is what `Dataset.map(..., batched=True)` passes in).
    max_length: Length every sequence is padded or truncated to.
      Defaults to 256, the value previously hard-coded here.

  Returns:
    The output of the module-level `tokenizer` for the batch.
  """
  return tokenizer(batch["text"], padding='max_length', truncation=True, max_length=max_length)

In [None]:
def preprocess(ds):
  """Tokenize a dataset and prepare it for training.

  Applies `tokenize` in batches (dropping the raw "text" column), renames
  "label" to "labels", and formats the 'input_ids', 'attention_mask' and
  'labels' columns as torch tensors.

  Returns the resulting dataset; the argument itself is not modified.
  """
  tokenized = ds.map(tokenize, batched=True, remove_columns=["text"])
  relabelled = tokenized.rename_column("label", "labels")
  return relabelled.with_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Tokenize both subsets and switch them to torch-formatted columns.
# preprocess() returns the processed dataset, so the names are re-bound here.
# NOTE: this cell is not re-runnable as-is -- preprocess() removes the "text"
# column that tokenize() reads, so a second run on the already-processed
# datasets fails. Re-run the subset-selection cell first.
train_dataset = preprocess(ds=train_dataset)
test_dataset = preprocess(ds=test_dataset)
print(train_dataset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})


In [None]:
# Setup Data Collator
# This is a sequence-classification task (one integer label per example), so
# the padding collator is the appropriate choice; the seq2seq collator treats
# `labels` as token sequences to be padded.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Keep the original variable name bound so existing references still resolve.
seq2seq_collator = data_collator

In [None]:
# training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=6,   # batch size per device during training
    per_device_eval_batch_size=6,    # batch size for evaluation
    # The whole run is only 501 optimizer steps (see the TrainOutput below), so
    # 500 warmup steps would keep the learning rate in warmup for the entire
    # run. Warm up for roughly the first 10% of training instead.
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler
    learning_rate=2e-5,              # learning rate
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    report_to="none"                 # do not report to any experiment tracker
)

# trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # No explicit collator: inputs are already padded to a fixed length by tokenize().
    # data_collator=seq2seq_collator,
)

In [None]:
# Fine-tune the model. The call's return value (a TrainOutput) is shown as the
# cell result, after the logged training losses.
trainer.train()

Step,Training Loss
10,0.9275
20,0.85
30,0.7085
40,0.6137
50,0.4564
60,0.3469
70,0.2745
80,0.2097
90,0.1506
100,0.0989


TrainOutput(global_step=501, training_loss=0.09637786487962989, metrics={'train_runtime': 247.1829, 'train_samples_per_second': 12.137, 'train_steps_per_second': 2.027, 'total_flos': 394666583040000.0, 'train_loss': 0.09637786487962989, 'epoch': 3.0})

In [None]:
# Save the tokenizer and the fine-tuned model.
# Note that they are written to two separate directories.
tokenizer.save_pretrained(
    "./bert_finetuned_imdb_tokenizer"
)
model.save_pretrained(
    "./bert_finetuned_imdb_model"
)

In [None]:
# Evaluate on the eval dataset given to the Trainer (`test_dataset`) and print
# the returned metrics.
metrics = trainer.evaluate()
print(metrics)

In [None]:
# prediction
# Reload the saved artifacts from disk. The tokenizer was saved to its own
# directory ("./bert_finetuned_imdb_tokenizer"), not into the model directory,
# so it has to be loaded from there.
tokenizer = BertTokenizer.from_pretrained('./bert_finetuned_imdb_tokenizer')
model = BertForSequenceClassification.from_pretrained('./bert_finetuned_imdb_model')

# `pipeline` is already imported in the imports cell at the top of the notebook.
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
# Predict
# NOTE(review): no id2label mapping was configured on the model, so the label
# names in the output are presumably generic (LABEL_0 / LABEL_1) -- confirm.
text = "This movie was amazing and I loved the acting!"
result = classifier(text)
print(result)