<a href="https://colab.research.google.com/github/0xSh4dy/Transformers/blob/master/GrammarCheckerUsingBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Grammar Checking

### Thanks to [Chris McCormick](https://mccormickml.com/) for the [awesome blog post](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) on fine-tuning BERT. This notebook is inspired from that blog.

In [None]:
!pip install transformers datasets wget

In [4]:
import torch

if torch.cuda.is_available():
  print("GPU is available")
  device = torch.device("cuda")
else:
  print("GPU is not available")
  device = torch.device("cpu")


GPU is available


### Downloading the dataset

In [5]:
import os,wget
dataset_url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"

if not os.path.exists("./cola_public_1.1.zip"):
  print("Downloading dataset")
  wget.download(dataset_url,"./cola_public_1.1.zip")
else:
  print("Dataset has already been downloaded")

Downloading dataset


### Unzip the dataset

In [6]:
if not os.path.exists("./cola_public/"):
  !unzip cola_public_1.1.zip
else:
  print("Dataset has already been unzipped")

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


### Preprocessing the dataset

In [7]:
import pandas as pd

train_tsv_path = "./cola_public/raw/in_domain_train.tsv"
df = pd.read_csv(train_tsv_path,delimiter='\t',header=None,names=["src","label","src_tag","sentence"])
n_sentences = df.shape[0]
print(f'Number of sentences = {n_sentences}')
df.sample(5)

Number of sentences = 8551


Unnamed: 0,src,label,src_tag,sentence
2146,l-93,1,,Janet broke the vase.
5271,b_82,1,,That he has blood on his hands proves that Joh...
5567,b_73,1,,Jane has more nearly as many too many than Mary.
2401,l-93,1,,I filled the pail with water.
7297,sks13,1,,Mary should buy some flowers on Sunday at 5 o'...


In [8]:
sentences = df.sentence.values
labels = df.label.values

### Tokenization

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
max_len = 0
for sentence in sentences:
  input_ids = tokenizer.encode(sentence,add_special_tokens=True)
  max_len = max(max_len,len(input_ids))

print(f"Max length = {max_len}")

Max length = 47


In [11]:
input_ids = []
attention_masks = []

for sentence in sentences:
  encoded_dict = tokenizer.encode_plus(
      sentence,
      add_special_tokens = True,
      max_length = 64,
      pad_to_max_length = True,
      return_attention_mask = True,
      return_tensors = 'pt'
  )
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Convert everything into tensors

In [12]:
input_ids = torch.cat(input_ids,dim=0)
attention_masks = torch.cat(attention_masks,dim=0)
label_tensors = torch.tensor(labels)

### Splitting the dataset into training and test sets

In [18]:
from torch.utils.data import TensorDataset,random_split,DataLoader, RandomSampler,SequentialSampler

dataset = TensorDataset(input_ids,attention_masks,label_tensors)

train_size = int(0.9*len(dataset))
val_size = len(dataset)-train_size

train_dataset,val_dataset = random_split(dataset,[train_size,val_size])

print(f"Number of training samples = {train_size}")
print(f"Number of validation samples = {val_size}")

batch_size = 32

train_dl = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size)

test_dl = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size)



Number of training samples = 7695
Number of validation samples = 856


### Training the Model

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

model.cuda()

In [27]:
from transformers import AdamW,get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm.auto import tqdm

learning_rate = 2e-5
adam_epsilon = 1e-8
n_epochs = 4

optimizer = AdamW(
    model.parameters(),
    lr = learning_rate,
    eps = adam_epsilon
)

n_training_steps = n_epochs * len(train_dl)
scheduler = get_linear_schedule_with_warmup(
  optimizer = optimizer,
  num_warmup_steps = 0,
  num_training_steps = n_training_steps
)

pbar = tqdm(total=n_training_steps)

curr_seed = 50
random.seed(curr_seed)
np.random.seed(curr_seed)
torch.manual_seed(curr_seed)
torch.cuda.manual_seed_all(curr_seed)

for epoch in range(n_epochs):
  print(f"Epoch number {epoch+1}")

  total_training_loss = 0
  model.train()

  for batch in train_dl:
    batched_input_ids = batch[0].to(device)
    batched_attn_masks = batch[1].to(device)
    batched_labels = batch[2].to(device)

    model.zero_grad()
    outputs = model(
        batched_input_ids,
        token_type_ids=None,
        attention_mask=batched_attn_masks,
        labels=batched_labels
    )
    loss = outputs.loss
    total_training_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
    pbar.update(1)

  avg_training_loss = total_training_loss / len(train_dl)
  print(f'Average training loss : {avg_training_loss}')



  0%|          | 0/964 [00:00<?, ?it/s]

Epoch number 1
Average training loss : 0.31301111435803636
Epoch number 2
Average training loss : 0.23762769954611404
Epoch number 3
Average training loss : 0.14718542278925908
Epoch number 4
Average training loss : 0.09931519623513477


### Saving the model

In [54]:
output_dir = "./grammar-checker-shady-base"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./grammar-checker-shady-base/tokenizer_config.json',
 './grammar-checker-shady-base/special_tokens_map.json',
 './grammar-checker-shady-base/vocab.txt',
 './grammar-checker-shady-base/added_tokens.json')

### Evaluate the results on the test set

In [55]:
model1 = BertForSequenceClassification.from_pretrained("./grammar-checker-shady-base")
model1.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [56]:
from sklearn.metrics import accuracy_score

pb = tqdm(total = len(test_dl))
true_items = []
predicted_items = []

for batch in test_dl:
  with torch.no_grad():
    batched_input_ids = batch[0]
    batched_attn_masks = batch[1]
    batched_labels = batch[2]
    outputs = model1(
        input_ids = batched_input_ids,
        token_type_ids = None,
        attention_mask = batched_attn_masks,
    )
    logits = outputs.logits
    _,_,true_labels = batch
    preds = torch.argmax(logits,dim=-1)
    true_items.extend(true_labels)
    predicted_items.extend(preds)
    pb.update(1)
accuracy = accuracy_score(true_items,predicted_items)
print(f"Accuracy on the test set = {accuracy*100} %")

  0%|          | 0/27 [00:00<?, ?it/s]

Accuracy on the test set = 82.00934579439252 %
