Import all the necessary libraries.

# Identify question pairs that have the same intent


In [None]:
import logging
logging.basicConfig(level=logging.INFO)
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast
from datasets import Dataset
import datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

Load the data and check the split

In [None]:
data = pd.read_csv('../input/quora-question-pairs/train.csv.zip')
data = data.dropna()
total=len(data.index)
print("Fraction of non duplicate question pairs.")
print(len(data[data['is_duplicate']==0].index)/total)


Fraction of non duplicate question pairs.
0.630799407351708


Split the data and create Dataset dict. Dataset dict is necessary for us to applyt .withtransform described later

In [None]:
train, validation = train_test_split(data, test_size=0.2,random_state=42)
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validation)
ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

Check the dataset dict

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate', '__index_level_0__'],
        num_rows: 323429
    })
    validation: Dataset({
        features: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate', '__index_level_0__'],
        num_rows: 80858
    })
})

Load bert default tokenizer

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Check the tokenizer

In [None]:
train_encodings = tokenizer(ds['train'][:2]['question1'], ds['train'][:2]['question2'], max_length=128, truncation=True)

See the output

In [None]:
train_encodings

{'input_ids': [[101, 2634, 1024, 2054, 2024, 3105, 7047, 1998, 2925, 7047, 2005, 2659, 1039, 21600, 2050, 2030, 7665, 7017, 3330, 2493, 1999, 2634, 1006, 1019, 1011, 1020, 1007, 1029, 102, 5841, 1998, 10922, 1999, 2634, 1024, 1045, 2572, 2747, 1999, 2026, 2353, 2095, 1997, 3330, 1998, 1045, 2031, 1037, 1039, 21600, 2050, 1997, 1021, 1012, 1020, 2007, 2028, 2067, 21197, 1006, 2044, 1018, 13609, 2015, 1007, 1012, 1045, 2031, 2787, 2008, 1045, 2215, 2000, 6148, 1037, 3105, 1999, 1037, 3813, 2066, 14163, 13201, 2030, 2079, 15038, 1012, 2029, 5724, 2003, 2488, 1998, 2054, 2024, 1996, 3316, 2008, 1045, 2064, 6611, 2005, 1999, 2344, 2000, 2131, 1037, 11519, 3105, 1029, 102], [101, 2029, 2003, 2088, 1005, 1055, 2190, 6298, 3185, 1029, 102, 2054, 2003, 1996, 2190, 6298, 3185, 1997, 2035, 2051, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

Decode the an encoder token collection

In [None]:
tokenizer.decode(train_encodings["input_ids"][1])

"[CLS] which is world's best romantic movie? [SEP] what is the best romantic movie of all time? [SEP]"

This function will take in a batch of data and return the tranform the data before returning the transformed data. This acts like a virtual dataset , which gets formed only when we access certain elements. The elements we access forms the batch. The batch will be defined by the user for example prepared_ds[:2] means accessing first two elements of the virtual dataset.

In [None]:
def transform(batch):
    inputs = tokenizer(list(zip(batch['question1'], batch['question2'])), padding='max_length', max_length=128, truncation=True)
    inputs['labels'] = batch['is_duplicate']
    return inputs
prepared_ds = ds.with_transform(transform)

Testing the virtual dataset.

In [None]:
tokenizer.decode(prepared_ds['train'][1000]["input_ids"])

"[CLS] why do they study human anatomy? [SEP] what's the best way to study human anatomy? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

Gpu/cpu handler

In [None]:
cuda=torch.device('cuda')
cpu=torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device==cuda:
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)

The model.forward requires inputs of batch of data to be in the form of datapoints as rows stacked one top of another. This collate_fn does the online tranformation whenever dataloader asks for a batch of data.

In [None]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(x['input_ids']) for x in batch]).to(device),
        'token_type_ids': torch.stack([torch.tensor(x['token_type_ids']) for x in batch]).to(device),
        'attention_mask': torch.stack([torch.tensor(x['attention_mask']) for x in batch]).to(device),
        'labels': torch.tensor([x['labels'] for x in batch]).to(device)
    }

Load only the needed data in the memory.

In [None]:

train_dataloader = DataLoader(prepared_ds['train'],  batch_size=32,collate_fn=collate_fn ,drop_last=True)
validation_dataloader= DataLoader(prepared_ds['train'],  batch_size=8,collate_fn=collate_fn ,drop_last=True)

Test the dataloader.

In [None]:
a=next(iter(train_dataloader))

Load the pretrained model. Here the model is Bert base.

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.save_pretrained('/kaggle/working/utsav/bert_uncased_quora')
model = BertForSequenceClassification.from_pretrained('/kaggle/working/utsav/bert_uncased_quora')

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
if device==cuda:
    model.cuda()

Test the model.

In [None]:
model(a['input_ids'], token_type_ids=a['token_type_ids'], attention_mask=a['attention_mask'], labels=a['labels'])

SequenceClassifierOutput(loss=tensor(0.6623, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2918, -0.2705],
        [ 0.3501, -0.2778],
        [ 0.3128, -0.2652],
        [ 0.3697, -0.2650],
        [ 0.3045, -0.2712],
        [ 0.2934, -0.2684],
        [ 0.4794, -0.2628],
        [ 0.3360, -0.2756],
        [ 0.3325, -0.2680],
        [ 0.2826, -0.2786],
        [ 0.3155, -0.2570],
        [ 0.3243, -0.2784],
        [ 0.3202, -0.2764],
        [ 0.3329, -0.2619],
        [ 0.3789, -0.2670],
        [ 0.3442, -0.2711],
        [ 0.3233, -0.2767],
        [ 0.3102, -0.2630],
        [ 0.3027, -0.2731],
        [ 0.2932, -0.2651],
        [ 0.2824, -0.2675],
        [ 0.3384, -0.2752],
        [ 0.3355, -0.2688],
        [ 0.2977, -0.2582],
        [ 0.3310, -0.2654],
        [ 0.3554, -0.2793],
        [ 0.2997, -0.2617],
        [ 0.3203, -0.2641],
        [ 0.3197, -0.2722],
        [ 0.3205, -0.2743],
        [ 0.3172, -0.2654],
        [ 0.3541, -0.2894]], devic

Group the parameters , so that we don't apply weight decay to these parameters.

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if all(nd not in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}]

Create the optimizer for bert.

In [None]:
%%capture
!pip install pytorch-pretrained-bert

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from pytorch_pretrained_bert import BertAdam
optimizer = BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)

Helper function to check accuracy.

In [None]:

def accuracy(predicted_distribution, true_label):
    predicted_label=np.argmax(predicted_distribution, axis=1)
    return np.sum(predicted_label.flatten() == true_label.flatten()) / len(true_label.flatten())

Training loop.

In [None]:
def train(epochs = 2):
    for _ in trange(epochs, desc="Epoch"):
      cumulative_loss = 0
      number_of_batch = 0
      for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        output=model.forward(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = output.loss
        loss.backward()
        optimizer.step()
        cumulative_loss += loss.item()
        number_of_batch += 1
        if (step%100==0):
          print(f"Train loss: {cumulative_loss / number_of_batch}")
    model.save_pretrained('/kaggle/working/utsav/bert_uncased_quora')
model.train()
train()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /usr/local/src/pytorch/torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Train loss: 0.6747491359710693
Train loss: 0.5472003678874214
Train loss: 0.5084767538813216
Train loss: 0.4887447274998573
Train loss: 0.4731966943812192
Train loss: 0.4625732322593411
Train loss: 0.4514341955871233
Train loss: 0.4432584316006061
Train loss: 0.4370120427060812
Train loss: 0.42960275362321193
Train loss: 0.4229158838460972
Train loss: 0.4164017536064151
Train loss: 0.4124621583162795
Train loss: 0.4087802881144451
Train loss: 0.40421793937980915
Train loss: 0.3998031539669996
Train loss: 0.39417326926421853
Train loss: 0.3910481544022067
Train loss: 0.3877770238778844
Train loss: 0.38428715650954415
Train loss: 0.3811478722302512
Train loss: 0.3775077939196634
Train loss: 0.3757044928167853
Train loss: 0.37371186858633193
Train loss: 0.3721031927438092
Train loss: 0.36941189318299533
Train loss: 0.36686243307631733
Train loss: 0.3649489373018326
Train loss: 0.3633448027183396
Train loss: 0.3616773231004444
Train loss: 0.3601237803781998
Train loss: 0.358325750202719
Tr

Epoch:  50%|█████     | 1/2 [1:10:33<1:10:33, 4233.26s/it]

Train loss: 0.5451766848564148
Train loss: 0.2346865550273716
Train loss: 0.2293884817195769
Train loss: 0.2304873500279216
Train loss: 0.23001869405594252
Train loss: 0.23059904216946717
Train loss: 0.2308927182225935
Train loss: 0.2294831455774212
Train loss: 0.2296299049991198
Train loss: 0.2273212332179259
Train loss: 0.22666803610461814
Train loss: 0.2259679102016567
Train loss: 0.2252525506505462
Train loss: 0.224660979513706
Train loss: 0.22249250435705784
Train loss: 0.2221410984271451
Train loss: 0.2201393289534581
Train loss: 0.21990146614317682
Train loss: 0.21899945686040223
Train loss: 0.21839111454212654
Train loss: 0.21744673843516493
Train loss: 0.21622279091963792
Train loss: 0.21633734377266087
Train loss: 0.21576271725106969
Train loss: 0.21603889704473636
Train loss: 0.21475114170737455
Train loss: 0.214042699504436
Train loss: 0.21326076294033783
Train loss: 0.21345242063200015
Train loss: 0.212816795967301
Train loss: 0.2128259799064884
Train loss: 0.2122712504906

Epoch: 100%|██████████| 2/2 [2:21:09<00:00, 4234.67s/it]


Load the model again for testing.

In [None]:
model = BertForSequenceClassification.from_pretrained('/kaggle/working/utsav/bert_uncased_quora')

In [None]:
model.cuda()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
model(a['input_ids'], token_type_ids=a['token_type_ids'], attention_mask=a['attention_mask'], labels=a['labels'])

SequenceClassifierOutput(loss=tensor(0.3572, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.5324, -4.3893],
        [-3.2461,  2.9924],
        [-0.6646,  0.2634],
        [ 3.8006, -4.4894],
        [-0.0651, -0.4878],
        [-2.0623,  1.7214],
        [ 5.3005, -5.5592],
        [ 3.5004, -4.6342],
        [ 0.7742, -1.4620],
        [ 5.3522, -5.6255],
        [-3.1062,  2.9500],
        [ 3.9453, -4.9249],
        [-1.8012,  1.4391],
        [ 3.1127, -4.1602],
        [-3.5179,  3.2688],
        [-0.8664,  0.4685],
        [-1.8550,  1.4805],
        [ 3.2760, -4.4044],
        [-1.7896,  1.4809],
        [ 2.9638, -4.0874],
        [ 0.5332, -1.0197],
        [-2.9439,  2.5944],
        [ 0.7045, -1.6593],
        [ 2.4276, -3.5577],
        [ 0.0999, -0.5319],
        [-2.0110,  1.6630],
        [ 3.1663, -4.3434],
        [ 1.3723, -2.1011],
        [-0.9945,  0.5945],
        [ 4.8769, -5.4918],
        [-1.1205,  0.7428],
        [ 1.3958, -2.3469]], devic

In [None]:
def validate():
  model.eval()
  cumulated_accuracy = 0
  number_of_batch= 0
  for batch in validation_dataloader:
    with torch.no_grad():
      output=model.forward(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
      logits = output.logits
    logits = logits.detach().cpu().numpy()
    label_ids = batch['labels'].to('cpu').numpy()
    batch_accuracy = accuracy(logits, label_ids)
    cumulated_accuracy += batch_accuracy
    number_of_batch += 1
  print(f"Validation Accuracy: {cumulated_accuracy / number_of_batch}")

In [None]:
model.eval()
validate()

Validation Accuracy: 0.9621209310378945
