In [1]:
!pip install transformers --quiet

In [2]:
!pip install torch



In [3]:
import pandas as pd
# Load the preprocessed Quora question data; later cells read its
# 'ques_lemmatized' and 'target' columns.
data = pd.read_csv('quora_preprocessed.csv')

In [4]:
from collections import defaultdict

import numpy as np
import tqdm
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, f1_score



In [5]:
# Run configuration shared by the cells below.
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
pretrained_model_name = 'bert-base-cased'  # checkpoint name used for both tokenizer and model
epochs = 1
batch_size = 16
maxlen = 160  # token length every question is padded / truncated to

In [6]:
# Tokenizer matching the pretrained checkpoint chosen in the config cell.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

# Walk one sample sentence through the tokenizer: raw text -> tokens -> vocabulary ids.
example_text = 'I am doing the final module of NLP Interestship '
print(f' Sentence: {example_text}')

tokens = tokenizer.tokenize(example_text)
print(f'   Tokens: {tokens}')

token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f'Token IDs: {token_ids}')


 Sentence: I am doing the final module of NLP Interestship 
   Tokens: ['I', 'am', 'doing', 'the', 'final', 'module', 'of', 'NL', '##P', 'Interest', '##ship']
Token IDs: [146, 1821, 1833, 1103, 1509, 13196, 1104, 21239, 2101, 17067, 6607]


In [7]:
# Cast the lemmatized question column to str so every row handed to the tokenizer is a string.
# Note: astype('str') turns missing values (NaN) into the literal text 'nan'.
data['processed_text'] = data['ques_lemmatized'].astype('str')

In [9]:
# Encoded length of every question; truncation caps each length at 512.
token_lens = [
    len(tokenizer.encode(question, max_length=512, truncation = True))
    for question in data.processed_text
]

#### Dataset Class

In [27]:
class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    Args:
        questions: Indexable collection of question texts.
        targets: Indexable collection of integer class labels, aligned with
            ``questions``.
        tokenizer: Callable Hugging Face tokenizer used to encode each question.
        max_len: Every encoded question is padded or truncated to this many tokens.
    """

    def __init__(self, questions, targets, tokenizer, max_len):
        self.questions = questions
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of questions."""
        return len(self.questions)

    def __getitem__(self, item):
        """Encode the question at position ``item``.

        Returns:
            dict with keys ``question_text`` (str), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_len``) and
            ``targets`` (0-dim ``torch.long`` tensor).
        """
        question = str(self.questions[item])
        target = self.targets[item]
        # Call the tokenizer directly with padding='max_length'; this replaces
        # the deprecated encode_plus(..., pad_to_max_length=True) form.
        encoding = self.tokenizer(
            question,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'question_text': question,
            # return_tensors='pt' yields a leading batch axis of size 1; drop it
            # so the DataLoader's collation produces (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

In [28]:
from sklearn.model_selection import train_test_split

# 60% of the rows go to training; the remaining 40% hold-out is then divided
# 40/60 into validation and test. Both splits use the same fixed seed.
train_data, holdout_data = train_test_split(data, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.6, random_state=42)

In [29]:
def data_loader(data, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a DataFrame split in a ``QuestionDataset`` and return a ``DataLoader``.

    Args:
        data: DataFrame with a ``processed_text`` column (question text) and a
            ``target`` column (class label).
        tokenizer: Tokenizer handed to ``QuestionDataset``.
        max_len: Token length every question is padded / truncated to.
        batch_size: Number of questions per batch.
        shuffle: When True, batches are drawn in a new random order on every
            pass over the loader. Defaults to False, which keeps the rows in
            DataFrame order.

    Returns:
        A ``torch.utils.data.DataLoader`` over the split.
    """
    ds = QuestionDataset(
        questions=data.processed_text.to_numpy(),
        targets=data.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,  # load batches in the main process
    )

In [30]:
# One loader per split, all built with the same tokenizer, max length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
    data_loader(split, tokenizer, maxlen, batch_size)
    for split in (train_data, val_data, test_data)
)

# Pull a single training batch for inspection.
# NOTE: this rebinds `data`, which held the full DataFrame until this point.
data = next(iter(train_data_loader))
data.keys()

# Number of batches in each split.
for loader in (train_data_loader, val_data_loader, test_data_loader):
    print (len(loader))

48980
13062
19592




In [31]:
# Shapes of the tensors in the sample batch
for key in ('input_ids', 'attention_mask', 'targets'):
    print(data[key].shape)

torch.Size([16, 160])
torch.Size([16, 160])
torch.Size([16])


In [32]:
data

{'question_text': ['anyone ketorolac pain affective would say long',
  'troubleshoot laptop unable connect tv via hdmi',
  'wrong put child romantic relationship people think ’ okay prioritize relationship child',
  'teenager intelligent',
  'creative way fundraise',
  'solar system form',
  'happens black hole black hole collide',
  'depleted uranium slug',
  'nirav modi moving around without legal passport',
  'favorite historical cultural documentary',
  'life korean war south korean',
  'ever get urge kill someone something usually trigger',
  'stable hyundai i20 highway',
  'hispanic origin mean',
  'president lawyer better worse leading country president background',
  'unit speed velocity acceleration'],
 'input_ids': tensor([[  101,  2256,   180,  ...,     0,     0,     0],
         [  101, 16115, 27240,  ...,     0,     0,     0],
         [  101,  2488,  1508,  ...,     0,     0,     0],
         ...,
         [  101,  1117, 10224,  ...,     0,     0,     0],
         [  101,

#### BERT Model

In [58]:
class QuestionClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        n_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, n_classes):
        super().__init__()
        encoder = BertModel.from_pretrained(pretrained_model_name)
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        # The head maps the encoder's hidden size to one logit per class.
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the output-layer logits for a batch of encoded questions."""
        # return_dict=False makes the encoder return a plain tuple; only its
        # second element (the pooled output) feeds the classifier head.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        regularized = self.drop(pooled)
        logits = self.out(regularized)
        return logits

In [59]:
# Two-class classifier, placed on the configured device.
model = QuestionClassifier(n_classes=2).to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
# Move the sample batch's model inputs onto the configured device.
input_ids, attention_mask = (
    data[key].to(device) for key in ('input_ids', 'attention_mask')
)

# Both tensors are batch size x seq length.
for batch_tensor in (input_ids, attention_mask):
    print(batch_tensor.shape)

torch.Size([16, 160])
torch.Size([16, 160])


In [61]:
model

QuestionClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [62]:
# Smoke-test forward pass on the sample batch: one row of logits per question.
model(input_ids, attention_mask)

tensor([[-0.4373,  0.0110],
        [-0.7145,  0.6295],
        [-0.3116,  0.0690],
        [-0.4024, -0.0821],
        [-0.5445,  0.0725],
        [-0.6174,  0.5174],
        [-0.3066,  0.2447],
        [-0.5847, -0.5347],
        [-0.5784,  0.4077],
        [-0.8514, -0.0776],
        [-0.8261, -0.2917],
        [-0.4238,  0.3963],
        [-0.1388,  0.3251],
        [-0.7823,  0.1641],
        [-0.6745,  0.5228],
        [-0.3572, -0.3875]], grad_fn=<AddmmBackward0>)

In [63]:
# Classification loss over the model's logits.
loss_fn = nn.CrossEntropyLoss().to(device)

# NOTE(review): this is the AdamW class imported from `transformers`; upstream
# deprecated it in favour of torch.optim.AdamW, so confirm it still exists in
# the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = epochs * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



#### Training Function

In [65]:
def training_func(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class logits per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Number of examples in the split (denominator of accuracy).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor equal to
        ``correct / n_examples``; mean_loss is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.
    """
    # Putting the model in the training mode
    model = model.train()

    losses = []
    # A tensor accumulator keeps `.double()` valid even when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Clear any gradients left over from earlier forward/backward calls so the
    # first optimizer step only reflects the first batch.
    optimizer.zero_grad()

    for dl in data_loader:
        input_ids = dl["input_ids"].to(device)
        attention_mask = dl["attention_mask"].to(device)
        targets = dl["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        preds = outputs.argmax(dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

#### Evaluation Function

In [66]:
def evaluate_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class logits per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples in the split (denominator of accuracy).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor equal to
        ``correct / n_examples``; mean_loss is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.
    """
    # Putting the model in the Evaluation mode
    model = model.eval()

    losses = []
    # A tensor accumulator keeps `.double()` valid even when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for dl in data_loader:
            # Every field is read from the current batch `dl`.
            input_ids = dl["input_ids"].to(device)
            attention_mask = dl["attention_mask"].to(device)
            targets = dl["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds = outputs.argmax(dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

#### Training and Evaluating

In [73]:
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 30)

    train_acc, train_loss = training_func(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train_data),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = evaluate_model(
        model=model,
        data_loader=val_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(val_data),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    epoch_metrics = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Checkpoint the weights whenever validation accuracy improves on the best seen so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/1
------------------------------




#### Predictions

In [None]:
def predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class logits per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"question_text"``, ``"input_ids"``, ``"attention_mask"`` and
            ``"targets"`` entries.
        device: Device the batch tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters).

    Returns:
        ``(question_texts, predictions, prediction_probs, real_values)``:
        the question strings, the predicted class index per question, the
        softmax class probabilities per question (one row each), and the true
        labels. All tensors are on the CPU. Empty tensors are returned when
        ``data_loader`` yields no batches.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    question_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for dl in data_loader:
            input_ids = dl["input_ids"].to(device)
            attention_mask = dl["attention_mask"].to(device)
            targets = dl["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Convert logits to probabilities so `prediction_probs` really
            # holds probabilities (each row sums to 1).
            probs = torch.softmax(outputs, dim=1)

            question_texts.extend(dl["question_text"])
            pred_batches.append(outputs.argmax(dim=1).cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(targets.cpu())

    if not pred_batches:
        # Nothing to concatenate: return empty tensors instead of raising.
        no_labels = torch.empty(0, dtype=torch.long)
        return question_texts, no_labels, torch.empty(0, 0), no_labels.clone()

    return (
        question_texts,
        torch.cat(pred_batches),
        torch.cat(prob_batches),
        torch.cat(target_batches),
    )


#### Get Predictions

# Run inference on the test split; y_test holds the true labels read from each batch's "targets".
y_question_texts, y_pred, y_pred_probs, y_test = predictions(model,test_data_loader)

##### Printing first 5 predictions

In [None]:
# Show at most the first five test questions with their predicted class and per-class scores.
preview_rows = zip(y_question_texts[:5], y_pred[:5], y_pred_probs[:5])
for text, pred, prob in preview_rows:
    print(text, end = "   ")
    print(pred, end = "   ")
    print(prob)

#### Classification Report

In [None]:
# Compare the predicted classes with the true test labels (true labels first, predictions second).
print(classification_report(y_test, y_pred))

#### F1 Score

In [None]:
# F1 score of the predicted classes against the true test labels, using f1_score's default settings.
print(f1_score(y_test, y_pred))