In [None]:
import pandas as pd
import ast

In [None]:
from google.colab import drive

# Directory under which the Google Drive contents are mounted.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

# Directory on the mounted Drive that holds the three MedNLI JSONL splits.
MEDNLI_DIR = '/content/drive/My Drive/MedNLI_Clinical_Inference'


def _read_jsonl_lines(path):
    """Return the raw, non-blank lines of the JSONL file at ``path``.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default encoding. Blank lines (for example a trailing
    newline at the end of the file) are dropped because they are not valid
    records and would make the per-line parser fail.
    """
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        return [line for line in jsonl_file if line.strip()]


# Each list holds one raw JSON string per example; parsing happens later.
json_list_train = _read_jsonl_lines(f'{MEDNLI_DIR}/mli_train_v1.jsonl')
json_list_val = _read_jsonl_lines(f'{MEDNLI_DIR}/mli_dev_v1.jsonl')
json_list_test = _read_jsonl_lines(f'{MEDNLI_DIR}/mli_test_v1.jsonl')

In [None]:
def _extract_fields(json_lines):
    """Parse JSONL lines and return ``(sentence1, sentence2, gold_label)`` lists.

    Args:
        json_lines: Iterable of strings, each holding one JSON object with the
            keys ``'sentence1'``, ``'sentence2'`` and ``'gold_label'``.

    Returns:
        A tuple of three lists, aligned by position with ``json_lines``.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a record lacks one of the three expected keys.
    """
    # json.loads is the correct parser for JSON text; ast.literal_eval only
    # understands Python literals and rejects null/true/false.
    records = [json.loads(line) for line in json_lines]
    premises = [record['sentence1'] for record in records]
    hypotheses = [record['sentence2'] for record in records]
    gold_labels = [record['gold_label'] for record in records]
    return premises, hypotheses, gold_labels


sent1_train, sent2_train, gl_train = _extract_fields(json_list_train)
sent1_val, sent2_val, gl_val = _extract_fields(json_list_val)
sent1_test, sent2_test, gl_test = _extract_fields(json_list_test)

In [None]:
# One frame per split, with the columns in the order
# sentence1, sentence2, gold_label.
data_train = pd.DataFrame({
    'sentence1': sent1_train,
    'sentence2': sent2_train,
    'gold_label': gl_train,
})

data_val = pd.DataFrame({
    'sentence1': sent1_val,
    'sentence2': sent2_val,
    'gold_label': gl_val,
})

data_test = pd.DataFrame({
    'sentence1': sent1_test,
    'sentence2': sent2_test,
    'gold_label': gl_test,
})

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line
# after every report.
for split_frame in (data_train, data_val, data_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11232 entries, 0 to 11231
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentence1   11232 non-null  object
 1   sentence2   11232 non-null  object
 2   gold_label  11232 non-null  object
dtypes: object(3)
memory usage: 263.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1395 entries, 0 to 1394
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentence1   1395 non-null   object
 1   sentence2   1395 non-null   object
 2   gold_label  1395 non-null   object
dtypes: object(3)
memory usage: 32.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1422 entries, 0 to 1421
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentence1   1422 non-null   object
 1   sentence2   1422 non-null   object
 2   gold_label  1422 non-n

In [None]:
# Relative frequency of each gold label in the training split.
train_label_share = data_train['gold_label'].value_counts(normalize=True)
train_label_share

entailment       0.333333
contradiction    0.333333
neutral          0.333333
Name: gold_label, dtype: float64

In [None]:
# Text placed between premise (sentence1) and hypothesis (sentence2).
# Plain `+` would glue the last word of the premise onto the first word of the
# hypothesis whenever the premise does not end in punctuation or whitespace.
# NOTE(review): assumes the tokenizer applied later maps the literal "[SEP]"
# to its separator token, as BERT-style tokenizers do - confirm; the
# surrounding spaces alone already keep the boundary words apart.
SENTENCE_SEPARATOR = ' [SEP] '

for split_frame in (data_train, data_val, data_test):
    split_frame['combined_sent'] = (
        split_frame['sentence1'] + SENTENCE_SEPARATOR + split_frame['sentence2']
    )

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit the string -> integer mapping once, on the training labels only, and
# reuse it for the other splits. Re-fitting on every split would silently
# produce a different mapping for any split whose set of labels differs from
# the training set; with a single fit, transform() raises on unseen labels
# instead.
data_train['gold_label'] = le.fit_transform(data_train['gold_label'])
data_train['gold_label'] = data_train['gold_label'].astype('int64')

data_val['gold_label'] = le.transform(data_val['gold_label'])
data_val['gold_label'] = data_val['gold_label'].astype('int64')

data_test['gold_label'] = le.transform(data_test['gold_label'])
data_test['gold_label'] = data_test['gold_label'].astype('int64')


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import time
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [None]:
# Run configuration: seed, epoch count and compute device.
RANDOM_SEED = 123
NUM_EPOCHS = 5

# Seed torch's default generator and request deterministic cuDNN kernels.
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [None]:
# Pull the model input text and the integer labels out of each split as arrays.
TEXT_COLUMN, LABEL_COLUMN = 'combined_sent', 'gold_label'

train_texts, train_labels = data_train[TEXT_COLUMN].values, data_train[LABEL_COLUMN].values
valid_texts, valid_labels = data_val[TEXT_COLUMN].values, data_val[LABEL_COLUMN].values
test_texts, test_labels = data_test[TEXT_COLUMN].values, data_test[LABEL_COLUMN].values

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Every split is tokenized with the same options. Each call is independent,
# so padding=True is applied separately to each split.
tokenizer_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(list(train_texts), **tokenizer_options)
valid_encodings = tokenizer(list(valid_texts), **tokenizer_options)
test_encodings = tokenizer(list(test_texts), **tokenizer_options)

In [None]:
class MedNLIDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable collection with
    one entry per example; ``labels`` is an indexable collection of the same
    length. The dataset length is taken from ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels in a dataset object.
train_dataset = MedNLIDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = MedNLIDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = MedNLIDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Mini-batch size shared by all three loaders.
BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Pretrained DistilBERT with a 3-way classification head, placed on DEVICE
# and switched to training mode.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)
model.to(DEVICE)
model.train()

# Adam over all model parameters.
optim = torch.optim.Adam(params=model.parameters(), lr=1e-4)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``, in percent.

    The model is put into evaluation mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so the result does not
    depend on the mode the caller left it in.

    Args:
        model: A ``torch.nn.Module`` that is called as
            ``model(input_ids, attention_mask=...)`` and returns a mapping
            with a ``'logits'`` entry holding one row of class scores per
            example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor with the percentage of correctly classified
        examples, or NaN when ``data_loader`` yields no examples.
    """
    was_training = model.training
    model.eval()

    # Tensor accumulator so the final .float() also works for an empty loader.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    try:
        with torch.no_grad():
            for batch in data_loader:

                ### Prepare data
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Labels are not passed to the model: only the logits are
                # needed here, so there is no reason to compute a loss.
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs['logits']

                _, predicted_labels = torch.max(logits, 1)

                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    if num_examples == 0:
        # Accuracy is undefined without examples; avoid dividing by zero.
        return torch.tensor(float('nan'), device=device)

    return correct_pred.float()/num_examples * 100

In [None]:
# Fine-tune for NUM_EPOCHS epochs, reporting train/validation accuracy after
# each epoch and test accuracy once at the end.
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Move the batch to the compute device
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        ### Backward pass and parameter update
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Report the current batch loss on every 30th batch
        if batch_idx % 30 == 0:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')

    # Evaluation phase: eval mode, no gradient tracking.
    model.eval()

    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)

    print(f'training accuracy: {train_acc:.2f}%'
          f'\nvalid accuracy: {valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time)/60
print(f'Total Training Time: {total_min:.2f} min')

test_acc = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc:.2f}%')

Epoch: 0001/0005 | Batch 0000/0351 | Loss: 1.1013
Epoch: 0001/0005 | Batch 0030/0351 | Loss: 1.0307
Epoch: 0001/0005 | Batch 0060/0351 | Loss: 0.8365
Epoch: 0001/0005 | Batch 0090/0351 | Loss: 0.7707
Epoch: 0001/0005 | Batch 0120/0351 | Loss: 0.7100
Epoch: 0001/0005 | Batch 0150/0351 | Loss: 0.5918
Epoch: 0001/0005 | Batch 0180/0351 | Loss: 0.9455
Epoch: 0001/0005 | Batch 0210/0351 | Loss: 0.7963
Epoch: 0001/0005 | Batch 0240/0351 | Loss: 0.7874
Epoch: 0001/0005 | Batch 0270/0351 | Loss: 0.6973
Epoch: 0001/0005 | Batch 0300/0351 | Loss: 0.6928
Epoch: 0001/0005 | Batch 0330/0351 | Loss: 0.8079
training accuracy: 77.52%
valid accuracy: 71.90%
Time elapsed: 9.29 min
Epoch: 0002/0005 | Batch 0000/0351 | Loss: 0.5562
Epoch: 0002/0005 | Batch 0030/0351 | Loss: 0.8095
Epoch: 0002/0005 | Batch 0060/0351 | Loss: 0.4443
Epoch: 0002/0005 | Batch 0090/0351 | Loss: 0.5540
Epoch: 0002/0005 | Batch 0120/0351 | Loss: 0.4853
Epoch: 0002/0005 | Batch 0150/0351 | Loss: 0.4161
Epoch: 0002/0005 | Batch 018