In [2]:

from torch.utils.data import Dataset

import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
# from transformers import DistilBertTokenizer, DistilBertModel,DistilBertForSequenceClassification
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report



In [3]:
f_train = open('train5.txt', 'r+')
f_test = open('test.txt', 'r+')

train = pd.DataFrame(f_train.readlines(), columns = ['question'])
test = pd.DataFrame(f_test.readlines(), columns = ['question'])

In [4]:
train['qType'] = train.question.apply(lambda x: x.split(' ', 1)[0])
train['question'] = train.question.apply(lambda x: x.split(' ', 1)[1])
train['coarse'] = train.qType.apply(lambda x: x.split(':')[0])
test['qType'] = test.question.apply(lambda x: x.split(' ', 1)[0])
test['question'] = test.question.apply(lambda x: x.split(' ', 1)[1])
test['coarse'] = test.qType.apply(lambda x: x.split(':')[0])

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(pd.Series(train['coarse'].tolist() + test['coarse'].tolist()).values)
train['coarse'] = le.transform(train['coarse'].values)
test['coarse'] = le.transform(test['coarse'].values)

In [16]:
class FeatureDataset(Dataset):
    def __init__(self, data):
        
        question = data.loc[:, 'question']
        
        
        label = data.loc[:, 'coarse'].values
        self.label = torch.tensor(label).long()
        

#         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        
        tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], question))
        tokens_ids = list(map(tokenizer.convert_tokens_to_ids, tokens))
        tokens_ids = pad_sequences(tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
        
        masks = [[float(i > 0) for i in ii] for ii in tokens_ids]
        self.masks = torch.tensor(masks)


        self.question = torch.tensor(tokens_ids)
        
        
    
    def __len__(self):
        return len(self.question)
    
    
    def __getitem__(self, idx):
            return self.question[idx], self.masks[idx], self.label[idx]

In [17]:
train_set = FeatureDataset(train)
test_set = FeatureDataset(test)

In [10]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertClassifier, self).__init__()
        
#         self.bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 6)
#         self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, output_all_encoded_layers=False)
#         pooled_output = self.bert(tokens)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
#         proba = self.sigmoid(linear_output)
        return linear_output

In [17]:
train['coarse']

0       1
1       2
2       1
3       2
4       0
       ..
5447    2
5448    2
5449    5
5450    5
5451    2
Name: coarse, Length: 5452, dtype: int64

In [18]:
torch.cuda.empty_cache()
batch_size = 1


validation_split = .2
shuffle_dataset = True
random_seed= 42

# train_sampler = torch.utils.data.RandomSampler(train_set)
# train_dataloader = torch.utils.data.DataLoader(train_set, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)


dataset_size = len(train_set)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
valid_sampler = torch.utils.data.SubsetRandomSampler(val_indices)

train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                                sampler=valid_sampler)

print(len(train_dataloader))
print(len(validation_loader))
print(len(test_dataloader))

4362
1090
500


In [None]:
EPOCHS = 10
bert_clf = BertClassifier()
bert_clf = bert_clf.cuda()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
accuracy= 0
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        token_ids , masks, labels = token_ids.cuda(), masks.cuda(), labels.cuda()
        probas = bert_clf(token_ids, masks)
        loss_func = nn.CrossEntropyLoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if step_num%100==0:
            print('Epoch: ', epoch_num + 1)
            print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_set) / batch_size, train_loss / (step_num + 1)))
    #save the model only if the accuracy on the test set was improved
    bert_clf.eval()
    predictions = []
    for step_num, batch_data in enumerate(validation_loader):
        token_ids, masks, true_label = tuple(t for t in batch_data)
        token_ids , masks, labels = token_ids.cuda(), masks.cuda(), labels.cuda()
        probs = bert_clf(token_ids, masks)
        probs = probs.detach().cpu().numpy()
        pred_label = np.argmax(probs)
        predictions.append(pred_label==true_label.cpu().numpy())
    if sum(predictions)/len(predictions)>accuracy:
        #update accuracy and save the model
        accuracy = sum(predictions)/len(predictions)
        print('accuracy was improved')
        print('accuracy= {}'.format(accuracy))
        torch.save(bert_clf.state_dict(), 'bert_clf.pt')
    else:
        print('accuracy was not improved')
# torch.save(bert_clf.state_dict(), 'bert_clf.pt')



In [11]:
# load model
bert_clf = BertClassifier()
bert_clf.load_state_dict(torch.load('bert_clf.pt'))
bert_clf.cuda()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
    

In [25]:
predictions = []

for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, true_label = tuple(t for t in batch_data)
        token_ids , masks, true_label = token_ids.cuda(), masks.cuda(), true_label.cuda()
        probs = bert_clf(token_ids, masks)
        probs = probs.detach().cpu().numpy()
        pred_label = np.argmax(probs)
        predictions.append(pred_label==true_label.cpu().numpy())
accuracy = sum(predictions)/len(predictions)
print(accuracy)

[0.962]
