In [1]:
!pip -q install transformers

[K     |████████████████████████████████| 1.9MB 19.1MB/s 
[K     |████████████████████████████████| 890kB 40.1MB/s 
[K     |████████████████████████████████| 3.2MB 43.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
# Gets the data

%%bash
rm -r sample_data
wget -q 'https://www.dropbox.com/s/52fiyn199kgk0iq/datasets_combined.pkl?dl=1' -O 'datasets_combined.pkl'
mkdir train
cd train
wget -q 'https://www.dropbox.com/s/cnw462g0oyo28i0/lcp_single_train.tsv?dl=1' -O 'lcp_single_train.tsv'
wget -q 'https://www.dropbox.com/s/y1yoq24hzqbe5bf/lcp_multi_train.tsv?dl=1' -O 'lcp_multi_train.tsv'
cd ..
mkdir trial
cd trial
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/trial/lcp_single_trial.tsv' -O 'lcp_single_trial.tsv'
wget -q 'https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/trial/lcp_multi_trial.tsv' -O 'lcp_multi_trial.tsv'
cd ..
mkdir test
cd test
wget -q 'https://www.dropbox.com/s/mjcwx9wawealjk8/lcp_single_test.tsv?dl=1' -O 'lcp_single_test.tsv'
wget -q 'https://www.dropbox.com/s/zil1h9xp7hrhkw1/lcp_multi_test.tsv?dl=1' -O 'lcp_multi_test.tsv'

In [3]:
import pickle
import numpy as np
import pandas as pd
import re
import spacy
import torch
from torch import nn
import torch.nn.functional as F
import os
import csv
import time
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import copy
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertModel, RobertaModel, BertTokenizer, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, random_split, DataLoader, IterableDataset, ConcatDataset
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score 
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForPreTraining
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
data = pd.read_csv('train/lcp_single_train.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

In [5]:
for i, token in enumerate(list(data.token)):
  if isinstance(token, float):
    data.drop([i], inplace = True)
    # print('Yes')

In [6]:
class Dataset():
    def __init__(self, df, batch_size = 32, max_len = 128):
        self.max_len = max_len
        self.sentences = list(df.sentence)
        self.tokens = list(df.token)
        self.encoded_tokens, self.X_lengths = self.one_hot_batch(self.encode_tokens(list(df.token))[0])
        
        self.encoded_tokens = torch.FloatTensor(self.encoded_tokens)
        self.X_lengths = torch.FloatTensor(self.X_lengths)
        
        self.complexity = torch.Tensor(list(df.complexity))
        self.batch_size = batch_size
        self.train_dataloader, self.val_dataloader = self.process_data()

    def one_hot_word(self, encoded_tokens):
        identity = torch.eye(26)
        res = torch.Tensor(identity[encoded_tokens[0] - 1]).view(1, 26)
        for each in (encoded_tokens)[1:]:
          if each == 0:
            res = torch.cat([res, torch.zeros(1, 26)])
          else:
            res = torch.cat([res, identity[each - 1].view(1, 26)])
        return res

    def one_hot_batch(self, encoded_tokens):
        X_lengths = self.get_lengths(encoded_tokens)
        final_res = self.one_hot_word(encoded_tokens[0]).view(1, 16, 26)
        for i in range(encoded_tokens.shape[0] - 1):
          one_hot_res = self.one_hot_word(encoded_tokens[i + 1])
          final_res = torch.cat([final_res, one_hot_res.view(1, 16, 26)], dim = 0)
        return final_res, X_lengths

    def get_lengths(self, x):
        X_lengths = []
        for i in range(len(x)):
            num = 0
            while (num != 16 and x[i][num] != 0):
                num += 1
            X_lengths.append(num)
        return X_lengths

    def encode_tokens(self, tokens):
        n_labels = 26
        text = 'abcdefghijklmnopqrstuvwxyz'
        chars = tuple(set(text))
        intrange = set(i + 1 for i in range(26))
        int2char = dict(zip(intrange, chars))
        char2int = {ch: ii for ii, ch in int2char.items()}
        encoded_list = []
        for token in tokens:
          token = token.lower()
          encoded = np.array([char2int[ch] for ch in token])
          encoded_list.append(torch.from_numpy(encoded))
          padded_list, mask = self.padding_tensor(encoded_list)
        return padded_list, mask

    def padding_tensor(self, sequences):
        num = len(sequences)
        max_len = max([s.size(0) for s in sequences])
        out_dims = (num, max_len)
        out_tensor = sequences[0].data.new(*out_dims).fill_(0)
        mask = sequences[0].data.new(*out_dims).fill_(0)
        for i, tensor in enumerate(sequences):
            length = tensor.size(0)
            out_tensor[i, :length] = tensor
            mask[i, :length] = 1
        return out_tensor, mask
    
      
    def process_data(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        input_ids, attention_masks = [], []
        for sentence, token in zip(self.sentences, self.tokens):
            sent = str(token).lower() + ' [SEP] ' +  str(sentence).lower() 
            encoded_dict = tokenizer.encode_plus(sent,
                                                    add_special_tokens=True,
                                                    max_length=self.max_len, 
                                                    padding='max_length', 
                                                    return_attention_mask = True,
                                                    return_tensors = 'pt', 
                                                    truncation = True)
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        return self.get_dataloaders(input_ids, attention_masks)
    
    def get_dataloaders(self, input_ids, attention_masks):
        input_ids, attention_masks, self.complexity, self.encoded_tokens, self.X_lengths = sklearn.utils.shuffle(input_ids, attention_masks, self.complexity, self.encoded_tokens, self.X_lengths, random_state=42)
        train_idx, test_idx, _, _ = train_test_split(range(input_ids.shape[0]), input_ids, test_size = 0.2, random_state = 42)

        training_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], self.complexity[train_idx], self.encoded_tokens[train_idx], self.X_lengths[train_idx])
        training_sampler = RandomSampler(training_data)
        training_dataloader = DataLoader(training_data, sampler=training_sampler, batch_size=self.batch_size)

        test_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], self.complexity[test_idx], self.encoded_tokens[test_idx], self.X_lengths[test_idx])
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.batch_size)

        return training_dataloader, test_dataloader

In [7]:
start_time = time.time()
dataset = Dataset(data, batch_size = 32)
print("Time taken: " + str((time.time() - start_time)/60))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Time taken: 6.892130124568939


In [8]:
class BERT_CharLSTM(torch.nn.Module):
    def __init__(self, BERT_in, n_hidden=100, n_layers=2, seq_length=16, n_linear=500, drop_prob=0.25, lr=0.001):
        
        super(BERT_CharLSTM, self).__init__()

        # CharLSTM
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_linear = n_linear
        self.seq_length = seq_length
        self.lr = lr

        self.chars = 'abcdefghijklmnopqrstuvwxyz'
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True, bidirectional = True)
        
        self.dropout = nn.Dropout(drop_prob)
      
        self.fc = nn.Linear(2*n_hidden*seq_length, n_linear, bias = True)

        # BERT
        self.embeddings = BertModel.from_pretrained('bert-base-uncased',  output_hidden_states = True)
        self.final = nn.Linear(BERT_in, n_linear, bias = True)
        self.dropout = nn.Dropout(drop_prob)
        self.relu = nn.ReLU()

        # Combined
        self.final_1 = nn.Linear(2 * n_linear, 1, bias = True)
        self.sigmoid = nn.Sigmoid()


    def init_hidden(self, batch_size):

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
        hidden = ((torch.zeros(2 * self.n_layers, batch_size, self.n_hidden)).to(device),
                  (torch.zeros(2 * self.n_layers, batch_size, self.n_hidden)).to(device))

        return hidden

    def forward(self, x, x_mask, X_char, X_char_lengths, batch_size, hidden):

        # CharLSTM
        X = X_char
        r_output, hidden = self.lstm(X, hidden)
        out = r_output
        out = self.dropout(out)
        out = out.contiguous()
        out = self.fc(out.view(out.shape[0], -1)) 
        out = self.relu(out)
        out = out.view(out.shape[0], -1) 

        # BERT
        embed = self.embeddings(x,x_mask)[1]
        y_pred = self.relu(self.final(self.dropout(embed)))
        y_pred = y_pred.view(y_pred.shape[0], -1) 
        
        # Combined
        final_inp = torch.cat([out, y_pred], dim = 1) 
        final_inp = self.final_1(final_inp)
        final_inp = final_inp.view(final_inp.shape[0])
        final_result = self.sigmoid(final_inp)

        return final_result

In [9]:
def save_metrics(save_path, epochs, model, optimizer, L1):

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'epochs': epochs+1,
                  'L1': L1}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path, model, optimizer):
    try: 
        state_dict = torch.load(load_path, map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    except: 
        state_dict = {}

    print(f'Model loaded from <== {load_path}')
    
    return state_dict.get('epochs', 0), state_dict.get('L1', 1000)

In [10]:
def evaluate(test_dataloader, model):
    model.eval()
    total_eval_accuracy=0
    y_preds = np.array([])
    y_test = np.array([])
    total_loss = 0
    criterion = nn.L1Loss()
    hidden_charlstm = model.init_hidden(batch_size = 32)
    for batch in test_dataloader:
        hidden_charlstm = model.init_hidden(batch_size = batch[0].to(device).shape[0])
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_X_char = batch[3].to(device)
        b_X_char_lengths = batch[4].to(device)
        with torch.no_grad():
            ypred = model(b_input_ids, b_input_mask, b_X_char, b_X_char_lengths.cpu(), batch_size = 32, hidden = hidden_charlstm)        

        ypred = ypred.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

        y_preds = np.hstack((y_preds, ypred))
        y_test = np.hstack((y_test, b_labels))

    loss = np.mean(np.abs(y_preds-y_test))
    corr, _ = pearsonr(y_preds, y_test)
    return loss, y_preds, y_test, corr
 
def train(training_dataloader, validation_dataloader, model, filename, epochs = 4):
    total_steps = len(training_dataloader) * epochs
    optimizer = AdamW(model.parameters(), lr=2e-5, eps = 1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    
    criterion = nn.L1Loss()
    best_model = copy.deepcopy(model)

    hidden_charlstm = model.init_hidden(batch_size = 32)

    cur_epoch, best_l1 = load_metrics(filename, model, optimizer)
    for epoch_i in tqdm(range(0, epochs)):
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(training_dataloader):
            hidden_charlstm = model.init_hidden(batch_size = batch[0].to(device).shape[0])
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_X_char = batch[3].to(device)
            b_X_char_lengths = batch[4].to(device)
            outputs = model(b_input_ids, b_input_mask, b_X_char, b_X_char_lengths.cpu(), batch_size = 32, hidden = hidden_charlstm)
            loss = criterion(outputs, b_labels)
 
            if step%50 == 0:
                print(loss)
 
            total_train_loss += loss
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
 
        print()
        print(f'Total Train Loss = {total_train_loss}')
        print('#############    Validation Set Stats')
        l1_loss, _, _, corr = evaluate(validation_dataloader, model)
        print("  L1 loss: {}".format(l1_loss))
        print("  Pearson correlation: {}".format(corr))
 
        if l1_loss < best_l1:
            best_l1 = l1_loss
            save_metrics(filename, epoch_i, model, optimizer, l1_loss)
 

In [11]:
model = BERT_CharLSTM(768).to(device)
# print(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [12]:
train(dataset.train_dataloader, dataset.val_dataloader, model, 'lcp_sigmoid.pt')

  0%|          | 0/4 [00:00<?, ?it/s]

Model loaded from <== lcp_sigmoid.pt
tensor(0.2300, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.1276, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.1099, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0675, device='cuda:0', grad_fn=<L1LossBackward>)

Total Train Loss = 16.540884017944336
#############    Validation Set Stats
  L1 loss: 0.07069815868540975
  Pearson correlation: 0.7702301196207733


 25%|██▌       | 1/4 [02:29<07:28, 149.56s/it]

Model saved to ==> lcp_sigmoid.pt
tensor(0.0725, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0652, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0584, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0603, device='cuda:0', grad_fn=<L1LossBackward>)

Total Train Loss = 12.180423736572266
#############    Validation Set Stats
  L1 loss: 0.06400496618793038
  Pearson correlation: 0.7847085065235135


 50%|█████     | 2/4 [05:03<05:01, 151.00s/it]

Model saved to ==> lcp_sigmoid.pt
tensor(0.0708, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0513, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0614, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0328, device='cuda:0', grad_fn=<L1LossBackward>)

Total Train Loss = 10.8983736038208
#############    Validation Set Stats
  L1 loss: 0.06337800334200419
  Pearson correlation: 0.7959902627887484


 75%|███████▌  | 3/4 [07:38<02:32, 152.12s/it]

Model saved to ==> lcp_sigmoid.pt
tensor(0.0621, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0545, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0630, device='cuda:0', grad_fn=<L1LossBackward>)
tensor(0.0611, device='cuda:0', grad_fn=<L1LossBackward>)

Total Train Loss = 10.083100318908691
#############    Validation Set Stats
  L1 loss: 0.06288751905453306
  Pearson correlation: 0.7965084786993971


100%|██████████| 4/4 [10:12<00:00, 153.15s/it]

Model saved to ==> lcp_sigmoid.pt





# **TEST**

In [13]:
test_data = pd.read_csv('/content/test/lcp_single_test.tsv',  delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

In [14]:
test_data.to_csv('test_original.csv')

In [15]:
class TestDataset():
    def __init__(self, df, batch_size = 32, max_len = 128):
        self.max_len = max_len
        self.sentences = list(df.sentence)
        self.tokens = list(df.token)
        self.encoded_tokens, self.X_lengths = self.one_hot_batch(self.encode_tokens(list(df.token))[0])
        
        self.encoded_tokens = torch.FloatTensor(self.encoded_tokens)
        self.X_lengths = torch.FloatTensor(self.X_lengths)
        
        self.batch_size = batch_size
        self.test_dataloader = self.process_data()

    def one_hot_word(self, encoded_tokens):
        identity = torch.eye(26)
        res = torch.Tensor(identity[encoded_tokens[0] - 1]).view(1, 26)
        for each in (encoded_tokens)[1:]:
          if each == 0:
            res = torch.cat([res, torch.zeros(1, 26)])
          else:
            res = torch.cat([res, identity[each - 1].view(1, 26)])
        return res

    def one_hot_batch(self, encoded_tokens):
        X_lengths = self.get_lengths(encoded_tokens)
        final_res = self.one_hot_word(encoded_tokens[0]).view(1, 16, 26)
        for i in range(encoded_tokens.shape[0] - 1):
          one_hot_res = self.one_hot_word(encoded_tokens[i + 1])
          final_res = torch.cat([final_res, one_hot_res.view(1, 16, 26)], dim = 0)
        return final_res, X_lengths

    def get_lengths(self, x):
        X_lengths = []
        for i in range(len(x)):
            num = 0
            while (num != 16 and x[i][num] != 0):
                num += 1
            X_lengths.append(num)
        return X_lengths

    def encode_tokens(self, tokens):
        n_labels = 26
        text = 'abcdefghijklmnopqrstuvwxyz'
        chars = tuple(set(text))
        intrange = set(i + 1 for i in range(26))
        int2char = dict(zip(intrange, chars))
        char2int = {ch: ii for ii, ch in int2char.items()}

        encoded_list = []
        for token in tokens:
          token = token.lower()
          encoded = np.array([char2int[ch] for ch in token])
          encoded_list.append(torch.from_numpy(encoded))
          padded_list, mask = self.padding_tensor(encoded_list)
        return padded_list, mask

    def padding_tensor(self, sequences):
        num = len(sequences)
        max_len = max([s.size(0) for s in sequences])
        out_dims = (num, max_len)
        out_tensor = sequences[0].data.new(*out_dims).fill_(0)
        mask = sequences[0].data.new(*out_dims).fill_(0)
        for i, tensor in enumerate(sequences):
            length = tensor.size(0)
            out_tensor[i, :length] = tensor
            mask[i, :length] = 1
        return out_tensor, mask
    
      
    def process_data(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        input_ids, attention_masks = [], []
        for sentence, token in zip(self.sentences, self.tokens):
            sent = str(token).lower() + ' [SEP] ' +  str(sentence).lower() 
            encoded_dict = tokenizer.encode_plus(sent,
                                                    add_special_tokens=True,
                                                    max_length=self.max_len, 
                                                    padding='max_length', 
                                                    return_attention_mask = True,
                                                    return_tensors = 'pt', 
                                                    truncation = True)
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        return self.get_dataloaders(input_ids, attention_masks)
    
    def get_dataloaders(self, input_ids, attention_masks):
        test_data = TensorDataset(input_ids, attention_masks, self.encoded_tokens, self.X_lengths)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.batch_size)
        return test_dataloader

In [16]:
for i, word in enumerate(list(test_data.token)):
  ''.join(e for e in word if e.isalnum())
  if len(word) > 16:
    test_data.token[i] = word[:16]

In [17]:
test_dataset = TestDataset(test_data, batch_size = 32)

In [18]:
def evaluate(test_dataloader, model):
    model.eval()
    y_preds = np.array([])
    total_loss = 0
    hidden_charlstm = model.init_hidden(batch_size = 32)
    for batch in test_dataloader:
        hidden_charlstm = model.init_hidden(batch_size = batch[0].to(device).shape[0])
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_X_char = batch[2].to(device)
        b_X_char_lengths = batch[3].to(device)
        with torch.no_grad():
            ypred = model(b_input_ids, b_input_mask, b_X_char, b_X_char_lengths.cpu(), batch_size = 32, hidden = hidden_charlstm)        
        ypred = ypred.to('cpu').numpy()

    return y_preds

In [20]:
model = BERT_CharLSTM(768).to(device)

In [21]:
state_dict = torch.load(f = '/content/lcp_sigmoid.pt', map_location=device)
model.load_state_dict(state_dict['model_state_dict'])

<All keys matched successfully>

In [22]:
FINAL_PREDS = evaluate(test_dataset.test_dataloader, model)

In [23]:
results_dataset = pd.DataFrame()

In [24]:
results_dataset['complexity'] = list(FINAL_PREDS)

In [25]:
results_dataset.to_csv('results_latest.csv')