In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter

# Select the compute device: use CUDA when available, otherwise fall back to the CPU.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

    # Index of the currently selected CUDA device. This is only queried on the
    # GPU branch: asking for the current CUDA device on a machine without CUDA
    # raises instead of returning a value, which would abort the cell right
    # after the CPU fallback was chosen.
    print(torch.cuda.current_device())

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1080
0


In [2]:
import numpy as np
from tqdm.notebook import trange, tqdm
from transformers import BertTokenizer
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, IterableDataset
import random
import string

# Fetch the pretrained uncased BERT WordPiece tokenizer; the resulting
# module-level `tokenizer` is what the dataset and checkpoint code below use.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


# Old dataset

In [3]:
class TenFingerDatasetOld(Dataset):
    """Map-style dataset that turns lines of text into "finger typing" samples.

    Every line of ``data_path`` is one sample. When a sample is fetched, a
    random word index splits the line into a *pre-context* (tokenized with the
    tokenizer) and a *typing* part in which each character is replaced by the
    id of the finger that would type it. The model input is::

        [CLS] context_token_ids [SEP] finger_ids [SEP] <padding>

    and the labels are the real characters (via ``char2label``) at the finger
    positions, ``ignore_label_id`` everywhere else.

    Args:
        data_path: Text file with one sample per line.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and a special-token counter (``num_special_tokens_to_add`` or the
            older ``num_added_tokens``).
        char2label: Callable mapping a character to its integer label.
        label2char: Callable mapping a label back to a character (stored only).
        max_seq_length: Fixed length every returned list is padded to.
        ignore_label_id: Label value for positions that carry no target.
        pad_token: Id used to pad ``input_ids``.
        debug: When true, print every intermediate list for each sample.
    """

    # Keyboard letter -> finger index (1-8). Built once for the class instead of
    # on every character lookup.
    _CHAR2FINGER = {
        'q': 1, 'a': 1, 'z': 1,
        'w': 2, 's': 2, 'x': 2,
        'e': 3, 'd': 3, 'c': 3,
        'r': 4, 'f': 4, 'v': 4,
        't': 4, 'g': 4, 'b': 4,
        'y': 5, 'h': 5, 'n': 5,
        'u': 5, 'j': 5, 'm': 5,
        'i': 6, 'k': 6,
        'o': 7, 'l': 7,
        'p': 8,
    }

    def __init__(self, data_path, tokenizer, char2label, label2char,
                 max_seq_length=256, ignore_label_id=-100,
                 pad_token=0, debug=False):
        self.char2label = char2label
        self.label2char = label2char
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        # Special tokens the tokenizer adds to a single sequence, plus one for
        # the extra [SEP] that closes the typing segment. Prefer the newer
        # method name when the tokenizer has it and fall back to the older one.
        count_special = getattr(tokenizer, "num_special_tokens_to_add", None)
        if count_special is None:
            count_special = tokenizer.num_added_tokens
        self.special_tokens_count = count_special() + 1
        self.ignore_label_id = ignore_label_id
        self.pad_token = pad_token
        self.debug = debug

        with open(data_path) as f:
            self.samples = [line.strip() for line in f]

    def __len__(self):
        """Number of lines (samples) read from the data file."""
        return len(self.samples)

    def __getitem__(self, index):
        """Build a (randomly split) sample from the line at ``index``."""
        return self.strToSample(self.samples[index])

    def char2finger(self, c):
        """Map a character to its finger input id.

        Space -> 1, mapped letters -> 2..9, anything else -> 10. The original
        comments call these the "unused 0-9" slots, presumably BERT's
        ``[unused*]`` vocabulary ids; confirm against the vocabulary in use.
        """
        if c == ' ':
            return 1  # unused 0
        finger = self._CHAR2FINGER.get(c)
        if finger is None:
            return 10  # unused 9
        return 1 + finger  # unused 1 - 8

    def strToSample(self, content):
        """Convert one line of text into padded model inputs.

        Returns:
            ``(input_ids, input_mask, segment_ids, label_ids)``, four Python
            lists each of length ``max_seq_length``.
        """
        budget = self.max_seq_length - self.special_tokens_count

        tokens = content.split()
        # Randomly select the word index where typing starts; index 0 gets two
        # extra entries so that "no pre-context" is drawn more often.
        typing_start = random.choice(
            list(range(len(tokens))) + [0] * 2)
        # The pre-context of the sample. Use the tokenizer this dataset was
        # given rather than whatever global named `tokenizer` happens to exist.
        pre_tokens = tokens[:typing_start]
        pre_tokens = self.tokenizer.tokenize(' '.join(pre_tokens))

        typing_text = ' '.join(tokens[typing_start:])
        typing_seq = [self.char2finger(c) for c in typing_text]

        # If the typing sequence alone exceeds the budget: cut it to the budget,
        # drop the last (possibly partial) word and give up the pre-context.
        if len(typing_seq) > budget:
            typing_text = typing_text[:budget]
            typing_text = ' '.join(typing_text.split()[:-1])
            typing_seq = [self.char2finger(c) for c in typing_text]
            pre_tokens = []

        # Otherwise, if context + typing is too long, drop the earliest context
        # tokens so that the ones nearest the typing part are kept.
        extra = len(pre_tokens) + len(typing_seq) - budget
        if extra > 0:
            pre_tokens = pre_tokens[extra:]

        # The sample format:
        # [precontext] what is your [typing] k e y
        # [CLS] token_id token_id token_id [SEP] finger_id finger_id finger_id [SEP]

        pre_ids = self.tokenizer.convert_tokens_to_ids(['[CLS]'] + pre_tokens + ['[SEP]'])
        input_ids = pre_ids + typing_seq + self.tokenizer.convert_tokens_to_ids(['[SEP]'])

        label_ids = len(pre_ids) * [self.ignore_label_id] + \
            [self.char2label(c) for c in typing_text] + \
            [self.ignore_label_id]

        segment_ids = len(pre_ids) * [0] + (len(typing_text) + 1) * [1]
        input_mask = len(label_ids) * [1]

        padding_len = self.max_seq_length - len(input_ids)
        input_ids += [self.pad_token] * padding_len
        # Padding must be masked out (0) and stay in segment 0 regardless of
        # which id pads input_ids; reusing pad_token here was only correct
        # because its default happens to be 0.
        input_mask += [0] * padding_len
        segment_ids += [0] * padding_len
        label_ids += [self.ignore_label_id] * padding_len

        if self.debug:
            print('typing text: %s' % typing_text)
            print("tokens: ", " ".join([str(x) for x in pre_tokens]))
            print("pre_ids: ", " ".join([str(x) for x in pre_ids]))
            print("input_ids: ", " ".join([str(x) for x in input_ids]))
            print("input_mask: ", " ".join([str(x) for x in input_mask]))
            print("segment_ids: ", " ".join([str(x) for x in segment_ids]))
            print("label_ids: ", " ".join([str(x) for x in label_ids]))

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(label_ids) == self.max_seq_length

        return input_ids, input_mask, segment_ids, label_ids

In [5]:
def collate(batch):
    """Merge per-sample feature lists into batch tensors.

    ``batch`` is a sequence of ``(input_ids, input_mask, segment_ids,
    label_ids)`` tuples of equal-length lists. Each field is stacked into one
    ``torch.LongTensor`` of shape (batch_size x max_len); no transpose is done.
    Returns the four tensors as a tuple in the same field order.
    """
    ids_col, mask_col, segment_col, label_col = zip(*batch)
    return tuple(
        torch.LongTensor(column)
        for column in (ids_col, mask_col, segment_col, label_col)
    )

# **Load** the Dataset

In [6]:
from torch.utils.data import DataLoader
from tqdm.notebook import trange, tqdm

# Character -> label id for the 26 lowercase letters followed by space.
# Built once at definition time instead of on every call (this function is
# invoked once per character of every sample).
_CHAR2LABEL = {c: i for i, c in enumerate(string.ascii_lowercase + ' ')}


def char2label(ch):
    """Return the label id of ``ch``.

    Lowercase letters map to 0-25 and space to 26; any other value maps to the
    shared "other" label, which equals the size of the table (27).
    """
    return _CHAR2LABEL.get(ch, len(_CHAR2LABEL))

# Label id -> character for the 26 lowercase letters followed by space.
# Built once at definition time instead of on every call.
_LABEL2CHAR = dict(enumerate(string.ascii_lowercase + ' '))


def label2char(ii):
    """Return the character for label id ``ii``.

    Ids 0-25 give 'a'-'z' and 26 gives space; every other id (including the
    "other" label and negative ignore ids) gives '*'.
    """
    return _LABEL2CHAR.get(ii, '*')

# Training corpus: one sample per line of the Yelp/Amazon text file.
train_dataset = TenFingerDatasetOld(
    data_path="data/yelpamazon.txt",
    tokenizer=tokenizer,
    char2label=char2label,
    label2char=label2char,
)

# Held-out corpus that evaluate() reads through the global `test_dataset`.
test_dataset = TenFingerDatasetOld(
    data_path="data/testmovie.txt",
    tokenizer=tokenizer,
    char2label=char2label,
    label2char=label2char,
)



# Train & Validate


In [7]:
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    BertForTokenClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

# Checkpoint directory the config and weights are loaded from; the trailing
# step number is also parsed by train() when it resumes.
prepath = 'Models/local/checkpoint-730000/'

# One output class per lowercase letter and space, plus one more class that
# char2label assigns to every other character.
config = AutoConfig.from_pretrained(
    prepath,
    num_labels=len(string.ascii_lowercase + ' ') + 1,
    cache_dir=None,
)

model = BertForTokenClassification.from_pretrained(prepath, config=config)

# Move the weights onto the device selected in the first cell.
model.to(device)

import os
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    datetime.timedelta (e.g. 1:01:01), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [8]:
from seqeval.metrics import f1_score, precision_score, recall_score

def evaluate(model, pad_token_label_id, num_workers=0, batch_size=24, prefix=""):
    """Run the model over the global ``test_dataset`` and score its predictions.

    Args:
        model: Token-classification model; called with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels`` and expected
            to return ``(loss, logits, ...)``.
        pad_token_label_id: Label value marking positions to leave out of scoring.
        num_workers: Worker processes for the evaluation ``DataLoader``.
        batch_size: Evaluation batch size.
        prefix: Free text included in the printed banners.

    Returns:
        ``(results, preds_list)``. ``results`` is a dict with ``loss`` (mean
        per-batch loss), ``precision``, ``recall`` and ``f1``. ``preds_list``
        holds, per sample, the predicted characters at the scored positions.
    """
    test_data = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True,
                           num_workers=num_workers, collate_fn=collate)
    # Eval!
    # These messages were previously passed logger-style ("fmt", arg), which
    # made print() emit the raw "%s"/"%d"; format them explicitly instead.
    print("***** Running evaluation %s *****" % prefix)
    # len(DataLoader) is the number of batches; report samples instead.
    print("  Num examples = %d" % len(test_dataset))
    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch results are collected in lists and joined once at the end;
    # np.append in the loop re-copied everything accumulated so far each time.
    pred_chunks = []
    label_chunks = []
    model.eval()
    for batch in tqdm(test_data, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "token_type_ids": batch[2], "labels": batch[3]}

            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        # Reduce logits to class ids right away so only integer predictions,
        # not the full logits of the whole test set, are kept in memory.
        pred_chunks.append(np.argmax(logits.detach().cpu().numpy(), axis=2))
        label_chunks.append(inputs["labels"].detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(pred_chunks, axis=0)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    out_label_list = []
    preds_list = []
    for label_row, pred_row in zip(out_label_ids, preds):
        # Keep only positions that carry a real label.
        scored = label_row != pad_token_label_id
        out_label_list.append([label2char(x) for x in label_row[scored]])
        preds_list.append([label2char(x) for x in pred_row[scored]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    print("***** Eval results %s *****" % prefix)
    for key in sorted(results.keys()):
        print("  %s = %s" % (key, str(results[key])))

    return results, preds_list

In [9]:
def _save_checkpoint(model, optimizer, scheduler, output_dir):
    """Write model, tokenizer, optimizer and scheduler state into ``output_dir``."""
    os.makedirs(output_dir, exist_ok=True)
    # Wrapped (parallel/distributed) models expose the real model as `.module`.
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
    print("Saving optimizer and scheduler states to %s" % output_dir)


def train(model, log_dir, outdir_prefix='',
          num_workers=0, batch_size=28, pad_token_label_id=-100,
          gradient_accumulation_steps=1,
          num_train_epochs=2.0, learning_rate=5e-5, model_name_or_path="",
          logging_steps=1000, testing_steps=10000, save_steps=10000):
    """Fine-tune ``model`` on the global ``train_dataset``.

    Args:
        model: Model to train; called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` and expected to return the loss
            as its first output.
        log_dir: Directory for TensorBoard event files.
        outdir_prefix: Directory under which ``checkpoint-<step>`` folders are
            written.
        num_workers: Worker processes for the training ``DataLoader``.
        batch_size: Training batch size (also used for periodic evaluation).
        pad_token_label_id: Ignore-label value forwarded to ``evaluate``.
        gradient_accumulation_steps: Batches accumulated per optimizer update.
        num_train_epochs: Number of epochs (truncated to an int for the loop).
        learning_rate: Peak AdamW learning rate.
        model_name_or_path: Checkpoint directory to resume from. If it exists,
            the optimizer/scheduler states in it are loaded when present and
            the global step is parsed from a trailing ``-<step>`` in the path.
        logging_steps: Log lr/loss to TensorBoard every this many updates.
        testing_steps: Run ``evaluate`` every this many updates.
        save_steps: Save a checkpoint every this many updates.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the mean loss
        per optimizer update performed during this call.
    """
    tb_writer = SummaryWriter(log_dir=log_dir)
    # Close the writer even if training is interrupted or fails.
    try:
        # NOTE(review): batches are drawn in file order (no shuffle). The
        # resume logic below skips a prefix of the epoch, which only lines up
        # with already-seen data under a fixed order; confirm before changing.
        train_data = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True,
                                num_workers=num_workers, collate_fn=collate)

        updates_per_epoch = len(train_data) // gradient_accumulation_steps
        t_total = int(updates_per_epoch * num_train_epochs)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=5, num_training_steps=t_total)

        # Check if saved optimizer and scheduler states exist
        optimizer_path = os.path.join(model_name_or_path, "optimizer.pt")
        scheduler_path = os.path.join(model_name_or_path, "scheduler.pt")
        if os.path.isfile(optimizer_path) and os.path.isfile(scheduler_path):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(optimizer_path))
            scheduler.load_state_dict(torch.load(scheduler_path))

        print("***** Running training *****")
        # len(DataLoader) is the number of batches; report samples instead.
        print("  Num examples = %d" % len(train_dataset))
        print("  Num Epochs = %d" % num_train_epochs)
        print("  Total optimization steps = %d" % t_total)

        global_step = 0
        epochs_trained = 0
        batches_to_skip = 0

        # Check if continuing training from a checkpoint
        if os.path.exists(model_name_or_path):
            # Set global_step to the global_step of the last saved checkpoint,
            # taken from the number after the final "-" in the path.
            try:
                global_step = int(model_name_or_path.split("-")[-1].split("/")[0])
            except ValueError:
                global_step = 0
            # Guard against a loader with fewer batches than one accumulation window.
            per_epoch = max(1, updates_per_epoch)
            epochs_trained = global_step // per_epoch
            steps_trained_in_current_epoch = global_step % per_epoch
            # The loop below skips *batches*, while the checkpoint counts
            # *optimizer updates*; one update consumes
            # gradient_accumulation_steps batches.
            batches_to_skip = steps_trained_in_current_epoch * gradient_accumulation_steps

            print("  Continuing training from checkpoint, will skip to saved global_step")
            print("  Continuing training from epoch %d" % epochs_trained)
            print("  Continuing training from global step %d" % global_step)
            print("  Will skip the first %d steps in the first epoch" % steps_trained_in_current_epoch)

        # Updates already done before this call; excluded from the returned average.
        start_step = global_step

        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(num_train_epochs), desc="Epoch")

        for _ in train_iterator:
            # Per-epoch timer and loss so the epoch summary is not cumulative.
            epoch_start = time.time()
            epoch_loss, epoch_batches = 0.0, 0
            epoch_iterator = tqdm(train_data, desc="Iteration")

            for step, batch in enumerate(epoch_iterator):
                # Skip past any already trained batches if resuming training
                if batches_to_skip > 0:
                    batches_to_skip -= 1
                    continue

                # evaluate() switches the model to eval mode, so re-enable
                # training mode on every step.
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                          "token_type_ids": batch[2], "labels": batch[3]}

                outputs = model(**inputs)
                loss = outputs[0]  # the loss is the first model output
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()

                loss_value = loss.item()
                tr_loss += loss_value
                # Undo the accumulation scaling to track the true batch loss.
                epoch_loss += loss_value * gradient_accumulation_steps
                epoch_batches += 1

                if (step + 1) % gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if global_step % logging_steps == 0:
                        tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                        tb_writer.add_scalar(
                            "loss", (tr_loss - logging_loss) / logging_steps, global_step)
                        logging_loss = tr_loss

                    if global_step % testing_steps == 0:
                        # Log metrics
                        torch.cuda.empty_cache()
                        print("eval step ", global_step)
                        results, _ = evaluate(model, pad_token_label_id,
                                              num_workers=2,
                                              batch_size=batch_size)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                    if global_step % save_steps == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(
                            outdir_prefix, "checkpoint-{}".format(global_step))
                        _save_checkpoint(model, optimizer, scheduler, output_dir)

            avg_train_loss = epoch_loss / max(1, epoch_batches)
            training_time = format_time(time.time() - epoch_start)
            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epoch took: {:}".format(training_time))
    finally:
        tb_writer.close()

    # Average over the updates done in this call; dividing by the absolute
    # global_step would under-report the loss after resuming and would fail
    # when no update ran at all.
    session_steps = global_step - start_step
    return global_step, tr_loss / max(1, session_steps)

# Begin Training


In [None]:
# Clear PyTorch's cached GPU memory before starting the run.
torch.cuda.empty_cache()

# TensorBoard logs for this run go to logs/<logname>.
logname = 'yelpamazon_lr0.3/'
log_dir = 'logs/'+logname
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Resume from the checkpoint in `prepath`; new checkpoints go under Models/local.
global_step, tr_loss = train(
    model,
    log_dir=log_dir,
    outdir_prefix='Models/local',
    model_name_or_path=prepath,
    num_workers=2,
    batch_size=12,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    logging_steps=100,
    testing_steps=10000,
    save_steps=10000,
)
print(" global_step = %s, average loss = %s" % (global_step, tr_loss) )