In [31]:
import torch 
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *
import examples_to_features

# Activate INFO-level logging to get more detail on what the libraries are doing.
import logging

logging.basicConfig(level=logging.INFO)
# Later cells log through `logger`; bind it explicitly to the root logger so the
# notebook does not rely on the name leaking in through `from tools import *`.
logger = logging.getLogger()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
# The input data dir. Should contain the .tsv files (or the other data files) for the task.
DATA_DIR = "data/"

# Name of the pre-trained XLNet checkpoint to fine-tune.
XLNET_MODEL = 'xlnet-large-cased'

# The name of the task to train. I'm going to name this 'yelp'.
TASK_NAME = 'yelp'

# The output directory where the fine-tuned model and the checkpoints will be written.
OUTPUT_DIR = f'outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written.
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'

# Passed as `cache_dir` when the pre-trained model is loaded.
CACHE_DIR = 'cache/'

# The maximum total input sequence length after tokenization
# (the XLNet tokenizer loads a SentencePiece model, not WordPiece).
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128

# Training hyper-parameters.
TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
# Fraction of the total optimization steps intended for learning-rate warm-up.
WARMUP_PROPORTION = 0.1
# 'classification' or 'regression'; selects the label dtype and the loss function.
OUTPUT_MODE = 'classification'

# File names for the model configuration and weights
# (presumably used when saving the fine-tuned model — TODO confirm).
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

In [40]:
# Lower-case aliases of the configuration constants.
output_mode = OUTPUT_MODE
cache_dir = CACHE_DIR

# RANDOM_SEED was defined but never applied. Seed PyTorch's generators so that
# random operations (e.g. the shuffling done by the training sampler) are
# repeatable from one run to the next.
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

In [41]:
# Give every run its own numbered sub-directory, <REPORTS_DIR>/report_<N>, where N is
# the number of entries already present in the reports directory. A missing directory
# and an existing-but-empty one are handled the same way (both yield report_0).
# NOTE: this cell rebinds REPORTS_DIR, so re-running it without re-running the
# configuration cell nests a further report_<N> level.
os.makedirs(REPORTS_DIR, exist_ok=True)
REPORTS_DIR = os.path.join(REPORTS_DIR, f'report_{len(os.listdir(REPORTS_DIR))}')
os.makedirs(REPORTS_DIR)

In [42]:
# Create the output directory on first use; refuse to continue if it already
# holds files, so an earlier run's model is never overwritten.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
elif os.listdir(OUTPUT_DIR):
    raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))

In [43]:
# Use our BinaryClassificationProcessor to load the training data from DATA_DIR
# and get everything ready for the tokenization step.
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
# Number of training examples; reused for the step count, progress bars and logging.
train_examples_len = len(train_examples)


In [44]:
label_list = processor.get_labels() # presumably [0, 1] for binary classification — TODO confirm against the processor
# Number of distinct labels; passed to the model as `num_labels`.
num_labels = len(label_list)

In [45]:
# Optimizer updates per epoch: examples / batch size / accumulation steps,
# truncated to a whole number, then scaled by the number of epochs.
steps_per_epoch = int(train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS)
num_train_optimization_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

In [46]:
# Load the pre-trained tokenizer (vocabulary) for the checkpoint selected in
# XLNET_MODEL, so the tokenizer and the model can never get out of sync.
# (The variable keeps its original spelling because other cells reference it.)
tokenzer = XLNetTokenizer.from_pretrained(XLNET_MODEL, do_lower_case=False)

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model not found in cache, downloading to /tmp/tmp2u53rioy
100%|██████████| 798011/798011 [00:07<00:00, 102529.70B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmp2u53rioy to cache at /home/achintya/.cache/torch/pytorch_transformers/5b125ba222ff82664771f63cd8fac9696c24b403fc1ab720d537fe2ceaaf0576.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
INFO:pytorch_transformers.file_utils:creating metadata file for /home/achintya/.cache/torch/pytorch_transformers/5b125ba222ff82664771f63cd8fac9696c24b403fc1ab720d537fe2ceaaf0576.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmp2u53rioy
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model from cache at /home/achintya/.cache/torch/pytorch_transform

In [47]:
# Map every label to its position in label_list.
label_map = {label: index for index, label in enumerate(label_list)}
# Bundle each example with the shared conversion arguments into a single tuple,
# so each example can be handed to the converter as one argument.
shared_args = (label_map, MAX_SEQ_LENGTH, tokenzer, OUTPUT_MODE)
train_examples_for_processing = [(example, *shared_args) for example in train_examples]

In [11]:
# Convert every example into model features inside a worker pool,
# showing a notebook progress bar while the results stream back in order.
process_count = 1
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as worker_pool:
        feature_stream = worker_pool.imap(
            examples_to_features.convert_example_to_feature,
            train_examples_for_processing,
        )
        progress_bar = tqdm_notebook(feature_stream, total=train_examples_len)
        train_features = list(progress_bar)


Preparing to convert 140000 examples..
Spawning 1 processes..


HBox(children=(IntProgress(value=0, max=140000), HTML(value='')))




In [12]:
# Cache the converted features on disk so the slow conversion step can be skipped later.
train_features_path = DATA_DIR + "train_features.pkl"
with open(train_features_path, "wb") as pickle_file:
    pickle.dump(train_features, pickle_file)

In [48]:
# Reload the cached features from disk.
# NOTE: pickle can execute arbitrary code on load; only read files this notebook wrote.
cached_features_path = DATA_DIR + 'train_features.pkl'
with open(cached_features_path, 'rb') as pickle_file:
    train_features = pickle.load(pickle_file)

# Fine-Tuning XLNet (at last)

In [None]:
# Load the pre-trained weights for the checkpoint named in XLNET_MODEL,
# configured for `num_labels` output labels and cached under CACHE_DIR.
model = XLNetForSequenceClassification.from_pretrained(
    XLNET_MODEL,
    cache_dir=CACHE_DIR,
    num_labels=num_labels,
)
# Alternative kept for reference: load from a local archive instead of a model name.
#model = XLNetForSequenceClassification.from_pretrained(CACHE_DIR + 'cased_base_xlnet_pytorch.tar.gz', cache_dir=CACHE_DIR, num_labels=num_labels)

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json not found in cache, downloading to /tmp/tmpgthyygsw
100%|██████████| 467/467 [00:00<00:00, 103048.19B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmpgthyygsw to cache at /home/achintya/.cache/torch/pytorch_transformers/df92a75c0ebbeb195065fe16fafa54ccd72e8362692cca884303a56788bd4bfc.acec4843da33f770b626225f5baf090adb2b85df67fe0c1619072f54ac439f54
INFO:pytorch_transformers.file_utils:creating metadata file for /home/achintya/.cache/torch/pytorch_transformers/df92a75c0ebbeb195065fe16fafa54ccd72e8362692cca884303a56788bd4bfc.acec4843da33f770b626225f5baf090adb2b85df67fe0c1619072f54ac439f54
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmpgthyygsw
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json from cache at /home/achintya/.cache/torch/pytorch_transfo

In [16]:
# Move the model to the selected device (GPU when available, otherwise CPU).
# As the last expression of the cell, the returned value is displayed as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediat

In [17]:
param_optimizer = list(model.named_parameters())
# Biases and layer-normalisation parameters are exempt from weight decay.
# The original patterns only covered the BERT spelling ('LayerNorm.*'); XLNet
# modules in pytorch_transformers name the attribute 'layer_norm', so without
# the extra patterns XLNet's layer-norm weights would silently be decayed.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']

# Split the parameters in a single pass, preserving their original order.
decay_params, no_decay_params = [], []
for n, p in param_optimizer:
    if any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        decay_params.append(p)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0}
    ]

In [18]:
# AdamW without bias correction.
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
# `warmup_steps` is a number of optimizer steps, not a fraction. Passing the raw
# proportion (0.1) made the warm-up end after the very first step, so convert
# the proportion into a step count first.
num_warmup_steps = int(WARMUP_PROPORTION * num_train_optimization_steps)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_train_optimization_steps)

In [19]:
# Training bookkeeping. The training loop increments `global_step` (singular);
# the counter was previously initialised under the misspelt name `global_steps`,
# which made the loop fail with a NameError on its first optimizer update.
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [20]:
# Log a short summary of the upcoming training run.
logger.info("***** Running training *****")
run_summary = (
    ("  Num examples = %d", train_examples_len),
    ("  Batch size = %d", TRAIN_BATCH_SIZE),
    ("  Num steps = %d", num_train_optimization_steps),
)
for message, value in run_summary:
    logger.info(message, value)

# Stack the per-example feature fields into one long tensor per field.
all_input_ids, all_input_mask, all_segment_ids = (
    torch.tensor([getattr(feature, field) for feature in train_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids")
)

# Labels are integer class ids for classification and floats for regression;
# for any other OUTPUT_MODE no label tensor is built (same as the if/elif form).
label_dtype = {"classification": torch.long, "regression": torch.float}.get(OUTPUT_MODE)
if label_dtype is not None:
    all_label_ids = torch.tensor([feature.label_id for feature in train_features], dtype=label_dtype)


INFO:root:***** Running training *****
INFO:root:  Num examples = 140000
INFO:root:  Batch size = 24
INFO:root:  Num steps = 5833


In [21]:
# Bundle the feature tensors into one dataset; each item is the tuple
# (input_ids, input_mask, segment_ids, label_ids) in exactly this order.
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# Draw the training examples in random order.
train_sampler = RandomSampler(train_data)
# Serve the sampled examples in batches of TRAIN_BATCH_SIZE.
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [None]:
# Fine-tuning loop: forward pass, loss, backward pass, and an optimizer update
# every GRADIENT_ACCUMULATION_STEPS batches.

# Build the loss function once instead of on every batch, and fail early with a
# clear message when OUTPUT_MODE is not supported.
if OUTPUT_MODE == "classification":
    loss_fct = CrossEntropyLoss()
elif OUTPUT_MODE == "regression":
    loss_fct = MSELoss()
else:
    raise ValueError("Unsupported OUTPUT_MODE: {!r}".format(OUTPUT_MODE))

# Counts optimizer updates across all epochs; initialised here so the cell does
# not depend on a counter defined (under a different name) in another cell.
global_step = 0

model.train()
# Drop any gradients left over from an earlier, interrupted run of this cell.
optimizer.zero_grad()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # Keyword arguments keep the call independent of the positional order of
        # `forward`, which differs between pytorch_transformers releases.
        # NOTE(review): XLNet's `input_mask` argument treats 1 as padding (the
        # inverse of `attention_mask`). If the features use the BERT convention
        # (1 = real token) this must become `attention_mask=input_mask` —
        # confirm against examples_to_features.
        outputs = model(input_ids, token_type_ids=segment_ids, input_mask=input_mask, labels=None)
        # pytorch_transformers models return a tuple whose first element holds
        # the logits when no labels are passed.
        logits = outputs[0] if isinstance(outputs, tuple) else outputs

        if OUTPUT_MODE == "classification":
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        else:
            loss = loss_fct(logits.view(-1), label_ids.view(-1))

        # Scale the loss so the accumulated gradient matches one large batch.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        print("\r%f" % loss.item(), end='')

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # Update the weights first, then advance the learning-rate schedule;
            # the reverse order skips the first value of the schedule.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1