In [1]:
import os
import wget
import time
import yaml
import glob
import torch
import random
import inspect
import logging
import datetime
import argparse
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import AdamW, GPT2Config, get_linear_schedule_with_warmup
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, WEIGHTS_NAME, CONFIG_NAME

from language_modeling import LMDataset, LMProcessor
from utils import read_yaml, set_seed, format_time, filter_args, get_device, save, check_folder, save_yaml
from processors import DataProcessor, ModelProcessor
from reporting import Report
from dataset import Dataset

In [2]:
# YAML configuration file that read_yaml() loads into `parameters` in the next cell.
yaml_file = "fine_tuning_template.yml"

In [3]:
# Load the run configuration from the YAML file named in `yaml_file`.
parameters = read_yaml(yaml_file)

# NOTE(review): check_folder presumably makes sure the output directory is
# usable before anything is written into it — confirm in utils.
check_folder(parameters['output_dir'])

# Write the loaded parameters to `config.yml` inside the output directory.
save_yaml(
    parameters,
    os.path.join(parameters['output_dir'], 'config.yml'),
)

# Route INFO-level (and above) records to the configured log file inside the
# output directory; the file is opened with mode 'w+'.
logging.basicConfig(
    level=logging.INFO,
    filename=os.path.join(parameters['output_dir'], parameters['log_file']),
    filemode='w+',
)
logging.info("Parameters fetched.")


In [4]:
# --- Seeding -----------------------------------------------------------------
# Apply the configured seed before any model or data work happens.
logging.info("Setting seed for reproductibility...")
set_seed(parameters['seed'])
logging.info("\tDone.")

# --- Device and task ---------------------------------------------------------
# `device` is reused when the model is moved with model.to(device); `task` is
# the lower-cased task name that the task checks in later cells compare against.
logging.info("Set and retrieve the device on which to run...")
device = get_device()
task = parameters['task'].lower()
logging.info("\tDone.")


There are 0 GPU(s) available.
No GPU available, using the CPU instead.


In [5]:
logging.info("Instanciating dataset and data processor...")
# Pick the dataset wrapper and the matching processor for the configured task.
if task in ['language_modeling']:
    data = LMDataset(task, parameters['dataset_name'].lower(), dataset_dir=parameters['dataset_dir'])
    processor = LMProcessor()
else:
    # Without this branch an unsupported task would leave `data`/`processor`
    # undefined (NameError in a later cell) or, in a long-lived kernel, silently
    # reuse objects left over from a previous run. Fail here with a clear message.
    raise ValueError(
        "Unsupported task: {!r} (supported tasks: 'language_modeling').".format(task)
    )
logging.info("\tDone.")



In [6]:
logging.info("Fetching data (training + validation) and parameters...")
data._fetch_dataset()

# The training and validation splits are always processed; the test split is
# processed only when the configuration enables `do_test`.
for set_type in ('train', 'dev'):
    data.process_dataset(set_type)
if parameters['do_test']:
    data.process_dataset('test')
logging.info("\tDone.")

# Announce the model/tokenizer download that the next cell performs.
logging.info(
    "Fetching pre-trained GPT-2 model: {} and Tokenizer: {} for the task: {}...".format(
        parameters['pretrained_model'],
        parameters['pretrained_tokenizer'],
        parameters['task'],
    )
)


100%|██████████| 135/135 [00:00<00:00, 403011.42it/s]
100%|██████████| 135/135 [00:00<00:00, 295373.52it/s]
100%|██████████| 176/176 [00:00<00:00, 430435.86it/s]
100%|██████████| 173/173 [00:00<00:00, 433720.62it/s]
100%|██████████| 177/177 [00:00<00:00, 356234.07it/s]
100%|██████████| 216/216 [00:00<00:00, 440004.69it/s]
100%|██████████| 196/196 [00:00<00:00, 345123.25it/s]
100%|██████████| 145/145 [00:00<00:00, 283135.05it/s]
100%|██████████| 207/207 [00:00<00:00, 533305.24it/s]
100%|██████████| 135/135 [00:00<00:00, 123066.95it/s]
100%|██████████| 135/135 [00:00<00:00, 416959.53it/s]
100%|██████████| 176/176 [00:00<00:00, 324197.41it/s]
100%|██████████| 173/173 [00:00<00:00, 554327.42it/s]
100%|██████████| 177/177 [00:00<00:00, 294483.07it/s]
100%|██████████| 216/216 [00:00<00:00, 378433.44it/s]
100%|██████████| 196/196 [00:00<00:00, 389613.07it/s]
100%|██████████| 145/145 [00:00<00:00, 366149.36it/s]
100%|██████████| 207/207 [00:00<00:00, 287490.37it/s]
100%|██████████| 135/135 [00

Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tok




In [7]:
# Load the pre-trained model that matches the configured task.
if task in ['language_modeling']:
    model = GPT2LMHeadModel.from_pretrained(
        parameters['pretrained_model'],
        output_attentions=parameters['output_attentions'],  # Whether the model returns attentions weights.
        output_hidden_states=parameters['output_hidden_states'],  # Whether the model returns all hidden-states.
    )
else:
    # Without this branch an unsupported task would leave `model` undefined
    # (NameError on model.to below) or, in a long-lived kernel, silently reuse
    # a model left over from a previous run. Fail here with a clear message.
    raise ValueError(
        "Unsupported task: {!r} (supported tasks: 'language_modeling').".format(task)
    )

# The tokenizer is loaded independently of the task branch above.
tokenizer = GPT2Tokenizer.from_pretrained(parameters['pretrained_tokenizer'])

# Place the model on the device selected earlier by get_device().
model.to(device)
logging.info("\tDone.")



In [8]:
logging.info("Get input examples...")

# Train and dev examples are always built from `data`.
train_examples = processor.get_train_examples(data)
dev_examples = processor.get_dev_examples(data)

# Test examples are built only when the configuration enables `do_test`.
if parameters['do_test']:
    test_examples = processor.get_test_examples(data)

logging.info("\tDone.")



In [9]:
 # Display the configured `max_length`; the same value is passed to
 # processor.convert_examples_to_features in the following cell.
 parameters['max_length']

64

In [10]:
logging.info("Get input features...")

# Every split is converted with the same `max_length` setting and tokenizer.
train_features = processor.convert_examples_to_features(
    train_examples,
    parameters['max_length'],
    tokenizer,
)
dev_features = processor.convert_examples_to_features(
    dev_examples,
    parameters['max_length'],
    tokenizer,
)

# Test features are built only when the configuration enables `do_test`.
if parameters['do_test']:
    test_features = processor.convert_examples_to_features(
        test_examples,
        parameters['max_length'],
        tokenizer,
    )

logging.info("\tDone.")
    


100%|██████████| 1560/1560 [00:06<00:00, 236.11it/s]
100%|██████████| 1560/1560 [00:06<00:00, 232.70it/s]
100%|██████████| 1560/1560 [00:06<00:00, 239.02it/s]


In [11]:
logging.info("Creating data loaders...")

# All loaders share the configured batch size and local rank; only the
# features and the `set_type` label differ between splits.
train_dataloader = processor.get_data_loader(
    train_features,
    batch_size=parameters['batch_size'],
    local_rank=parameters['local_rank'],
    set_type='train',
)
dev_dataloader = processor.get_data_loader(
    dev_features,
    batch_size=parameters['batch_size'],
    local_rank=parameters['local_rank'],
    set_type='dev',
)

# The test loader is built only when the configuration enables `do_test`.
if parameters['do_test']:
    test_dataloader = processor.get_data_loader(
        test_features,
        batch_size=parameters['batch_size'],
        local_rank=parameters['local_rank'],
        set_type='test',
    )

logging.info("\tDone.")



In [12]:
logging.info("Creating optimizer and learning rate scheduler...")

# The learning rate and epsilon are cast with float() before being handed to
# the optimizer.
optimizer = AdamW(
    model.parameters(),
    lr=float(parameters['learning_rate']),
    eps=float(parameters['adam_epsilon']),
)

# Total number of training steps is [nb epochs] x [nb batches].
total_steps = parameters['nb_epochs'] * len(train_dataloader)

# Linear schedule with the configured number of warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=parameters['num_warmup_steps'],
    num_training_steps=total_steps,
)
logging.info("\tDone.")

logging.info("Fine-tuning the model.")
# Bundle everything the training/evaluation calls in later cells rely on.
model_processor = ModelProcessor(
    model,
    optimizer,
    tokenizer,
    scheduler,
    device,
    parameters['metric_name'],
    parameters['nb_epochs'],
    parameters['use_output_mask'],
)


In [14]:
# Length of the training loader — the [nb batches] factor multiplied by
# `nb_epochs` to obtain `total_steps` for the scheduler.
len(train_dataloader)

49

In [13]:
# Run fine-tuning on the train/dev loaders, passing the configured output
# directory; the returned statistics are iterated for the validation reports
# and handed to the loss plot in the final cell.
training_stats = model_processor.train(
    train_dataloader,
    dev_dataloader,
    parameters['output_dir'],
)


0it [00:00, ?it/s]


Training...


5it [02:03, 24.80s/it]


KeyboardInterrupt: 

In [None]:
# --- Validation reports ------------------------------------------------------
logging.info("Validation reports: ")
for stat in training_stats:
    logging.info(stat['report'])

# --- Optional evaluation on the test split -----------------------------------
# The test metrics stay None unless `do_test` is enabled; they are passed to
# the plotting call below either way.
test_accuracy = None
test_loss = None
if parameters['do_test']:
    logging.info("Evaluation report: ")
    test_accuracy, test_loss, test_time, report = model_processor.evaluate(test_dataloader)
    logging.info(report)
logging.info("\tDone.")

# --- Save the fine-tuned model and tokenizer ---------------------------------
logging.info(
    "Saving fine-tuned model to {}...".format(
        os.path.join(parameters['output_dir'], 'fine_tuned')
    )
)
save(model, tokenizer, parameters['output_dir'], 'fine_tuned')
logging.info("\tDone.")

# --- Loss curves -------------------------------------------------------------
logging.info("Plotting training and validation losses...")
Report.plots_train_val_loss(
    training_stats,
    parameters['nb_epochs'],
    os.path.join(parameters['output_dir'], 'train_val_loss.png'),
    test_accuracy=test_accuracy,
    test_loss=test_loss,
)
logging.info("\tDone.")