In [1]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import io
import os
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [3]:
## Set seed of randomization and working device
manual_seed = 77
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)

print(torch.cuda.get_device_name(0))

cuda
Tesla P100-PCIE-16GB


In [4]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.7MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 77.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.7MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K    

In [0]:
from transformers import *

In [0]:
# Transformers has a unified API
# here we list models for 10 transformer architectures
# for the full list of available pretrained-models: go to https://huggingface.co/transformers/pretrained_models.html
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (CTRLModel,       CTRLTokenizer,       'ctrl'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]
         
# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]

## Data Preparation

In [0]:
# define a function for data preparation
def data_prepare(file_path, tokenizer, lab2ind, max_len = 32, mode = 'train'):
    '''
    file_path: the path to input file. 
                In train mode, the input must be a csv file that includes two columns where the first is text, and second column is label.
                The first row must be header of columns.

                In predict mode, the input must be a tsv file that includes only one column where the first is text.
                The first row must be header of column.

    lab2ind: dictionary of label classes
    tokenizer: BERT tokenizer
    max_len: maximal length of input sequence
    mode: train or predict
    '''
    # if we are in train mode, we will load two columns (i.e., text and label).
    if mode == 'train':
        # Use pandas to load dataset
        df = pd.read_csv(file_path, header=0, names=['content','label'])
        print("Data size ", df.shape)
        labels = df.label.values
        
        # Create sentence and label lists
        labels = [lab2ind[label] for label in labels] 
        print("Label is ", labels[0])
        
        # Convert data into torch tensors
        labels = torch.tensor(labels)

    # if we are in predict mode, we will load one column (i.e., text).
    elif mode == 'predict':
        df = pd.read_csv(file_path, header=0, names=['content'])
        print("Data size ", df.shape)
        # create placeholder
        labels = []
    else:
        print("the type of mode should be either 'train' or 'predict'. ")
        return
        
    # Create sentence and label lists
    content = df.content.values

    # We need to add a special token at the beginning for BERT to work properly.
    content = ["[CLS] " + text for text in content]

    # Import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.
    tokenized_texts = [tokenizer.tokenize(text) for text in content]

    
    # if the sequence is longer the maximal length, we truncate it to the pre-defined maximal length
    tokenized_texts = [ text[:max_len+1] for text in tokenized_texts]

    # We also need to add a special token at the end.
    tokenized_texts = [ text+['[SEP]'] for text in tokenized_texts]
    print ("Tokenize the first sentence:\n",tokenized_texts[0])
    
    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    print ("Index numbers of the first sentence:\n",input_ids[0])

    # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(input_ids, maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
    # input_ids = pad_sequences(input_ids, dtype="long", truncating="post", padding="post", value=pad_ind)
    print ("Index numbers of the first sentence after padding:\n",input_ids[0])

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for pad tokens
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, labels, masks

In [0]:
# eco_news_pd = pd.read_json('./drive/My Drive/datasetEconomyNews_PN.json')
# train, valid = train_test_split(eco_news_pd, test_size=0.2)

In [0]:
# Don't run this block!
#def json_data_prepare(dataset, tokenizer, lab2ind, mode = 'train'):
#    '''
#    file_path: the path to input file. 
#            In train mode, the input must be a csv file that includes two columns where the first is text, and second column is label.
#            The first row must be header of columns.
#
#            In predict mode, the input must be a tsv file that includes only one column where the first is text.
#            The first row must be header of column.
#
#    lab2ind: dictionary of label classes
#    tokenizer: BERT tokenizer
#    max_len: maximal length of input sequence
#    mode: train or predict
#    '''
#
#    if mode == 'train':
#        #print("Data size ", dataset.shape)
#        labels = dataset.classification.values
#        
#
#        # Create sentence and label lists
#        labels = [lab2ind[label] for label in labels] 
#         
#
#        print("Label is ", labels[0])
#        # Convert data into torch tensors
#
#        labels = torch.tensor(labels)
#    




    # Create sentence and label lists
    content = dataset.headlineText.values
  

    # We need to add a special token at the beginning for BERT to work properly.
    content = ["[CLS] " + text for text in content]
  

    # Import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.
    tokenized_texts = [tokenizer.tokenize(text) for text in content]

    
    # if the sequence is longer the maximal length, we truncate it to the pre-defined maximal length
    #tokenized_texts = [ text[:max_len+1] for text in tokenized_texts]

    # We also need to add a special token at the end.
    tokenized_texts = [ text+['[SEP]'] for text in tokenized_texts]
    print ("Tokenize the first sentence:\n",tokenized_texts[0])
    
    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    print ("Index numbers of the first sentence:\n",input_ids[0])

    # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(input_ids, dtype="long", truncating="post", padding="post", value=pad_ind)
    # input_ids = pad_sequences(input_ids, dtype="long", truncating="post", padding="post", value=pad_ind)
    print ("Index numbers of the first sentence after padding:\n",input_ids[0])

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for pad tokens
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, labels, masks





In [9]:
model_path = "bert-large-uncased"
# define label to number dictionary
lab2ind = {-1: 0, 1: 1}

# tokenizer from pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased',do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
#train_inputs, train_labels, train_masks = json_data_prepare(train, tokenizer, lab2ind)
#validation_inputs, validation_labels, validation_masks = json_data_prepare(valid, tokenizer, lab2ind)

In [11]:
# Use defined funtion to extract data
train_inputs, train_labels, train_masks = data_prepare('./drive/My Drive/finance_news_data/finan_news_train.csv', tokenizer, lab2ind)
validation_inputs, validation_labels, validation_masks = data_prepare('./drive/My Drive/finance_news_data/finan_news_dev.csv', tokenizer, lab2ind)

Data size  (786, 2)
Label is  0
Tokenize the first sentence:
 ['[CLS]', '"', 'u', '.', 's', '.', 'economic', 'growth', 'has', 'sharply', 'dec', '##ele', '##rated', 'since', 'early', 'december', '.', 'in', 'the', 'current', 'macro', 'environment', ',', 'we', 'recommend', 'investors', 'own', 'stocks', '.', '[SEP]']
Index numbers of the first sentence:
 [101, 1000, 1057, 1012, 1055, 1012, 3171, 3930, 2038, 9249, 11703, 12260, 9250, 2144, 2220, 2285, 1012, 1999, 1996, 2783, 26632, 4044, 1010, 2057, 16755, 9387, 2219, 15768, 1012, 102]
Index numbers of the first sentence after padding:
 [  101  1000  1057  1012  1055  1012  3171  3930  2038  9249 11703 12260
  9250  2144  2220  2285  1012  1999  1996  2783 26632  4044  1010  2057
 16755  9387  2219 15768  1012   102     0     0     0     0]
Data size  (169, 2)
Label is  1
Tokenize the first sentence:
 ['[CLS]', 'or', '2', ')', 'it', "'", 's', 'a', 'sign', 'of', 'a', 'strengthening', 'economy', ',', 'which', 'means', 'inflation', 'will', 'pi

In [12]:
train_inputs.shape

torch.Size([786, 34])

In [0]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, 
                              sampler = RandomSampler(train_data), # Select batches randomly
                              batch_size=batch_size)

# We'll just read validation set sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, 
                                   sampler = SequentialSampler(validation_data), # Pull out batches sequentially.
                                   batch_size=batch_size)

## Loading pre-trained model

In [15]:
model_path = "bert-large-uncased"

bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True).to(device)

HBox(children=(IntProgress(value=0, description='Downloading', max=362, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=1344997306, style=ProgressStyle(description…




In [0]:
dataiter = iter(train_dataloader)
batch = dataiter.next()
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader
input_ids, input_mask, labels = batch

In [0]:
last_hidden_state, pooler_output, hidden_states, attentions = bert_model(input_ids, attention_mask = input_mask)

In [18]:
last_hidden_state.shape

torch.Size([32, 34, 1024])

In [19]:
pooler_output.shape

torch.Size([32, 1024])

## Creating `Bert_cls` class

In [0]:
class Bert_cls(nn.Module):
    def __init__(self, model_path, hidden_size):
        super(Bert_cls, self).__init__()
        self.model_path = model_path
        self.hidden_size = hidden_size
        self.bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
        self.label_num = 2
        self.fc = nn.Linear(self.hidden_size, self.label_num)
    def forward(self, bert_ids, bert_mask):
        last_hidden_state, pooler_output, hidden_states, attentions = self.bert_model(input_ids=bert_ids)
        fc_output = self.fc(pooler_output)
        return fc_output, attentions

In [0]:
bert_model = Bert_cls('bert-large-uncased', 1024).to(device)

## Optimizer and Learning Rate Scheduler

In [0]:
# Parameters:
lr = 2e-5
max_grad_norm = 1.0
epochs = 3
warmup_proportion = 0.1
num_training_steps  = len(train_dataloader) * epochs
num_warmup_steps = num_training_steps * warmup_proportion

### In Transformers, optimizer and schedules are instantiated like this:
# Note: AdamW is a class from the huggingface library
# the 'W' stands for 'Weight Decay"
optimizer = AdamW(bert_model.parameters(), lr=lr, correct_bias=False)
# schedules
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

# We use nn.CrossEntropyLoss() as our loss function. 
criterion = nn.CrossEntropyLoss()

# Model training

In [0]:
def train(model, iterator, optimizer, scheduler, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        input_ids, input_mask, labels = batch

        outputs,_ = model(input_ids, input_mask)

        loss = criterion(outputs, labels)
        # delete used variables to free GPU memory
        del batch, input_ids, input_mask, labels
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.cpu().item()
        optimizer.zero_grad()
    
    # free GPU memory
    if device == 'cuda':
        torch.cuda.empty_cache()

    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            input_ids, input_mask, labels = batch

            outputs,_ = model(input_ids, input_mask)
            
            loss = criterion(outputs, labels)

            # delete used variables to free GPU memory
            del batch, input_ids, input_mask
            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(labels.cpu())
    
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return epoch_loss / len(iterator), accuracy, f1score

In [25]:
# Train the model
loss_list = []
acc_list = []

for epoch in trange(epochs, desc="Epoch"):
    train_loss = train(bert_model, train_dataloader, optimizer, scheduler, criterion)  
    val_loss, val_acc, val_f1 = evaluate(bert_model, validation_dataloader, criterion)

    # Create checkpoint at end of each epoch
    state = {
        'epoch': epoch,
        'state_dict': bert_model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
        }

    torch.save(state, "./drive/My Drive/finance_news_data/ckpt_BERT/BERT_"+str(epoch+1)+".pt")

    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, epochs, train_loss, val_loss, val_acc, val_f1))
    

Epoch:  33%|███▎      | 1/3 [01:20<02:41, 80.80s/it]


 Epoch [1/3], Train Loss: 0.6676, Validation Loss: 0.6043, Validation Accuracy: 0.6686, Validation F1: 0.5818


Epoch:  67%|██████▋   | 2/3 [02:57<01:25, 85.72s/it]


 Epoch [2/3], Train Loss: 0.4597, Validation Loss: 0.4812, Validation Accuracy: 0.8225, Validation F1: 0.8163


Epoch: 100%|██████████| 3/3 [04:34<00:00, 91.34s/it]


 Epoch [3/3], Train Loss: 0.2552, Validation Loss: 0.5350, Validation Accuracy: 0.8166, Validation F1: 0.8068



