In [2]:
!pip install pytorch_pretrained_bert
!git clone https://github.com/BMaksim/classifocation-with-BERT

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 17.8MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.8MB/s eta 0:00:01[K     |████████                        | 30kB 2.6MB/s eta 0:00:01[K     |██████████▋                     | 40kB 1.7MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.9MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 2.3MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 2.5MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.8MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████

In [0]:
from __future__ import absolute_import, division, print_function

import csv
import os
import sys
import logging

logger = logging.getLogger()
csv.field_size_limit(2147483647) # Increase CSV reader's field limit incase we have long text.


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryProcessor(DataProcessor):
    """Processor for binary classification dataset."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_example_to_feature(example_row):
    # return example_row
    example, label_map, max_seq_length, tokenizer, output_mode = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)


In [0]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.nn import CrossEntropyLoss
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [7]:

batch_size = 24
epohs = 2
grad_accum = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = BinaryProcessor()
train_examples = processor.get_train_examples("classifocation-with-BERT/data/")
train_examples_len = len(train_examples)
label_list = processor.get_labels()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, 128, tokenizer, 'classification') for example in train_examples]
train_features = list(map(convert_example_to_feature, train_examples_for_processing))

with open("classifocation-with-BERT/data/" + "train_features.pkl", "wb") as f:
    pickle.dump(train_features, f)


model = BertForSequenceClassification.from_pretrained("bert-base-cased", cache_dir="cache/", num_labels=2)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr = 2e-5,
                     warmup = 0.1,
                     t_total = int(train_examples_len / batch_size / grad_accum) * epohs)

global_step = 0
nb_tr_steps = 0
tr_loss = 0

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)


train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

model.train()
for _ in range(epohs):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        logits = model(input_ids, segment_ids, input_mask, labels=None)
        loss_f = CrossEntropyLoss()
        loss = loss_f(logits.view(-1, 2), label_ids.view(-1))
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % grad_accum == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

model_to_save = model.module if hasattr(model, 'module') else model

torch.save(model_to_save.state_dict(), "classifocation-with-BERT/outputs/pytorch_model.bin")
model_to_save.config.to_json_file("classifocation-with-BERT/outputs/config.json")
tokenizer.save_vocabulary("classifocation-with-BERT/outputs/")

100%|██████████| 213450/213450 [00:00<00:00, 380994.17B/s]
100%|██████████| 404400730/404400730 [00:31<00:00, 12846959.85B/s]
Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


FileNotFoundError: ignored

In [0]:
#os.system("cd classifocation-with-BERT/\n")
#!ls classifocation-with-BERT/

convert_examples_to_features.py  outputs      text_to_tsv.py   tools.py
data				 __pycache__  text.txt	       Training.py
fff.py				 README.md    tokenisation.py


In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def report(labels, preds):
    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average="binary")
    f1 = f1_score(labels, preds, average="binary")
    precision = precision_score(labels, preds, average="binary")
    return {"accuracy": accuracy, "recall": recall, "f1": f1, "precision": precision}


tokenizer = BertTokenizer.from_pretrained('classifocation-with-BERT/outputs/vocab.txt', do_lower_case=False)

processor = BinaryProcessor()
eval_examples = processor.get_dev_examples("classifocation-with-BERT/data/")
label_list = processor.get_labels() 
num_labels = 2
eval_examples_len = len(eval_examples)

label_map = {label: i for i, label in enumerate(label_list)}
eval_examples_for_processing = [(example, label_map, 128, tokenizer, "classification") for example in eval_examples]

process_count = cpu_count() - 1
eval_features = list(map(convert_example_to_feature, eval_examples_for_processing))
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)


eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=8)

model = BertForSequenceClassification.from_pretrained("classifocation-with-BERT/outputs/bert.tar.gz", cache_dir="cache/", num_labels=2)
model.to(device)

model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = []

for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    
    loss_f = CrossEntropyLoss()
    loss = loss_f(logits.view(-1, num_labels), label_ids.view(-1))   
    eval_loss += loss.mean().item()
    nb_eval_steps += 1
    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
    else:
        preds[0] = np.append(
            preds[0], logits.detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
preds = preds[0]

preds = np.argmax(preds, axis=1)
result = report(all_label_ids.numpy(), preds)

print(result)




{'accuracy': 0.8629032258064516, 'recall': 0.8640776699029126, 'f1': 0.7970149253731345, 'precision': 0.739612188365651}


0