In [1]:
import csv
import tensorflow as tf
import os
from bert import run_classifier, optimization, tokenization

In [2]:
class DataProcessor(object):
    """Base class for data process"""
    def get_train_examples(self, data_dir):
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        raise NotImplementedError()

    def get_labels(self, label):
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, file_path, quotechar=None):
        with tf.gfile.Open(file_path, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = set()
            for (i, line) in enumerate(reader):
                if i == 0:
                    continue
                lines.add(tuple(line))
            lines = list(lines)
        return lines

In [3]:
class CarFeedbackProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self, labels):
        return set(labels)

    def _create_examples(self, lines, set_type):
        examples = []
        labels = []
        labels_test = []
        for (i, line) in enumerate(lines):
            guid = set_type + "-" + str(i)
            text_a = tokenization.convert_to_unicode(line[0])
            label = tokenization.convert_to_unicode(line[1])
            labels.append(label)
            if set_type == "test":
                label = "0"
            labels_test.append(label)
            examples.append(
                run_classifier.InputExample(guid=guid,
                                            text_a=text_a,
                                            text_b=None,
                                            label=label))
        return examples, labels, labels_test

In [4]:
DATA_DIR = "./DATA_DIR"
OUTPUT_DIR = "./OUTPUT_DIR"
VOCAB_FILE = "./BERT_BASE_DIR/vocab.txt"
BERT_CONFIG_FILE = "./BERT_BASE_DIR/bert_config.json"
INIT_CHECKPOINT = "./BERT_BASE_DIR/bert_model.ckpt"
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 10.0
WARMUP_PROPORTION = 0.1
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 50

In [5]:
carFeedbackProcessor = CarFeedbackProcessor()

In [6]:
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=True)

In [7]:
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

In [8]:
train_examples,labels,_ = carFeedbackProcessor.get_train_examples(data_dir=DATA_DIR)

In [9]:
len(train_examples)

38991

In [10]:
labels = carFeedbackProcessor.get_labels(labels)

In [11]:
labels

{'driving',
 'exterior',
 'fuel economy',
 'interior',
 'performance',
 'safety',
 'space'}

In [12]:
test_examples,t_labels,t_labels_test = carFeedbackProcessor.get_test_examples(data_dir=DATA_DIR)

In [13]:
len(test_examples)

29978

In [15]:
t_labels = carFeedbackProcessor.get_labels(t_labels)

In [16]:
t_labels

{'driving',
 'exterior',
 'fuel economy',
 'interior',
 'performance',
 'safety',
 'space'}

In [17]:
t_labels_test = carFeedbackProcessor.get_labels(t_labels_test)

In [18]:
t_labels_test

{'0'}

In [20]:
train_file = os.path.join(DATA_DIR, "train.tf_record")

In [21]:
train_file

'./DATA_DIR/train.tf_record'

In [31]:
run_classifier.file_based_convert_examples_to_features(train_examples,labels,MAX_SEQ_LENGTH,tokenizer,train_file)

INFO:tensorflow:Writing example 0 of 38991
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] with either drive ##train flex ample power mission mind handling almost touches fun [SEP]
INFO:tensorflow:input_ids: 101 2007 2593 3298 23654 23951 20851 2373 3260 2568 8304 2471 12817 4569 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [32]:
test_file = os.path.join(DATA_DIR, "test.tf_record")

In [33]:
test_file

'./DATA_DIR/test.tf_record'

In [30]:
run_classifier.file_based_convert_examples_to_features(test_examples,t_labels_test,MAX_SEQ_LENGTH,tokenizer,test_file)

INFO:tensorflow:Writing example 0 of 29978
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-0
INFO:tensorflow:tokens: [CLS] the h excel ##s road ##ing thanks electronically controlled four wheel drive system road tuned traction control under ##body shielding optional rear locking differential buyer choice two tire packages good ##year inch terrain tires bridges ##tone inch road tires [SEP]
INFO:tensorflow:input_ids: 101 1996 1044 24970 2015 2346 2075 4283 28926 4758 2176 5217 3298 2291 2346 15757 16493 2491 2104 23684 25553 11887 4373 14889 11658 17634 3601 2048 12824 14555 2204 29100 4960 9291 13310 7346 5524 4960 2346 13310 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 