In [1]:
import csv
import tensorflow as tf
import os
from bert import run_classifier, optimization, tokenization
import pandas as pd
import random

In [2]:
def split_train_vail_test_data(file_path):
    with open(file_path, "r") as f:
        data = f.readlines()
        data_size = len(data) - 1
        title = data[:1]
        data = data[1:]
        random.shuffle(data)
        train_data = title + data[:int(data_size * 0.8)]
        test_data = title + data[int(data_size * 0.8):]
        validate_data = train_data[:int(len(train_data) * 0.2) + 1]
        print("total size: " + str(data_size))
        print("train size: " + str(len(train_data) - 1))
        print("validate size: " + str(len(validate_data) - 1))
        print("test size: " + str(len(test_data) - 1))
        folder_path = file_path[:file_path.rindex("/") + 1]
        with open(folder_path + "shuffle.csv", "w") as f:
            f.writelines(title + data)
        with open(folder_path + "train.csv", "w") as f:
            f.writelines(train_data)
        with open(folder_path + "test.csv", "w") as f:
            f.writelines(test_data)
        with open(folder_path + "validate.csv", "w") as f:
            f.writelines(validate_data)

In [3]:
# chinese data
split_train_vail_test_data("./data/car_reviews_final/chinese/car_reviews_cn.csv")

total size: 5578
train size: 4462
validate size: 892
test size: 1116


In [37]:
# english data
split_train_vail_test_data("./data/car_reviews_final/english/car_reviews_en.csv")

total size: 5578
train size: 4462
validate size: 892
test size: 1116


In [4]:
class DataProcessor(object):
    """Base class for data process"""
    def get_shuffle_examples(self, data_dir):
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        raise NotImplementedError()

    def get_validate_examples(self, data_dir):
        raise NotImplementedError()

    def get_test_examples(self, data_dir, test_label):
        raise NotImplementedError()

    def get_labels(self, label):
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, file_path, quotechar=None):
        with tf.gfile.Open(file_path, "r") as f:
            reader = csv.reader(f, delimiter=",", quotechar=quotechar)
            lines = []
            for (i, line) in enumerate(reader):
                if i == 0:
                    continue
                lines.append(line)
        return lines

In [5]:
class InputExampleProcessor(DataProcessor):
    def get_shuffle_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "shuffle.csv")), "train")

    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.csv")), "train")

    def get_validate_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "validate.csv")), "validate")

    def get_test_examples(self, data_dir, test_label):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.csv")), "test",
            test_label)

    def get_labels(self, labels):
        return list(set(labels))

    def _create_examples(self, lines, set_type, *test_label):
        examples = []
        labels = []
        labels_test = []
        for (i, line) in enumerate(lines):
            guid = set_type + "-" + str(i)
            label = tokenization.convert_to_unicode(line[0])
            text_a = tokenization.convert_to_unicode(line[1])
            labels.append(label)
            if set_type == "test":
                label = test_label
            labels_test.append(label)
            examples.append(
                run_classifier.InputExample(guid=guid,
                                            text_a=text_a,
                                            text_b=None,
                                            label=label))
        return examples, labels, labels_test

In [6]:
# chinese data process
DATA_DIR = "./data/car_reviews_final/chinese"
VOCAB_FILE = "./bert_pretrain_model/BERT_Base_Chinese/chinese_L-12_H-768_A-12/vocab.txt"
MAX_SEQ_LENGTH = 128

In [12]:
# english data process
DATA_DIR = "./data/car_reviews_final/english"
VOCAB_FILE = "./bert_pretrain_model/BERT_Base_Uncased/uncased_L-12_H-768_A-12/vocab.txt"
MAX_SEQ_LENGTH = 128

In [7]:
InputExampleProcessor = InputExampleProcessor()

In [8]:
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=True)

In [9]:
tokenizer.tokenize("空间很足很舒适，后排坐三个还有很大空隙，后备箱箱空间大，装东西非常方便，方便放，方便取，一点不受限。")

['空',
 '间',
 '很',
 '足',
 '很',
 '舒',
 '适',
 '，',
 '后',
 '排',
 '坐',
 '三',
 '个',
 '还',
 '有',
 '很',
 '大',
 '空',
 '隙',
 '，',
 '后',
 '备',
 '箱',
 '箱',
 '空',
 '间',
 '大',
 '，',
 '装',
 '东',
 '西',
 '非',
 '常',
 '方',
 '便',
 '，',
 '方',
 '便',
 '放',
 '，',
 '方',
 '便',
 '取',
 '，',
 '一',
 '点',
 '不',
 '受',
 '限',
 '。']

In [10]:
shuffle_examples, labels, _ = InputExampleProcessor.get_shuffle_examples(
    data_dir=DATA_DIR)

In [11]:
len(shuffle_examples)

5578

In [12]:
labels = InputExampleProcessor.get_labels(labels=labels)

In [13]:
labels

['外观', '操控', '动力', '安全辅助', '空间', '能耗', '内饰']

In [14]:
shuffle_file = os.path.join(DATA_DIR, "shuffle.tf_record")

In [15]:
shuffle_file

'./data/car_reviews_final/chinese/shuffle.tf_record'

In [16]:
run_classifier.file_based_convert_examples_to_features(shuffle_examples,labels,MAX_SEQ_LENGTH,tokenizer,shuffle_file)

INFO:tensorflow:Writing example 0 of 5578
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] 磨 合 期 刚 开 始 17 个 油 ， 现 在 100 公 里 15 个 油 ， 主 要 是 不 太 在 意 油 耗 高 不 高 ， 不 是 太 离 谱 就 行 。 [SEP]
INFO:tensorflow:input_ids: 101 4836 1394 3309 1157 2458 1993 8126 702 3779 8024 4385 1762 8135 1062 7027 8115 702 3779 8024 712 6206 3221 679 1922 1762 2692 3779 5450 7770 679 7770 8024 679 3221 1922 4895 6480 2218 6121 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [8]:
train_examples, labels, _ = InputExampleProcessor.get_train_examples(
    data_dir=DATA_DIR)

In [9]:
len(train_examples)

4462

In [10]:
labels = InputExampleProcessor.get_labels(labels)

In [16]:
labels

['车身外观', '空间', '动力/加速', '操控', '内饰', '安全辅助', '能耗']

In [12]:
test_examples,t_labels,t_labels_test = InputExampleProcessor.get_test_examples(data_dir=DATA_DIR,test_label=labels[0])

In [13]:
len(test_examples)

1116

In [14]:
t_labels = InputExampleProcessor.get_labels(t_labels)

In [15]:
t_labels

['control', 'interior', 'power', 'energy', 'appearance', 'safety', 'space']

In [16]:
t_labels_test = InputExampleProcessor.get_labels(t_labels_test)

In [17]:
t_labels_test

['control']

In [25]:
validate_examples, v_labels, v_labels_test = InputExampleProcessor.get_validate_examples(data_dir=DATA_DIR)

In [26]:
len(validate_examples)

892

In [27]:
v_labels = InputExampleProcessor.get_labels(v_labels)

In [28]:
v_labels

['control', 'interior', 'power', 'appearance', 'safety', 'energy', 'space']

In [29]:
v_labels_test = InputExampleProcessor.get_labels(v_labels_test)

In [30]:
v_labels_test

['control', 'interior', 'power', 'appearance', 'safety', 'energy', 'space']

In [18]:
train_file = os.path.join(DATA_DIR, "train.tf_record")

In [19]:
train_file

'./data/car_reviews_final/train.tf_record'

In [20]:
run_classifier.file_based_convert_examples_to_features(train_examples,labels,MAX_SEQ_LENGTH,tokenizer,train_file)

INFO:tensorflow:Writing example 0 of 4462
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] basic enough . but the rear middle bulge is too high to affect the comfort of the rear passengers [SEP]
INFO:tensorflow:input_ids: 101 3937 2438 1012 2021 1996 4373 2690 23708 2003 2205 2152 2000 7461 1996 7216 1997 1996 4373 5467 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [21]:
test_file = os.path.join(DATA_DIR, "test.tf_record")

In [22]:
test_file

'./data/car_reviews_final/test.tf_record'

In [23]:
run_classifier.file_based_convert_examples_to_features(test_examples,t_labels_test,MAX_SEQ_LENGTH,tokenizer,test_file)

INFO:tensorflow:Writing example 0 of 1116
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-0
INFO:tensorflow:tokens: [CLS] work ##manship is not bad . only limited to the cost of plastic . but it is also law - ab ##idi ##ng . the car imitation leather seat is soft and tough . sitting comfortably and looking at the grade . [SEP]
INFO:tensorflow:input_ids: 101 2147 21530 2003 2025 2919 1012 2069 3132 2000 1996 3465 1997 6081 1012 2021 2009 2003 2036 2375 1011 11113 28173 3070 1012 1996 2482 20017 5898 2835 2003 3730 1998 7823 1012 3564 18579 1998 2559 2012 1996 3694 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [31]:
validate_file = os.path.join(DATA_DIR, "validate.tf_record")

In [32]:
validate_file

'./data/car_reviews_final/validate.tf_record'

In [33]:
run_classifier.file_based_convert_examples_to_features(validate_examples,v_labels,MAX_SEQ_LENGTH,tokenizer,validate_file)

INFO:tensorflow:Writing example 0 of 892
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: validate-0
INFO:tensorflow:tokens: [CLS] basic enough . but the rear middle bulge is too high to affect the comfort of the rear passengers [SEP]
INFO:tensorflow:input_ids: 101 3937 2438 1012 2021 1996 4373 2690 23708 2003 2205 2152 2000 7461 1996 7216 1997 1996 4373 5467 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 