In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Splitting Raw Data**

In [18]:
import random
import numpy as np
import torch

# from raw_utils import set_seed

def set_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


def save_data(data, fpath):
    with open(fpath, 'w') as out:
        for instance in data:
            for token in instance:
                out.write(token)
            out.write("\n")


if __name__ == "__main__":

    set_seed(26092020)

    fpath = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO/final-data.txt"

    file = open(fpath)
    lines = file.readlines()
    file.close()

    data = []
    instance = []

    for l in lines:
        if l[:-1] == "":  # if it's empty
            data.append(instance)
            instance = []
        else:
            instance.append(l)

    random.shuffle(data)

    train_size = int(0.6 * len(data))

    train_fpath = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/raw_data/train.txt"
    test_fpath = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/raw_data/test.txt"
    dev_fpath = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/raw_data/dev.txt"

    save_data(data[:train_size], train_fpath)
    save_data(data[train_size:], test_fpath)
    save_data(data[train_size + 24:], dev_fpath)

In [19]:
! ls -lh /content/drive/MyDrive/Rearch_Dimas/NER-DATASET/raw_data

total 451K
-rw------- 1 root root 125K Jul 15 02:51 dev.txt
-rw------- 1 root root 131K Jul 15 02:51 test.txt
-rw------- 1 root root 195K Jul 15 02:51 train.txt


# **Splitting Mid Data**

In [20]:
import os
import re
import json

root_path = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/"

def preprocess(input_path, save_path, mode):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    data_path = os.path.join(save_path, mode + ".json")
    labels = set()
    result = []
    tmp = {}
    tmp['id'] = 0
    tmp['text'] = ''
    tmp['labels'] = []

    with open(input_path, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
        texts = []
        entities = []
        words = []
        entity_tmp = []
        entities_tmp = []
        for line in lines:
            line = line.strip().split("	")
            if len(line) == 2:
                word = line[0]
                label = line[1]
                words.append(word)

                if "B-" in label:
                    entity_tmp.append(word)
                    if (" ".join(entity_tmp), label.split("-")[-1]) not in entities_tmp:
                        entities_tmp.append(("".join(entity_tmp), label.split("-")[-1]))
                    labels.add(label.split("-")[-1])
                    entity_tmp = []

                elif "I-" in label:
                    entity_tmp.append(word)
                    if (" ".join(entity_tmp), label.split("-")[-1]) not in entities_tmp:
                        entities_tmp.append(("".join(entity_tmp), label.split("-")[-1]))
                    entity_tmp = []
                    labels.add(label.split("-")[-1])
            else:
                texts.append(" ".join(words))
                entities.append(entities_tmp)
                words = []
                entities_tmp = []

    i = 0
    for text, entity in zip(texts, entities):

        if entity:
            ltmp = []
            for ent, type in entity:
                for span in re.finditer(ent, text):
                    start = span.start()
                    end = span.end()
                    ltmp.append((type, start, end, ent))
                    # print(ltmp)
            ltmp = sorted(ltmp, key=lambda x: (x[1], x[2]))
            tmp['id'] = i
            tmp['text'] = text
            for j in range(len(ltmp)):
                tmp['labels'].append(["T{}".format(str(j)), ltmp[j][0], ltmp[j][1], ltmp[j][2], ltmp[j][3]])
        else:
            tmp['id'] = i
            tmp['text'] = text
            tmp['labels'] = []
        result.append(tmp)
        # print(i, text, entity, tmp)
        tmp = {}
        tmp['id'] = 0
        tmp['text'] = ''
        tmp['labels'] = []
        i += 1

    with open(data_path, 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(result, ensure_ascii=False))

    if mode == "train":
        label_path = os.path.join(save_path, "labels.json")
        with open(label_path, 'w', encoding='utf-8') as fp:
            fp.write(json.dumps(list(labels), ensure_ascii=False))


preprocess(root_path + "raw_data/train.txt", root_path + "mid_data", "train")
preprocess(root_path + "raw_data/dev.txt", root_path + "mid_data", "dev")
preprocess(root_path + "raw_data/test.txt", root_path + "mid_data", "test")

labels_path = os.path.join(root_path + "mid_data/labels.json")
with open(labels_path, 'r') as fp:
    labels = json.load(fp)

tmp_labels = []
tmp_labels.append('O')
for label in labels:
    tmp_labels.append('B-' + label)
    tmp_labels.append('I-' + label)

label2id = {}
for k, v in enumerate(tmp_labels):
    label2id[v] = k
path = root_path + "mid_data/"
if not os.path.exists(path):
    os.makedirs(path)
with open(os.path.join(path, "nor_ent2id.json"), 'w') as fp:
    fp.write(json.dumps(label2id, ensure_ascii=False))


In [21]:
! ls -lh /content/drive/MyDrive/Rearch_Dimas/NER-DATASET/mid_data

total 612K
-rw------- 1 root root 169K Jul 15 02:51 dev.json
-rw------- 1 root root   20 Jul 15 02:51 labels.json
-rw------- 1 root root   68 Jul 15 02:51 nor_ent2id.json
-rw------- 1 root root 177K Jul 15 02:51 test.json
-rw------- 1 root root 265K Jul 15 02:51 train.json
