In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Helper

In [2]:
import string
import re


def remove_hex(text):
    """
    Remove Hex
    Example: 
    "\xe3\x80\x90Ramadan\xe3\x80\x91Dompet wanita multi-fungsi gesper dompet multi-card"
    """
    res = []
    i = 0
    while i < len(text):
        if text[i] == "\\" and i + 1 < len(text) and text[i + 1] == "x":
            i += 3
            res.append(" ")
        else:
            res.append(text[i])
        i += 1
    return "".join(res)


def remove_multiple_whitespace(text):
    """
    remove multiple whitespace
    it covers tabs and newlines also
    """
    return re.sub(' +', ' ', text.replace('\n', ' ').replace('\t', ' ')).strip()


def remove_punctuation(text):
    """
    Removing punctuations
    """
    return re.sub(r'[^\w\s]', r' ', text)


def remove_space_between_quantity(text):
    """
    200 ml -> 200ml
    3 kg -> 3kg
    200 x 200 -> 200x200
    3 in 1 -> 3in1
    Example: "Double Tape DOUBLE FOAM TAPE 55 mm 45 m 45 makan   2000 x 2000 scs"
    """
    text = re.sub(r"([1-9][0-9]*)(in|inch|INCH|Inch|In)( |$)", r'\1inch ', text)
    text = re.sub(r"([1-9][0-9]*)(m|meter|M|METER|Meter)( |$)", r'\1m ', text)
    text = re.sub(r"([1-9][0-9]*)(mm|milimeter|MM|MILIMETER|Mm)( |$)", r'\1mm ', text)
    text = re.sub(r"([1-9][0-9]*)(cm|centimeter|CENTIMETER|CM|Cm)( |$)", r'\1ccm ', text)
    text = re.sub(r"([1-9][0-9]*)(pc|pcs|potong|pasang|Pasang|PCS|PC|Pc|Pcs)( |$)", r'\1pcs ', text)
    text = re.sub(r"([1-9][0-9]*)(y|year|thn|tahun|Year|Tahun)( |$)", r'\1tahun ', text)
    text = re.sub(r"([1-9][0-9]*)(k|kilo|Kilo|kg|kilogram|KG|Kg|Kilogram)( |$)", r'\1kg ', text)
    text = re.sub(r"([1-9][0-9]*)(g|gr|gram|G|Gr|GR|GRAM|Gram)( |$)", r'\1gr ', text)
    text = re.sub(r"([1-9][0-9]*)(l|liter|L|Liter|LITER)( |$)", r'\1l ', text)
    text = re.sub(r"([1-9][0-9]*)(ml|mililiter|ML|mL|Ml)( |$)", r'\1ml ', text)
    text = re.sub(r"([1-9][0-9]*) (in|inch|INCH|Inch|In)( |$)", r'\1inch ', text)
    text = re.sub(r"([1-9][0-9]*) (m|meter|M|METER|Meter)( |$)", r'\1m ', text)
    text = re.sub(r"([1-9][0-9]*) (mm|milimeter|MM|MILIMETER|Mm)( |$)", r'\1mm ', text)
    text = re.sub(r"([1-9][0-9]*) (cm|centimeter|CENTIMETER|CM|Cm)( |$)", r'\1ccm ', text)
    text = re.sub(r"([1-9][0-9]*) (pc|pcs|potong|pasang|Pasang|PCS|PC|Pc|Pcs)( |$)", r'\1pcs ', text)
    text = re.sub(r"([1-9][0-9]*) (y|year|thn|tahun|Year|Tahun)( |$)", r'\1tahun ', text)
    text = re.sub(r"([1-9][0-9]*) (k|kilo|Kilo|kg|kilogram|KG|Kg|Kilogram)( |$)", r'\1kg ', text)
    text = re.sub(r"([1-9][0-9]*) (g|gr|gram|G|Gr|GR|GRAM|Gram)( |$)", r'\1gr ', text)
    text = re.sub(r"([1-9][0-9]*) (l|liter|L|Liter|LITER)( |$)", r'\1l ', text)
    text = re.sub(r"([1-9][0-9]*) (ml|mililiter|ML|mL|Ml)( |$)", r'\1ml ', text)

    text = re.sub(r"([1-9][0-9]*) (yard|set|lembar|tablet|kaplet|buah|box|sachet|pasang|gb|watt)( |$)", r'\1\2 ', text)

    text = re.sub(r"([1-9][0-9]*) (x) ([1-9][0-9]*)", r'\1x\3', text)
    text = re.sub(r"([1-9][0-9]*) (in) ([1-9][0-9]*)", r'\1in\3', text)
    return text

Raw Utils

In [3]:
import random
import numpy as np
import torch


def set_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

Preprocess

In [4]:
import os
import sys
import re

# from helper import remove_hex, remove_multiple_whitespace


def convert_raw_to_enimex(input_file: str, output_file: str):
    with open(input_file, 'r', encoding="ascii", errors='ignore') as f, open(output_file, 'w') as out:
        lines = f.readlines()
        i = 0
        while i < len(lines):
            line = lines[i][:-1]
            while i < len(lines) and line[-1] != "\"":
                i += 1
                if i < len(lines):
                    line += lines[i][:-1]
            i += 1
            # print(i)
            line = line[1:-1]
            line = line.replace("<", " <").replace(">", "> ")
            line = line.replace("\n", "")
            line = line.replace("	", "")
            line = remove_multiple_whitespace(line)
            line = remove_hex(line)
            line = remove_space_between_quantity(line)
            line = "\"" + line + "\""
            line += '\n'
            out.write(line)


def convert_enimex_to_stanford(input_file: str, output_file: str):
    '''
    Convert ENAMEX Named-Entity annotated file to Stanford NLP format (token-based)
    @Author research.dimas@gmail
    ENAMEX example:
    Studies on magnesium\'s mechanism of action in <ENAMEX TYPE="plant">digitalis</ENAMEX> -induced <ENAMEX TYPE="disease">arrhythmias</ENAMEX> .
    '''

    START_PATTERN = re.compile(r'^(.*?)<ENAMEX$', re.I)
    END_SINGLE_PATTERN = re.compile(r'^TYPE="(.*?)">(.*?)</ENAMEX>(.*?)$', re.I)
    TYPE_PATTERN = re.compile(r'^TYPE="(.*?)">(.*?)$', re.I)
    END_MULTI_PATTERN = re.compile(r'^(.*?)</ENAMEX>(.*?)$', re.I)
    EOS_PATTERN = re.compile(r'^([^<>]*)\.?	(\d+)$', re.I)
    NON_ENTITY_TYPE = 'O'

    def check_and_process_eos(token):
        match = re.match(EOS_PATTERN, token)
        if match:
            out.write(match.group(1) + '	' + cur_type + '\n')
            out.write('.' + '	' + cur_type + '\n')
            out.write('\n')
            return True
        return False

    cur_type = NON_ENTITY_TYPE
    # print(infile)
    with open(input_file, 'r', encoding="ascii", errors='ignore') as f, open(output_file, 'w') as out:
        lines = f.readlines()
        i = 0
        while i < len(lines):
            line = lines[i][:-1]
            i += 1
            line = remove_multiple_whitespace(line)
            for token in line.strip().split(' '):
                token = token.strip()
                if not token:
                    continue

                match = re.match(START_PATTERN, token)
                if match:
                    if match.group(1):
                        out.write(match.group(1) + '	' +
                                  NON_ENTITY_TYPE + '\n')
                    continue

                match = re.match(END_SINGLE_PATTERN, token)
                if match:
                    out.write(match.group(2) + '	' + match.group(1) + '\n')
                    cur_type = NON_ENTITY_TYPE
                    if not check_and_process_eos(match.group(3)):
                        out.write(match.group(3) + '	' + cur_type + '\n')
                    continue

                match = re.match(TYPE_PATTERN, token)
                if match:
                    cur_type = match.group(1)
                    out.write(match.group(2) + '	' + cur_type + '\n')
                    continue

                match = re.match(END_MULTI_PATTERN, token)
                if match:
                    out.write(match.group(1) + '	' + cur_type + '\n')
                    cur_type = NON_ENTITY_TYPE
                    if not check_and_process_eos(match.group(2)):
                        out.write(match.group(2) + '	' + cur_type + '\n')
                    continue

                if check_and_process_eos(token):
                    continue

                out.write(token + '	' + cur_type + '\n')


def convert_stanford_to_bio(input_file: str, output_file: str):
    '''
    Convert ENAMEX Named-Entity annotated file to Stanford NLP format (token-based)
    @Author research.dimas@gmail
    ENAMEX example (2 sentences):
    Studies on magnesium\'s mechanism of action in <ENAMEX TYPE="plant">digitalis</ENAMEX> -induced <ENAMEX TYPE="disease">arrhythmias</ENAMEX> .
    '''

    NON_ENTITY_TYPE = 'O'

    cur_type = NON_ENTITY_TYPE
    with open(input_file, 'r', encoding="ascii", errors='ignore') as f, open(output_file, 'w') as out:
        prev = None
        prev_dot = False  # avoid printing double dot
        is_last = False
        for line in f.readlines():
            tokens = line.split('	')
            token, cur_type = tokens[0], tokens[1][:-1]
            if not token or token == "":
                continue

            if len(token) > 2 and token[0] == "\"" and token[-1] == "\"":
                token = token[1:-1]
            elif len(token) > 1 and token[0] == "\"":
                token = token[1:]
                out.write('\n')
            elif len(token) > 1 and token[-1] == "\"":
                token = token[:-1]

            if token == "\"":
                if not prev_dot:
                    out.write("." + '	' + NON_ENTITY_TYPE + '\n')
                    prev_dot = True
                    out.write('\n')
                prev = None
            else:
                token = token.lower()
                if token[-1] == "\"":
                    token = token[:-1]
                    is_last = True

                if cur_type == NON_ENTITY_TYPE:
                    out.write(token + '	' + cur_type + '\n')
                else:
                    if not prev:
                        out.write(token + '	B-' + cur_type + '\n')
                    else:
                        if prev == cur_type:
                            out.write(token + '	I-' + cur_type + '\n')
                        else:
                            out.write(token + '	B-' + cur_type + '\n')
                prev = cur_type
                prev_dot = False

                if is_last:
                    prev = None
                    if not prev_dot:
                        out.write("." + '	' + NON_ENTITY_TYPE + '\n')
                        prev_dot = True
                        out.write('\n')
                    is_last = False


def filter_bio(input_file: str, output_file: str):
    def filter(s):
        res = []
        for token in s[:-1]:  # unfilter last sentence
            word = token.split("	")[0]
            tag = token.split("	")[1]
            word = remove_punctuation(word)
            word = remove_multiple_whitespace(word)

            if word != "":
                res.append(word + "	" + tag)
        return "".join(res)

    with open(input_file, 'r', encoding="ascii", errors='ignore') as f, open(output_file, 'w') as out:
        l = 0
        s = []
        for line in f.readlines():
            if line[:-1] == "":
                if l > 3:
                    s = filter(s)
                    out.write(s)
                    out.write("\n")
                l = 0
                s = []
            else:
                l += 1
                s.append(line)


if __name__ == "__main__":
    input_file = "/content/drive/MyDrive/Rearch_Dimasataset-plant-disease-corpus.txt"
    output_file = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO/enimex.txt"

    convert_raw_to_enimex(input_file=input_file, output_file=output_file)

    input_file = output_file
    output_file = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO/stanford.txt"

    convert_enimex_to_stanford(input_file=input_file, output_file=output_file)

    input_file = output_file
    output_file = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO/BIO.txt"

    convert_stanford_to_bio(input_file=input_file, output_file=output_file)

    input_file = output_file
    output_file = "/content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO/final-data.txt"

    filter_bio(input_file=input_file, output_file=output_file)

In [5]:
! ls -lh /content/drive/MyDrive/Rearch_Dimas/NER-DATASET/BIO

total 1.7M
-rw------- 1 root root 336K Jul 15 02:39 annotated-dataset-plant-disease-corpus.txt
-rw------- 1 root root 344K Jul 15 02:42 BIO.txt
-rw------- 1 root root 342K Jul 15 02:42 enimex.txt
-rw------- 1 root root 326K Jul 15 02:42 final-data.txt
-rw------- 1 root root 393K Jul 15 02:42 stanford.txt
