In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

phobert = AutoModel.from_pretrained("vinai/phobert-base")

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)



INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:

# # For transformers v3.x: 
# # tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

# # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
# line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

# input_ids = torch.tensor([tokenizer.encode(line)])

# with torch.no_grad():
#     features = phobert(input_ids)  # Models outputs are now tuples

In [3]:
line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

input_ids = torch.tensor([tokenizer.encode(line)])


In [4]:
input_ids

tensor([[   0,  218,    8,  649,  212,  956, 2413,    5,    2]])

In [None]:
from __future__ import absolute_import, division, print_function

import argparse
import csv
import json
import logging
import os
import random
import sys

import numpy as np
import torch
import torch.nn.functional as F
from pytorch_transformers import (WEIGHTS_NAME, AdamW, BertConfig,
                                  BertForTokenClassification, BertTokenizer,
                                  WarmupLinearSchedule)
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from seqeval.metrics import classification_report

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

class Ner(BertForTokenClassification):

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,valid_ids=None,attention_mask_label=None):
        sequence_output = self.bert(input_ids, token_type_ids, attention_mask,head_mask=None)[0]
        batch_size,max_len,feat_dim = sequence_output.shape
        valid_output = torch.zeros(batch_size,max_len,feat_dim,dtype=torch.float32,device='cuda')
        for i in range(batch_size):
            jj = -1
            for j in range(max_len):
                    if valid_ids[i][j].item() == 1:
                        jj += 1
                        valid_output[i][jj] = sequence_output[i][j]
        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            # Only keep active parts of the loss
            #attention_mask_label = None
            if attention_mask_label is not None:
                active_loss = attention_mask_label.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, valid_ids=None, label_mask=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.valid_ids = valid_ids
        self.label_mask = label_mask

def readfile(filename):
    '''
    read file
    '''
    f = open(filename)
    data = []
    sentence = []
    label= []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:
                data.append((sentence,label))
                sentence = []
                label = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) >0:
        data.append((sentence,label))
        sentence = []
        label = []
    return data

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        return readfile(input_file)


class NerProcessor(DataProcessor):
    """Processor for the CoNLL-2003 data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.txt")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "valid.txt")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.txt")), "test")

    def get_labels(self):
        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"]

    def _create_examples(self,lines,set_type):
        examples = []
        for i,(sentence,label) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = ' '.join(sentence)
            text_b = None
            label = label
            examples.append(InputExample(guid=guid,text_a=text_a,text_b=text_b,label=label))
        return examples

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list,1)}

    features = []
    for (ex_index,example) in enumerate(examples):
        textlist = example.text_a.split(' ')
        labellist = example.label
        tokens = []
        labels = []
        valid = []
        label_mask = []
        for i, word in enumerate(textlist):
            token = tokenizer.tokenize(word)
            tokens.extend(token)
            label_1 = labellist[i]
            for m in range(len(token)):
                if m == 0:
                    labels.append(label_1)
                    valid.append(1)
                    label_mask.append(1)
                else:
                    valid.append(0)
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]
            valid = valid[0:(max_seq_length - 2)]
            label_mask = label_mask[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")
        segment_ids.append(0)
        valid.insert(0,1)
        label_mask.insert(0,1)
        label_ids.append(label_map["[CLS]"])
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
            if len(labels) > i:
                label_ids.append(label_map[labels[i]])
        ntokens.append("[SEP]")
        segment_ids.append(0)
        valid.append(1)
        label_mask.append(1)
        label_ids.append(label_map["[SEP]"])
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)
        label_mask = [1] * len(label_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)
            valid.append(1)
            label_mask.append(0)
        while len(label_ids) < max_seq_length:
            label_ids.append(0)
            label_mask.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(valid) == max_seq_length
        assert len(label_mask) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            # logger.info("label: %s (id = %d)" % (example.label, label_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_ids,
                              valid_ids=valid,
                              label_mask=label_mask))
    return features

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on",
                        default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner":NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model,
              from_tf = False,
              config = config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i : label for i, label in enumerate(label_list,1)}
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i : label for i, label in enumerate(label_list,1)}
        model_config = {"bert_model":args.bert_model,"do_lower":args.do_lower_case,"max_seq_length":args.max_seq_length,"num_labels":len(label_list)+1,"label_map":label_map}
        json.dump(model_config,open(os.path.join(args.output_dir,"model_config.json"),"w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i : label for i, label in enumerate(label_list,1)}
        for input_ids, input_mask, segment_ids, label_ids,valid_ids,l_mask in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask,valid_ids=valid_ids,attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j,m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred,digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)


if __name__ == "__main__":
    main()


In [None]:
python run_ner.py --data_dir=data/ --bert_model=bert-base-cased --task_name=ner --output_dir=out_base --max_seq_length=128 --do_train --num_train_epochs 5 --do_eval --warmup_proportion=0.1

In [None]:
Let’s define some variables that we need for further pre-processing steps and training the model:

MAX_LENGTH = 120 #@param {type: "integer"}
MODEL = "chriskhanhtran/spanberta" #@param ["chriskhanhtran/spanberta", "bert-base-multilingual-cased"]

The script below will split sentences longer than MAX_LENGTH (in terms of tokens) into small ones. Otherwise, long sentences will be truncated when tokenized, causing the loss of training data and some tokens in the test set not being predicted.

%%capture
!wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"

!python3 preprocess.py train_temp.txt $MODEL $MAX_LENGTH > train.txt
!python3 preprocess.py dev_temp.txt $MODEL $MAX_LENGTH > dev.txt
!python3 preprocess.py test_temp.txt $MODEL $MAX_LENGTH > test.txt

# If your dataset has different labels or more labels than CoNLL-2002/2003 datasets, run the line below to get unique labels from your data and save them into labels.txt. This file will be used when we start fine-tuning our model.

!cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt


# training hyperparameters
MAX_LENGTH = 128 #@param {type: "integer"}
MODEL = "chriskhanhtran/spanberta" #@param ["chriskhanhtran/spanberta", "bert-base-multilingual-cased"]
OUTPUT_DIR = "spanberta-ner" #@param ["spanberta-ner", "bert-base-ml-ner"]
BATCH_SIZE = 32 #@param {type: "integer"}
NUM_EPOCHS = 3 #@param {type: "integer"}
SAVE_STEPS = 100 #@param {type: "integer"}
LOGGING_STEPS = 100 #@param {type: "integer"}
SEED = 42 #@param {type: "integer"}


!python3 run_ner.py \
  --data_dir ./ \
  --model_type bert \
  --labels ./labels.txt \
  --model_name_or_path $MODEL \
  --output_dir $OUTPUT_DIR \
  --max_seq_length  $MAX_LENGTH \
  --num_train_epochs $NUM_EPOCHS \
  --per_gpu_train_batch_size $BATCH_SIZE \
  --save_steps $SAVE_STEPS \
  --logging_steps $LOGGING_STEPS \
  --seed $SEED \
  --do_train \
  --do_eval \
  --do_predict \
  --overwrite_output_dir

In [None]:
python run_ner.py --data_dir=data/vlsp16 --bert_model='/home/phamson/Desktop/phobert-base-135' --task_name=ner --output_dir=out_base --max_seq_length=128 --do_train --num_train_epochs 5 --do_eval --warmup_proportion=0.1

In [None]:
python run_ner.py \
  --model_name_or_path vinai/phobert-base \
  --dataset_name '/home/phamson/transformers/examples/token-classification/vlsp16' \
  --output_dir '/home/phamson/ner-phobert' \
  --do_train \
  --do_eval

In [5]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files = '/home/phamson/transformers/examples/token-classification/vlsp16', delimiter = '\t')

Using custom data configuration default-a396e420bd2e0c4c


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/phamson/.cache/huggingface/datasets/csv/default-a396e420bd2e0c4c/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


IsADirectoryError: [Errno 21] Is a directory: '/home/phamson/transformers/examples/token-classification/vlsp16'

In [39]:
import pandas as pd

In [40]:
import csv

In [41]:
vlsp_train = '/home/phamson/Downloads/VLSP2016-NER/train.txt'
vlsp_test = '/home/phamson/Downloads/VLSP2016-NER/test.txt'
vlsp_dev = '/home/phamson/Downloads/VLSP2016-NER/dev.txt'

In [42]:
v_train = '/home/phamson/Downloads/VLSP2016-NER/train_ner.csv'
v_test = '/home/phamson/Downloads/VLSP2016-NER/test_ner.csv'
v_dev = '/home/phamson/Downloads/VLSP2016-NER/dev_ner.csv'

In [43]:
train = pd.read_csv(vlsp_train, delimiter = '\t', header = None, quotechar="'")
test = pd.read_csv(vlsp_test, delimiter = '\t', header = None, quotechar="'")
dev = pd.read_csv(vlsp_dev, delimiter = '\t', header = None, quotechar="'")

In [14]:
# df = pd.read_csv(vlsp, delimiter = '\t', header=None)

In [24]:
# vlsp_ner = df.drop(df.columns[[2,1]], axis = 1)

In [26]:
# df.to_csv('/home/phamson/Downloads/VLSP2016-NER/train_ner.txt', sep='\t', columns=[0,3], header=False, index=False)

In [44]:
train.to_csv('/home/phamson/Downloads/VLSP2016-NER/train_ner.csv', sep='\t', columns=[0,3], header=False, index=False)
test.to_csv('/home/phamson/Downloads/VLSP2016-NER/test_ner.csv', sep='\t', columns=[0,3], header=False, index=False)
dev.to_csv('/home/phamson/Downloads/VLSP2016-NER/dev_ner.csv', sep='\t', columns=[0,3], header=False, index=False)

In [None]:
with open('/home/phamson/Downloads/VLSP2016-NER/train.txt') as file:
    csv_reader = csv.reader(file, delimiter='\t')
    

In [None]:
import csv
with open(vlsp_train,"rb") as source:
    rdr= csv.reader( source )
    with open(v_train,"wb") as result:
        wtr= csv.writer( result )
        for r in rdr:
            wtr.writerow( (r[0], r[1], r[3], r[4]) )

In [None]:
task = 'ner'
model_checkpoint = ''
batch_size = 16


In [33]:
!python run_ner.py \
  --model_name_or_path vinai/phobert-base \
  --train_file /home/phamson/Downloads/VLSP2016-NER/ner/train.csv \
  --validation_file /home/phamson/Downloads/VLSP2016-NER/ner/dev.csv \
  --output_dir /output \
  --do_train \
  --do_eval

python: can't open file '/home/phamson/jupyter-notebook/run_ner.py': [Errno 2] No such file or directory


In [39]:
def preprocess_conll(text, sep="\t"):
    """
    Converts data in CoNLL format to word and label lists.
    Args:
        text (str): Text string in conll format, e.g.
            "Amy B-PER
             ADAMS I-PER
             works O
             at O
             the O
             University B-ORG
             of I-ORG
             Minnesota I-ORG
             . O"
        sep (str, optional): Column separator
            Defaults to \t
    Returns:
        tuple:
            (list of word lists, list of token label lists)
    """
    text_list = text.split("\n\n")
    if text_list[-1] in (" ", ""):
        text_list = text_list[:-1]

    max_seq_len = 0
    sentence_list = []
    labels_list = []
    for s in text_list:
        # split each sentence string into "word label" pairs
        s_split = s.split("\n")
        # split "word label" pairs
        s_split_split = [t.split(sep) for t in s_split]
        sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
        labels_list.append([t[1] for t in s_split_split if len(t) > 1])
        if len(s_split_split) > max_seq_len:
            max_seq_len = len(s_split_split)
    print("Maximum sequence length is: {0}".format(max_seq_len))
    return sentence_list, labels_list


def read_conll_file(file_path, sep="\t", encoding=None):
    """
    Reads a data file in CoNLL format and returns word and label lists.
    Args:
        file_path (str): Data file path.
        sep (str, optional): Column separator. Defaults to "\t".
        encoding (str): File encoding used when reading the file.
            Defaults to None.
    Returns:
        (list, list): A tuple of word and label lists (list of lists).
    """
    with open(file_path, encoding=encoding) as f:
        data = f.read()
    return preprocess_conll(data, sep=sep)

In [41]:
read_conll_file('/home/phamson/transformers/examples/token-classification/vlsp16/dev.csv')

Maximum sequence length is: 45706


([['Người',
   'cầm',
   'thư',
   'đã',
   'chết',
   'còn',
   'người',
   'nhận',
   'thư',
   'thì',
   'bị',
   'bắt',
   '!',
   '',
   'Biết_bao_nhiêu',
   'bà',
   'mẹ',
   'như',
   'mẹ',
   'Đường',
   'sẽ',
   'còn',
   'đau_khổ',
   'khóc_than',
   'đến',
   'cạn',
   'dòng',
   'nước_mắt',
   '.',
   '',
   'Ôi',
   'nếu',
   'mình',
   'ngã',
   'xuống',
   ',',
   'mẹ',
   'mình',
   'cũng',
   'sẽ',
   'như',
   'bà',
   'mẹ',
   'ấy',
   'thôi',
   ',',
   'cũng',
   'sẽ',
   'là',
   'một',
   'bà',
   'mẹ',
   'suốt',
   'đời',
   'hi_sinh',
   'vì',
   'con',
   'để',
   'rồi',
   'mãi_mãi',
   'đau_xót',
   'vì',
   'con',
   'mình',
   'đã',
   'ngã',
   'xuống',
   'nơi',
   'chiến_trường',
   'khói_lửa',
   '.',
   '',
   'Mẹ',
   'ơi',
   '!',
   '',
   'Con',
   'biết',
   'nói',
   'sao',
   'khi',
   'lòng',
   'con',
   'thương',
   'mẹ',
   'trăm',
   'nghìn',
   'triệu',
   'mà',
   'cũng',
   'đành',
   'xa',
   'mẹ',
   'ra',
   'đi',
   '.',
   '',
   

In [None]:
python run_ner.py --data_dir=data/vlsp16 --bert_model='/home/phamson/Desktop/phobert-base-135' --task_name=ner --output_dir=out_base --max_seq_length=128 --do_train --num_train_epochs 5 --do_eval --warmup_proportion=0.1

In [None]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification
  
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-xlm-r-ner-40-lang")

model = TFAutoModelForTokenClassification.from_pretrained("jplu/tf-xlm-r-ner-40-lang")

In [42]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

NameError: name 'trainer' is not defined

In [2]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP(address="http://127.0.0.1", port=9000) 

# Input 
text = "Bán các lô biệt thự đẳng cấp dự án Louis City Hoàng Mai - khu đô thị gần phố cổ tại Hà Nội. \
Diện tích đất: 270m2 - 290m2, lô góc 300m2 - 310m2. \
Diện tích xây dựng: 65%. \
Xây 4 tầng 1 tum, 1 hầm để xe. \
Hướng chính: Đông, Tây, Nam, Bắc. \
Vị trí lô góc, 2 mặt tiền và một mặt view công viên trung tâm vườn hoa. \
Đường vào 40m, đường 22,5m và mặt đường 13,5m. \
Thiết kế theo phong cách tân cổ điển, phong cách kiến trúc kiểu Ý. \
Vị trí mặt đường Tân Mai đi lại vô cùng thuận tiện. \
Gần 2 hồ lớn: Hồ Đền Lừ, hồ Yên Sở và công viên Yên Sở. \
Có đầy đủ các tiện ích cao cấp: Trường học quốc tế liên cấp 1 - 2 - 3, trường mầm non, trường trung học cơ sở, trung tâm thương mại, bãi đỗ xe, công viên, bể bơi trong nhà... \
Giá đất: 89 triệu/m2. \
Giá xây dựng: 7,2 triệu/m2. \
Chính sách bán hàng: Chiết khấu 7%, vay ngân hàng 0% trong 18 tháng."


In [28]:
ner = annotator.ner(text)
result = []
for sentence in ner:
    entity = ''
    for key, value in sentence:
#         if value == "O":
#             continue
#         else if va:
#             entity = key
#             entity += key
#         result.append(key)
        if value.startswith('B'):
            entity = key
            continue
        if value.startswith('I') and entity != '':
            entity += key
        if entity != '' and entity not in result:
            result.append(entity)


In [29]:
result

['Hà_Nội',
 'Nam',
 'Bắc',
 'đườngTân_Mai',
 'HồĐền_Lừ',
 'hồYên_Sở',
 'công_viênYên_Sở']

In [None]:
keys = ['B-LOC', 'I-LOC']

for sentence in ner:
    for key, value in sentence:
        myDictionary.get(key)

In [72]:
ner

[[('Bán', 'O'),
  ('các', 'O'),
  ('lô', 'O'),
  ('biệt_thự', 'O'),
  ('đẳng_cấp', 'O'),
  ('dự_án', 'O'),
  ('Louis_City_Hoàng_Mai', 'O'),
  ('-', 'O'),
  ('khu', 'O'),
  ('đô_thị', 'O'),
  ('gần', 'O'),
  ('phố', 'O'),
  ('cổ', 'O'),
  ('tại', 'O'),
  ('Hà_Nội', 'B-LOC'),
  ('.', 'O')],
 [('Diện_tích', 'O'),
  ('đất', 'O'),
  (':', 'O'),
  ('270m2', 'O'),
  ('-', 'O'),
  ('290m2', 'O'),
  (',', 'O'),
  ('lô', 'O'),
  ('góc', 'O'),
  ('300m2', 'O'),
  ('-', 'O'),
  ('310', 'O'),
  ('m2', 'O'),
  ('.', 'O')],
 [('Diện_tích', 'O'), ('xây_dựng', 'O'), (':', 'O')],
 [('65%', 'O'),
  ('.', 'O'),
  ('Xây', 'O'),
  ('4', 'O'),
  ('tầng', 'O'),
  ('1', 'O'),
  ('tum', 'O'),
  (',', 'O'),
  ('1', 'O'),
  ('hầm', 'O'),
  ('để', 'O'),
  ('xe', 'O'),
  ('.', 'O')],
 [('Hướng', 'O'), ('chính', 'O'), (':', 'O')],
 [('Đông', 'O'),
  (',', 'O'),
  ('Tây', 'O'),
  (',', 'O'),
  ('Nam', 'B-LOC'),
  (',', 'O'),
  ('Bắc', 'B-LOC'),
  ('.', 'O')],
 [('Vị_trí', 'O'),
  ('lô', 'O'),
  ('góc', 'O'),
  (',', 

In [80]:
result

['đườngTân_Mai', 'HồĐền_Lừ', 'hồYên_Sở', 'công_viênYên_Sở']

In [55]:
annotator.ner(text)

[[('Ông', 'O'),
  ('Nguyễn_Khắc_Chúc', 'B-PER'),
  ('đang', 'O'),
  ('làm_việc', 'O'),
  ('tại', 'O'),
  ('Đại_học', 'B-ORG'),
  ('Quốc_gia', 'I-ORG'),
  ('Hà_Nội', 'I-ORG'),
  ('.', 'O')],
 [('Bà', 'O'),
  ('Lan', 'B-PER'),
  (',', 'O'),
  ('vợ', 'O'),
  ('ông', 'O'),
  ('Chúc', 'B-PER'),
  (',', 'O'),
  ('cũng', 'O'),
  ('làm_việc', 'O'),
  ('tại', 'O'),
  ('đây', 'O'),
  ('.', 'O')]]

In [53]:
import logging

from vncorenlp import VnCoreNLP


def simple_usage():
    # Uncomment this line for debugging
    # logging.basicConfig(level=logging.DEBUG)

    vncorenlp_file = r'/home/phamson/VnCoreNLP/VnCoreNLP-1.1.1.jar'

    sentences = 'VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. ' \
                'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.'

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        print('Tokenizing:', vncorenlp.tokenize(sentences))
        print('POS Tagging:', vncorenlp.pos_tag(sentences))
        print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
        print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
        print('Annotating:', vncorenlp.annotate(sentences))
        print('Language:', vncorenlp.detect_language(sentences))

    # In this way, you have to close the server manually by calling close function
    vncorenlp = VnCoreNLP(vncorenlp_file)

    print('Tokenizing:', vncorenlp.tokenize(sentences))
    print('POS Tagging:', vncorenlp.pos_tag(sentences))
    print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
    print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
    print('Annotating:', vncorenlp.annotate(sentences))
    print('Language:', vncorenlp.detect_language(sentences))

    # Do not forget to close the server
    vncorenlp.close()


if __name__ == '__main__':
    simple_usage()

Tokenizing: [['VTV', 'đồng_ý', 'chia_sẻ', 'bản_quyền', 'World_Cup', '2018', 'cho', 'HTV', 'để', 'khai_thác', '.'], ['Nhưng', 'cả', 'hai', 'nhà', 'đài', 'đều', 'phải', 'chờ', 'sự', 'đồng_ý', 'của', 'FIFA', 'mới', 'thực_hiện', 'được', 'điều', 'này', '.']]
POS Tagging: [[('VTV', 'Ny'), ('đồng_ý', 'V'), ('chia_sẻ', 'V'), ('bản_quyền', 'N'), ('World_Cup', 'N'), ('2018', 'M'), ('cho', 'E'), ('HTV', 'Ny'), ('để', 'E'), ('khai_thác', 'V'), ('.', 'CH')], [('Nhưng', 'C'), ('cả', 'P'), ('hai', 'M'), ('nhà', 'N'), ('đài', 'N'), ('đều', 'R'), ('phải', 'V'), ('chờ', 'V'), ('sự', 'Nc'), ('đồng_ý', 'V'), ('của', 'E'), ('FIFA', 'Np'), ('mới', 'R'), ('thực_hiện', 'V'), ('được', 'R'), ('điều', 'N'), ('này', 'P'), ('.', 'CH')]]
Named-Entity Recognizing: [[('VTV', 'B-ORG'), ('đồng_ý', 'O'), ('chia_sẻ', 'O'), ('bản_quyền', 'O'), ('World_Cup', 'O'), ('2018', 'O'), ('cho', 'O'), ('HTV', 'O'), ('để', 'O'), ('khai_thác', 'O'), ('.', 'O')], [('Nhưng', 'O'), ('cả', 'O'), ('hai', 'O'), ('nhà', 'O'), ('đài', 'O'), 

Language: vi


In [49]:
annotator.ner(text) 

AssertionError: Please ensure that the annotators "wseg,pos,ner" are being used on the server.

In [45]:
annotated_text

{'sentences': [[{'index': 1, 'form': 'Ông', 'nerLabel': 'O', 'head': -1},
   {'index': 2, 'form': 'Nguyễn_Khắc_Chúc', 'nerLabel': 'B-PER', 'head': -1},
   {'index': 3, 'form': 'đang', 'nerLabel': 'O', 'head': -1},
   {'index': 4, 'form': 'làm_việc', 'nerLabel': 'O', 'head': -1},
   {'index': 5, 'form': 'tại', 'nerLabel': 'O', 'head': -1},
   {'index': 6, 'form': 'Đại_học', 'nerLabel': 'B-ORG', 'head': -1},
   {'index': 7, 'form': 'Quốc_gia', 'nerLabel': 'I-ORG', 'head': -1},
   {'index': 8, 'form': 'Hà_Nội', 'nerLabel': 'I-ORG', 'head': -1},
   {'index': 9, 'form': '.', 'nerLabel': 'O', 'head': -1}],
  [{'index': 1, 'form': 'Bà', 'nerLabel': 'O', 'head': -1},
   {'index': 2, 'form': 'Lan', 'nerLabel': 'B-PER', 'head': -1},
   {'index': 3, 'form': ',', 'nerLabel': 'O', 'head': -1},
   {'index': 4, 'form': 'vợ', 'nerLabel': 'O', 'head': -1},
   {'index': 5, 'form': 'ông', 'nerLabel': 'O', 'head': -1},
   {'index': 6, 'form': 'Chúc', 'nerLabel': 'O', 'head': -1},
   {'index': 7, 'form': '

In [7]:
from datasets import load_dataset

In [None]:
dataset = load_dataset('csv', datat_files={'train':['']})

In [31]:
import csv

In [32]:
file ='file:///home/phamson/transformers/examples/token-classification/vlsp16/dev.csv'


In [35]:
csv.reader(file, quotechar="'", delimiter='\t')

<_csv.reader at 0x7feacbf4ef20>

In [None]:
pd.read_csv(file, )

In [45]:
import sys

from transformers import AutoTokenizer

dataset = sys.argv[1]
model_name_or_path = sys.argv[2]
max_len = int(sys.argv[3])

subword_len_counter = 0

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()

with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()

        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]

        current_subwords_len = len(tokenizer.tokenize(token))

        # Token contains strange control characters like \x96 or \x95
        # Just filter out the complete line
        if current_subwords_len == 0:
            continue

        if (subword_len_counter + current_subwords_len) > max_len:
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len

        print(line)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


IndexError: list index out of range

In [7]:
# cat '/home/phamson/data/train.txt' | cut -f 1,4 > train.txt
# cat '/home/phamson/data/test.txt' | cut -f 1,4 > test.txt
# cat '/home/phamson/data/dev.txt' | cut -f 1,4 > dev.txt

cut: the delimiter must be a single character
Try 'cut --help' for more information.
cat: write error: Broken pipe
cut: the delimiter must be a single character
Try 'cut --help' for more information.
cat: write error: Broken pipe
cut: the delimiter must be a single character
Try 'cut --help' for more information.
cat: write error: Broken pipe


In [3]:
from datasets import load_dataset

In [8]:
dataset = load_dataset('csv', delimiter='\t', data_files={'train': '/home/phamson/data/test/train.txt','test': '/home/phamson/data/test/test.txt'})


Using custom data configuration default-b0bfa6f8e8895292


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/phamson/.cache/huggingface/datasets/csv/default-b0bfa6f8e8895292/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /home/phamson/.cache/huggingface/datasets/csv/default-b0bfa6f8e8895292/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


ValueError: External features info don't match the dataset:
Got
{'Chị': Value(dtype='string', id=None), 'O': Value(dtype='string', id=None)}
with type
struct<Chị: string, O: string>

but expected something like
{'Đó': Value(dtype='string', id=None), 'O': Value(dtype='string', id=None)}
with type
struct<O: string, Đó: string>

In [11]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('/home/phamson/data/test/train.txt')

In [13]:
print(texts[0][10:17], tags[0][10:17], sep='\n')

['Ấn_Độ_Dương', 'sang', 'Thái_Bình_Dương', ',', 'chiếm', 'đến', 'lượng']
['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O']
