In [10]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs



In [11]:
texts_train, tags_train = read_wnut('/home/phamson/data/test/train.txt')

In [12]:
texts_test, tags_test = read_wnut('/home/phamson/data/test/test.txt')

In [None]:
# Now that we’ve read the data in, let’s create a train/validation split:


# from sklearn.model_selection import train_test_split
# train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)


In [6]:
# Next, let’s create encodings for our tokens and tags. For the tags, we can start by just create a simple mapping which we’ll use in a moment:

unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}



<!-- To encode the tokens, we’ll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we’re dealing with ready-split tokens rather than full sentence strings by passing is_split_into_words=True. We’ll also pass padding=True and truncation=True to pad the sequences to be the same length. Lastly, we can tell the model to return information about the tokens which are split by the wordpiece tokenization process, which we will need in a moment. -->

In [8]:
import torch
from transformers import AutoModel, AutoTokenizer

phobert = AutoModel.from_pretrained("vinai/phobert-base")

# For transformers v4.x+: 
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
train_encodings = tokenizer(texts_train, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
test_encodings = tokenizer(texts_test, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

NotImplementedError: return_offset_mapping is not available when using Python tokenizers.To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast.

In [None]:
/home/phamson/data/test/train.txt
/home/phamson/data/test/test.txt

In [None]:
import sys

from transformers import AutoTokenizer


######
dataset = '/home/phamson/data/test/train.txt'
model_name_or_path = 'vinai/phobert-base'
max_len = 128
######


subword_len_counter = 0

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()

with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()

        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]

        current_subwords_len = len(tokenizer.tokenize(token))

        # Token contains strange control characters like \x96 or \x95
        # Just filter out the complete line
        if current_subwords_len == 0:
            continue

        if (subword_len_counter + current_subwords_len) > max_len:
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len

        print(line)

In [14]:
! wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"

# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# use tokenizer by phobert


--2021-04-23 16:33:17--  https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 991 [text/plain]
Saving to: ‘preprocess.py’


2021-04-23 16:33:17 (22,3 MB/s) - ‘preprocess.py’ saved [991/991]



In [17]:
! python preprocess.py /home/phamson/data/test/train.txt $BERT_MODEL $MAX_LENGTH > /home/phamson/data/test/train_.txt
! python preprocess.py /home/phamson/data/test/test.txt $BERT_MODEL $MAX_LENGTH > /home/phamson/data/test/test_.txt
! python preprocess.py /home/phamson/data/test/dev.txt $BERT_MODEL $MAX_LENGTH > /home/phamson/data/test/dev_.txt

2021-04-23 16:35:10.299392: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-04-23 16:35:10.299410: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2021-04-23 16:35:22.485085: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-04-23 16:35:22.485103: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned 

In [18]:
%env OUTPUT_DIR=roberta
%env BATCH_SIZE=32
%env NUM_EPOCHS=2
%env SAVE_STEPS=750
%env SEED=1

env: OUTPUT_DIR=roberta
env: BATCH_SIZE=32
env: NUM_EPOCHS=2
env: SAVE_STEPS=750
env: SEED=1


In [29]:
! python /kaggle/working/transformers/examples/ner/run_ner.py 
--data_dir /kaggle/working/JNLPBA/ \
--model_type roberta \
--labels /kaggle/working/JNLPBA/labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict \
--overwrite_output_dir \
--evaluate_during_training \
--logging_steps 4000

SyntaxError: invalid syntax (<ipython-input-29-3700890d762d>, line 3)

In [24]:
%env MAX_LENGTH=128
%env BERT_MODEL=vinai/phobert-base

env: MAX_LENGTH=128
env: BERT_MODEL=vinai/phobert-base


In [25]:
%env OUTPUT_DIR=phobert
%env BATCH_SIZE=32
%env NUM_EPOCHS=3
%env SAVE_STEPS=750
%env SEED=1

env: OUTPUT_DIR=phobert
env: BATCH_SIZE=32
env: NUM_EPOCHS=3
env: SAVE_STEPS=750
env: SEED=1


In [None]:
export MAX_LENGTH=128
export BERT_MODEL=vinai/phobert-base
export OUTPUT_DIR=phobert-ner
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1

In [28]:
!python3 'home/phamson/transformers/examples/legacy/token-classification/run_ner.py' \
--data_dir /home/phamson/data/tokenized/ \
--model_name_or_path 'vinai/phobert-base' \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict

SyntaxError: invalid syntax (<ipython-input-28-34472a24dd65>, line 3)