In [4]:
import logging
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import NERProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TokenClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

In [8]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")

set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 4
batch_size = 32
evaluate_every = 400
lang_model = "electra-base"
do_lower_case = False




 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


09/03/2020 17:43:53 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


In [10]:


# 1.Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="google/electra-base-discriminator", do_lower_case=do_lower_case
)



09/03/2020 17:45:53 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'ElectraTokenizer'
09/03/2020 17:46:02 - INFO - filelock -   Lock 5701365432 acquired on /Users/subir/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
09/03/2020 17:46:02 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt not found in cache or force_download set to True, downloading to /Users/subir/.cache/torch/transformers/tmpdif7gc0k


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

09/03/2020 17:46:05 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt in cache at /Users/subir/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
09/03/2020 17:46:05 - INFO - transformers.file_utils -   creating metadata file for /Users/subir/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
09/03/2020 17:46:05 - INFO - filelock -   Lock 5701365432 released on /Users/subir/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
09/03/2020 17:46:05 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discri




In [15]:

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

processor = NERProcessor(
    tokenizer=tokenizer, max_seq_len=128, data_dir=Path("data//finetuning_data//chunk"), delimiter=" ", metric="seq_f1", label_list=ner_labels
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1, distributed=False)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

09/03/2020 17:57:19 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
09/03/2020 17:57:19 - INFO - farm.data_handler.data_silo -   Loading train set from: data/finetuning_data/chunk/train.txt 
09/03/2020 17:57:19 - INFO - farm.data_handler.data_silo -   Multiprocessing disabled, using a single worker to convert 8323dictionaries to pytorch datasets.
Preprocessing Dataset data/finetuning_data/chunk/train.txt:   0%|          | 0/8323 [00:00<?, ? Dicts/s]09/03/2020 17:57:36 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
09/03/2020 17:57:36 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
 

Preprocessing Dataset data/finetuning_data/chunk/train.txt:  20%|██        | 1665/8323 [00:16<01:07, 98.50 Dicts/s]
09/03/2020 17:57:36 - INFO - farm.data_handler.data_silo -   Loading dev set from: data/finetuning_data/chunk/dev.txt
09/03/2020 17:57:36 - INFO - farm.data_handler.data_silo -   Multiprocessing disabled, using a single worker to convert 1915dictionaries to pytorch datasets.
Preprocessing Dataset data/finetuning_data/chunk/dev.txt:   0%|          | 0/1915 [00:00<?, ? Dicts/s]09/03/2020 17:57:39 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
09/03/2020 17:57:39 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                 

Preprocessing Dataset data/finetuning_data/chunk/test.txt:  20%|██        | 304/1517 [00:02<00:11, 101.34 Dicts/s]
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Examples in train: 8323
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Examples in dev  : 1915
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Examples in test : 1517
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     128
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 48.31575153189956
09/03/2020 17:57:42 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.005046257359125316


OSError: Can't load config for 'electra-base'. Make sure that:

- 'electra-base' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'electra-base' is the correct path to a directory containing a config.json file



In [7]:


    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)


    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Where do Mr smith want to go for lunch?"},
        {"text": "Martin muller is a poet from  Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)

    model.close_multiprocessing_pool()


if __name__ == "__main__":
    ner()


 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


09/03/2020 17:35:18 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
09/03/2020 17:35:18 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
09/03/2020 17:35:19 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/subir/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
09/03/2020 17:35:20 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
09/03/2020 17:35:20 - INFO - farm.data_handler.data_silo -   Loading train set from: data/finetuning_data/chunk/train.txt 
09/03/2020 17:35:21 - INFO - farm.data_handler.data

Preprocessing Dataset data/finetuning_data/chunk/train.txt: 100%|██████████| 8323/8323 [00:08<00:00, 1016.77 Dicts/s]
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -   Loading dev set from: data/finetuning_data/chunk/dev.txt
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -   Got ya 3 parallel workers to convert 1915 dictionaries to pytorch datasets (chunksize = 128)...
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -    0    0    0 
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -   /'\  / \  / \
09/03/2020 17:35:29 - INFO - farm.data_handler.data_silo -       
Preprocessing Dataset data/finetuning_data/chunk/dev.txt:   0%|          | 0/1915 [00:00<?, ? Dicts/s]09/03/2020 17:35:29 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
09/03/2020 17:35:29 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .

09/03/2020 17:35:32 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 38-0
Clear Text: 
 	text: En toda Castilla-La Mancha la pérdida superaría los 8.000 millones tras haberse incrementado el gasóleo agrícola desde 38 pesetas el litro en enero de 1999 a 66 pesetas en la actualidad .
 	ner_label: ['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokenized: 
 	tokens: ['[UNK]', 'tod', '##a', '[UNK]', '-', '[UNK]', '[UNK]', 'la',

Preprocessing Dataset data/finetuning_data/chunk/test.txt: 100%|██████████| 1517/1517 [00:01<00:00, 779.19 Dicts/s]
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Examples in train: 8323
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Examples in dev  : 1915
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Examples in test : 1517
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     128
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 48.31575153189956
09/03/2020 17:35:33 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.005046257359125316
09/03/2020 17:35:37 - INFO - filelock -   Lock 5702682888 acquired on /Users/subir/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeeb

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…

09/03/2020 17:38:11 - INFO - transformers.file_utils -   storing https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin in cache at /Users/subir/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
09/03/2020 17:38:11 - INFO - transformers.file_utils -   creating metadata file for /Users/subir/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
09/03/2020 17:38:11 - INFO - filelock -   Lock 5702682888 released on /Users/subir/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
09/03/2020 17:38:11 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /Users/subir/.cache/torch/transform




09/03/2020 17:38:13 - INFO - transformers.modeling_utils -   All model checkpoint weights were used when initializing BertModel.

09/03/2020 17:38:13 - INFO - transformers.modeling_utils -   All the weights of BertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertModel for predictions without further training.
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
09/03/2020 17:38:13 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 13]
09/03/2020 17:38:27 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 1e-05}'
09/03/2020 17:38:29 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
09/03/2020 17:38:29 - INFO - farm.modeling.optimization -   Loading sched

KeyboardInterrupt: 