# Tweet Sentiment Classification with BERT using the FARM Framework (adapted from a BBC article genre classification notebook)

## Setup

In [None]:
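# Uncomment to install FARM (pinned to version 0.4.3) if it is not already available in this environment.
# !pip install farm==0.4.3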

In [None]:
# Clone the bbc_news repository; the inferencing cell at the end of this notebook
# reads its sample articles from bbc_news/generated_data/inferencing/.
!git clone https://github.com/guggio/bbc_news

In [67]:
from transformers import AutoTokenizer
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import MultiLabelTextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
import logging
import pandas as pd

In [68]:
# FARM can log parameters and metrics of a run. Here the MLflow logger is pointed at
# deepset's public tracking server; the results show up on https://public-mlflow.deepset.ai/
# NOTE(review): this is a public server, so the experiment and its metrics are presumably
# visible to others — confirm that this is acceptable for this data.

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Crypto_Tweet_Bert", run_name="Crypto Tweet BERT")


 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


In [69]:
# Fix the seed (42) so that runs are repeatable.
set_all_seeds(seed=42)
# CUDA is requested here; the log output of this cell reports the device that was actually selected.
device, n_gpu = initialize_device_settings(use_cuda=True)
# Number of training epochs; passed to initialize_optimizer and Trainer below.
n_epochs = 2
# Batch size; passed to DataSilo below.
batch_size = 8
# Passed to Trainer as evaluate_every below.
evaluate_every = 100

08/17/2021 17:51:09 - INFO - farm.utils -   Using device: CPU 
08/17/2021 17:51:09 - INFO - farm.utils -   Number of GPUs: 0
08/17/2021 17:51:09 - INFO - farm.utils -   Distributed Training: False
08/17/2021 17:51:09 - INFO - farm.utils -   Automatic Mixed Precision: None


## Building our own blocks

### Tokenizer

In [70]:
# Name of the pretrained model; reused below when the language model is loaded.
lang_model = "bert-base-cased"
# The model is cased, so lower-casing is switched off.
do_lower_case = False

# Load the tokenizer for the chosen pretrained model.
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

# Disabled alternative, kept for reference: BERTweet model name and tokenizer.
# lang_model = "vinai/bertweet-base"
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)


08/17/2021 17:51:11 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'


### Data Processor

In [71]:
# Directory that holds train.csv and test.csv.
# NOTE(review): absolute, machine-specific path — the notebook only runs on this machine as-is;
# consider a path relative to the repository (save_dir further down is already relative).
data_dir = "/Users/edwarddavies/Git/Trade_with_Twitter/data/fine_tune_bert"
label_list = ['0', '2', '4'] # labels in our data set
metric = "f1_macro" # desired metric for evaluation

# The processor describes how the CSV files are read and converted into model input.
# NOTE(review): multilabel=True is set although the label column is called "sentiment";
# presumably each tweet carries exactly one of the labels above — confirm that a multi-label
# setup (together with the multi-label prediction head used below) is intended.
# NOTE(review): the DataSilo log in this notebook reports a longest observed sequence of 341
# tokens and an average of about 24, so a smaller max_seq_len may be sufficient — confirm.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=512, # BERT can only handle sequence lengths of up to 512
                                            data_dir=data_dir,
                                            label_list=label_list,
                                            label_column_name="sentiment", # our labels are located in the "sentiment" column
                                            text_column_name="tweet", # the text to classify is located in the "tweet" column
                                            metric=metric,
                                            quote_char='"',
                                            multilabel=True,
                                            delimiter=',',
                                            train_filename="train.csv",
                                            dev_filename=None, # no separate dev file; see dev_split
                                            test_filename="test.csv",
                                            dev_split=0.1 # this will extract 10% of the train set to create a dev set
                                            )



In [72]:
# Build the data silo from the processor. It loads the train, dev and test sets (see the
# log output of this cell) and provides the data loaders used further down.
data_silo = DataSilo(batch_size=batch_size, processor=processor)

08/17/2021 17:51:13 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
08/17/2021 17:51:13 - INFO - farm.data_handler.data_silo -   LOADING TRAIN DATA
08/17/2021 17:51:13 - INFO - farm.data_handler.data_silo -   Loading train set from: /Users/edwarddavies/Git/Trade_with_Twitter/data/fine_tune_bert/train.csv 
08/17/2021 17:51:21 - INFO - farm.data_handler.data_silo -   Got ya 7 parallel workers to convert 1279999 dictionaries to pytorch datasets (chunksize = 2000)...
08/17/2021 17:51:21 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0 
08/17/2021 17:51:21 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
08/17/2021 17:51:21 - INFO - farm.data_handler.data_silo -   /'\  / \  /'\  /'\  / \  / \  /'\
08/17/2021 17:51:21 - INFO - farm.data_handler.data_silo -               

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preprocessing Dataset /Users/edwarddavies/Git/Trade_with_Twitter/data/fine_tune_bert/train.csv: 100%|██████████| 1279999/1279999 [03:20<00:00, 6388.71 Dicts/s] 
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   LOADING DEV DATA
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   Loading dev set as a slice of train set
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   Took 127999 samples out of train set to create dev set (dev split is roughly 0.1)
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   LOADING TEST DATA
08/17/2021 17:54:42 - INFO - farm.data_handler.data_silo -   Loading test set from: /Users/edwarddavies/Git/Trade_with_Twitter/data/fine_tune_bert/test.csv
08/17/2021 17:54:44 - INFO - farm.data_handler.data_silo -   Got ya 7 parallel workers to convert 320000 dictionaries to pytorch datasets (chunksize = 2000)..

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Preprocessing Dataset /Users/edwarddavies/Git/Trade_with_Twitter/data/fine_tune_bert/test.csv: 100%|██████████| 320000/320000 [01:07<00:00, 4718.01 Dicts/s]
08/17/2021 17:55:52 - INFO - farm.data_handler.data_silo -   
08/17/2021 17:55:52 - INFO - farm.data_handler.data_silo -   DATASETS SUMMARY
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Examples in train: 1152000
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Examples in dev  : 127999
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Examples in test : 320000
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     341
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 24.387866319444445
08/17/2021 17:55:59 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.0


### Modeling

In [73]:
# loading the pretrained BERT base cased model
language_model = LanguageModel.load(lang_model)
# prediction head sized to the number of labels in label_list
# NOTE(review): a multi-label head is used here; presumably every tweet has exactly one
# sentiment label — confirm that multi-label classification is intended.
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

# Combine the language model and the prediction head into one trainable model on the selected device.
model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

08/17/2021 17:56:01 - INFO - farm.modeling.language_model -   
08/17/2021 17:56:01 - INFO - farm.modeling.language_model -   LOADING MODEL
08/17/2021 17:56:01 - INFO - farm.modeling.language_model -   Could not find bert-base-cased locally.
08/17/2021 17:56:01 - INFO - farm.modeling.language_model -   Looking on Transformers Model Hub (in local cache and online)...
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT 

In [74]:
# Create the optimizer and the learning-rate schedule for the model. The number of
# training batches and epochs is handed over so that the schedule can be sized.
# Note: `model` is rebound to the returned model, so this cell should be run once per model.
model, optimizer, lr_schedule = initialize_optimizer(
    model=model, device=device, learning_rate=3e-5,
    n_epochs=n_epochs, n_batches=len(data_silo.loaders["train"]),
)

08/17/2021 17:56:11 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
08/17/2021 17:56:16 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
08/17/2021 17:56:16 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 28800.0, 'num_training_steps': 288000}'


### Training

In [75]:
# Bundle the model, optimizer, schedule and data into a trainer; the dev set evaluation
# interval and the number of epochs come from the configuration cell above.
trainer = Trainer(
    model=model, optimizer=optimizer, lr_schedule=lr_schedule,
    data_silo=data_silo, epochs=n_epochs, evaluate_every=evaluate_every,
    n_gpu=n_gpu, device=device,
)

In [76]:
# Run the fine-tuning.
# NOTE: the recorded output of this cell shows that the run was stopped manually
# (KeyboardInterrupt) after 16 of 144000 steps at roughly 49 s per step on CPU, so the
# model that is saved further down was not trained to completion in that run.
trainer.train()

08/17/2021 17:56:24 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/1 (Cur. train loss: 0.6508):   0%|          | 16/144000 [13:05<1962:31:39, 49.07s/it]


KeyboardInterrupt: 

## Saving and Inferencing

In [None]:
# Store the model and the processor in the same directory; Inferencer.load(save_dir)
# further down reads from this directory.
save_dir = "../data/saved_models/bert"
model.save(save_dir)
processor.save(save_dir)

In [0]:
# to download the model
!zip -r saved_models/model.zip saved_models/bert-english-news-article

In [0]:
# Load the saved model from save_dir for inference.
inferenced_model = Inferencer.load(save_dir)

In [0]:
def read_file(file_name: str) -> dict:
  """Read a text file and wrap its content in a single-key dict.

  Args:
    file_name: Path of the text file to read.

  Returns:
    A dict ``{'text': content}`` in which ``content`` is the file content with
    every newline character replaced by a single space.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # The context manager closes the file handle even when reading fails.
  with open(file_name, 'r') as text_file:
    content = text_file.read()
  return {'text': content.replace('\n', ' ')}

In [0]:
def create_input(text_files:list) -> list:
  """Turn a list of article descriptions into the list of model input dicts.

  Args:
    text_files: Dicts that each contain the path of a text file under the key 'file'.

  Returns:
    One dict per entry, as produced by read_file, in the same order as the input.
  """
  return [read_file(entry['file']) for entry in text_files]

In [0]:
def create_result_overview (articles:list, result:list) -> pd.DataFrame:
  """Build a table that compares the expected label of each article with its prediction.

  Args:
    articles: Dicts with the keys 'file' and 'genre', in the order in which the
      corresponding texts were passed to the model.
    result: Inference output: a list of dicts that each hold a 'predictions' list
      whose entries carry a 'label' string.

  Returns:
    A DataFrame with the columns 'file', 'actual' and 'prediction' and one row per article.

  Raises:
    IndexError: If ``result`` contains fewer predictions than there are articles.
  """
  # Collect the predictions of every result entry, not only of result[0], so that
  # articles beyond the first entry stay aligned with their prediction.
  all_predictions = [prediction for entry in result for prediction in entry['predictions']]
  if len(all_predictions) < len(articles):
    raise IndexError(
        f"got {len(all_predictions)} predictions for {len(articles)} articles")

  # Surplus predictions are ignored; zip stops at the last article.
  data = {
      'file': [article['file'] for article in articles],
      'actual': [article['genre'] for article in articles],
      # The label is a stringified list such as "['sport']"; strip the brackets and quotes.
      'prediction': [prediction['label'].strip("'[]'")
                     for _, prediction in zip(articles, all_predictions)],
  }
  return pd.DataFrame(data)

In [0]:
# Sample articles to classify, each with the label that is expected for it.
# NOTE(review): the model configured above uses label_list ['0', '2', '4'], so its predicted
# labels presumably cannot match the genre names used here — confirm whether this cell is a
# leftover that still needs to be adapted to the tweet data.
articles = [{'file': 'bbc_news/generated_data/inferencing/business.txt', 'genre': 'business'},
            {'file': 'bbc_news/generated_data/inferencing/sport.txt', 'genre': 'sport'}]

# Read the article texts into the list of dicts that is passed to the inferencer.
article_texts = create_input(articles)

# Run the loaded model on the texts.
result = inferenced_model.inference_from_dicts(article_texts)

# Put expected and predicted labels side by side.
df = create_result_overview(articles, result)

df.head()