# Dependencies


In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
Mon Mar  1 12:25:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------

In [None]:
!pip install transformers==4.2.1
!pip install optuna==2.3.0

Collecting transformers==4.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/cd/40/866cbfac4601e0f74c7303d533a9c5d4a53858bd402e08e3e294dd271f25/transformers-4.2.1-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 34.1MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/fb/36/59e4a62254c5fcb43894c6b0e9403ec6f4238cc2422a003ed2e6279a1784/tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 49.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=b2ace620d7a7

In [None]:
!mkdir data
!mkdir train

# Preparing Data

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
# Registry of prepared datasets; later cells append to it and select an entry by name.
all_datasets= []

In [None]:
class Dataset:
    """Plain container bundling a named train/test split with its label set.

    Note: a later cell imports ``Dataset`` from ``torch.utils.data``, which
    rebinds this name. Cells that instantiate this class must therefore run
    before that import.

    Attributes:
        name: identifier used to look the dataset up.
        train: the training split.
        test: the held-out split.
        label_list: the distinct label values.
    """

    def __init__(self, name, train, test, label_list):
        # Keep the four fields exactly as they were passed in.
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list

In [None]:
# Names assigned to the two columns of the loaded sentences file.
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [None]:
# Load the sentences file: no header row, two fields per line separated by '@'.
df = pd.read_csv('Sentences_AllAgree.txt', sep="@", encoding ='ISO-8859-1' , header=None)
df.columns = [DATA_COLUMN, LABEL_COLUMN]
print(df)
print(df[LABEL_COLUMN].value_counts())

# A fixed random_state keeps the 80/20 split reproducible across runs.
train_split, test_split = train_test_split(df, test_size=0.2, random_state=42)
# Label order follows first appearance in the file; the label -> id mapping
# built later is derived from this order.
label_list = list(df[LABEL_COLUMN].unique())
print(label_list)

data_All_Agree = Dataset(
    "All_Agree", train_split, test_split, label_list
)

# Replace any entry already registered under the same name so that re-running
# this cell does not leave duplicate datasets in the registry.
all_datasets[:] = [d for d in all_datasets if d.name != data_All_Agree.name]
all_datasets.append(data_All_Agree)

                                                   text     label
0     According to Gran , the company has no plans t...   neutral
1     For the last quarter of 2010 , Componenta 's n...  positive
2     In the third quarter of 2010 , net sales incre...  positive
3     Operating profit rose to EUR 13.1 mn from EUR ...  positive
4     Operating profit totalled EUR 21.1 mn , up fro...  positive
...                                                 ...       ...
2259  Operating result for the 12-month period decre...  negative
2260  HELSINKI Thomson Financial - Shares in Cargote...  negative
2261  LONDON MarketWatch -- Share prices ended lower...  negative
2262  Operating profit fell to EUR 35.4 mn from EUR ...  negative
2263  Sales in Finland decreased by 10.5 % in Januar...  negative

[2264 rows x 2 columns]
neutral     1391
positive     570
negative     303
Name: label, dtype: int64
['neutral', 'positive', 'negative']


In [None]:
# Show the name of every dataset registered so far.
for registered in all_datasets:
    print(registered.name)

All_Agree


# BERT compliant Dataset and model_init

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

# Configure root logging at WARNING level and create this notebook's own logger.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [None]:
# Experiment configuration read by the cells below.
dataset_name = 'All_Agree'  # must equal the `name` of an entry in all_datasets
model_name = 'bert-base-uncased'  # checkpoint used for both the tokenizer and the model
task_name = 'classification'
max_len = 256  # every example is truncated / padded to this many token ids

In [None]:
# Look the configured dataset up by name. Fail loudly when it is missing instead
# of leaving `selected_dataset` undefined (or stale from an earlier run).
selected_dataset = next((d for d in all_datasets if d.name == dataset_name), None)
if selected_dataset is None:
    raise ValueError(
        "Dataset %r not found; available: %s"
        % (dataset_name, [d.name for d in all_datasets])
    )
print(selected_dataset.name)
print('Dataset found')

All_Agree
Dataset found


In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per access.

    Every item is an ``InputFeatures`` whose ``input_ids`` and
    ``attention_mask`` are padded to exactly ``max_len`` entries, together
    with the integer class id of the example.

    Args:
        text: indexable sequence of raw sentences.
        target: indexable sequence of labels, aligned with ``text``.
        model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        max_len: fixed length of every encoded example.
        label_map: mapping from label value to integer class id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # `super(BERTDataset)` without `self` only builds an unbound super
        # object, so the parent initializer was never invoked; the
        # zero-argument form calls it properly.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of sentences."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode the sentence at position ``item`` and attach its class id.

        Raises:
            KeyError: if the example's label is missing from ``label_map``.
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first',
        )

        # Real tokens are marked 1; the padding appended below is marked 0.
        attention_mask = [1] * len(input_ids)

        # Pad with the tokenizer's pad id up to the fixed sequence length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [None]:
# Each label's integer class id is its position in the dataset's label list.
label_map = dict(
    (label, position)
    for position, label in enumerate(selected_dataset.label_list)
)
print(label_map)

# Wrap both splits; the columns are converted to plain lists first.
train_dataset = BERTDataset(
    selected_dataset.train[DATA_COLUMN].to_list(),
    selected_dataset.train[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)
test_dataset = BERTDataset(
    selected_dataset.test[DATA_COLUMN].to_list(),
    selected_dataset.test[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)

{'neutral': 0, 'positive': 1, 'negative': 2}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
def model_init():
    """Build a fresh sequence-classification model from the configured checkpoint.

    Reads the globals ``model_name`` and ``label_map`` at call time, so the
    result depends on their current values when this function is invoked.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``,
        with one output per entry in ``label_map``.
    """
    class_count = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
        num_labels=class_count,
    )

The compute_metrics function can be used to compute custom metrics during training/evaluation

In [None]:
def compute_metrics(p): #p should be of type EvalPrediction
    """Compute macro-averaged classification metrics for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (gold class ids of the same length).

    Returns:
        dict with the keys 'macro_f1', 'macro_f1_pos_neg', 'macro_precision',
        'macro_recall' and 'accuracy'.
    """
    # The predicted class is the one with the highest score in each row.
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)

    # zero_division=0 keeps the value sklearn already falls back to (0.0) for a
    # class that is never predicted, without emitting a warning on every
    # evaluation.
    # labels=[1, 2] restricts this score to class ids 1 and 2, which are
    # 'positive' and 'negative' in the label map printed earlier in this
    # notebook; revisit if the label order changes.
    macro_f1_pos_neg = f1_score(p.label_ids, preds, average='macro', labels=[1, 2], zero_division=0)
    macro_f1 = f1_score(p.label_ids, preds, average='macro', zero_division=0)
    macro_precision = precision_score(p.label_ids, preds, average='macro', zero_division=0)
    macro_recall = recall_score(p.label_ids, preds, average='macro', zero_division=0)
    acc = accuracy_score(p.label_ids, preds)
    return {
        'macro_f1' : macro_f1,
        'macro_f1_pos_neg' : macro_f1_pos_neg,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'accuracy': acc
    }

# Hyper Parameter Search

In [None]:
# Base arguments shared by every hyper-parameter trial; the keys returned by
# `my_hp_space` (learning_rate, seed, warmup_steps) are the ones varied per trial.
training_args = TrainingArguments("./train")
# `evaluate_during_training` is no longer assigned: setting it on an already
# constructed TrainingArguments has no effect. `evaluation_strategy` below is
# what enables evaluation.
training_args.adam_epsilon = 1e-8
training_args.lr_scheduler_type = 'cosine'
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8
# Evaluate once at the end of every epoch.
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# Far more than the number of optimizer steps in a run, so no intermediate checkpoint is written.
training_args.save_steps = 100000
training_args.disable_tqdm = True

In [None]:
# Count optimizer updates the way Trainer does: batches per epoch (the last,
# partial batch included) floor-divided by the accumulation factor. Flooring
# len(train) / (batch * accumulation) directly undercounts by one step per
# epoch for this dataset.
# Assumes a single device, i.e. per_device_train_batch_size is the whole batch.
batches_per_epoch = -(-len(selected_dataset.train) // training_args.per_device_train_batch_size)
steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)

56
448


In [None]:
# Trainer used for the hyper-parameter search. A `model_init` callable is passed
# instead of a model instance so that a new model can be created for each trial.
# The test split doubles as the evaluation set here.
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=test_dataset, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Here you can define your search space.

The `my_hp_space` function defines the full set of hyperparameters and their ranges, from which you can choose a subset (or even the whole set) for the grid search defined in `search_space`.

Note: You can include the epoch count as a hyperparameter, but this will drastically increase the search space. I prefer setting a fixed epoch count and then manually looking for the highest score across the epochs, since Optuna can't do that as far as I know. This should be easy, as you most likely won't be training for more than a handful of epochs. Alternatively, you can check out transformers.EarlyStoppingCallback in https://huggingface.co/transformers/main_classes/callback.html

In [None]:
def my_hp_space(trial):
    """Declare the hyper-parameter distributions sampled for one trial.

    Args:
        trial: the optuna trial object whose ``suggest_*`` methods are called.

    Returns:
        dict with the suggested 'learning_rate', 'seed' and 'warmup_steps'.
    """
    # suggest_int works on integers: the previous float bounds (44.8 with a
    # step of 22.4 for 448 total steps) did not describe the integer grid
    # values 0, 22, 44 used by the grid search. Truncate both, and make the
    # upper bound an exact multiple of the step so the range is well-formed.
    warmup_step = max(1, int(total_steps * 0.1 * 0.5))
    warmup_high = (int(total_steps * 0.1) // warmup_step) * warmup_step
    return {
        "learning_rate": trial.suggest_float("learning_rate", 2e-5, 7e-5, step=1e-5),
        "seed": trial.suggest_categorical("seed", [0, 1, 42, 666, 123, 12345]),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, warmup_high, step=warmup_step)
    }

# Explicit grid passed to optuna's GridSampler further down.
search_space = {
    # np.arange excludes its stop value, so the grid ends at 6e-5 (see the
    # displayed output) even though my_hp_space allows values up to 7e-5.
    "learning_rate":  list(np.arange(2e-5, 7e-5, 1e-5)),
    "seed":  [0, 1, 42, 666, 123, 12345],
    # 0 %, 5 % and 10 % of the total step count, truncated to integers.
    "warmup_steps": list(range(0, int((total_steps)*0.1)+1, int(total_steps*0.1*0.5)))
}
# Bare expression so the notebook displays the resulting grid.
search_space

{'learning_rate': [2e-05,
  3.0000000000000004e-05,
  4.000000000000001e-05,
  5.000000000000001e-05,
  6.000000000000001e-05],
 'seed': [0, 1, 42, 666, 123, 12345],
 'warmup_steps': [0, 22, 44]}

In [None]:
def my_objective(metrics):
    """Pick the value the search optimizes out of the evaluation metrics.

    Args:
        metrics: mapping of metric names to values; must contain 'eval_macro_f1'.

    Returns:
        The value stored under 'eval_macro_f1'.
    """
    objective_value = metrics['eval_macro_f1']
    return objective_value

Make sure to mount your google drive here to save to drive. Otherwise change 'storage' to a local directory

In [None]:
name = "Test 1"

# Grid search over `search_space`, maximizing the value returned by
# `my_objective`; the no-op pruner means no trial is stopped early.
best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=my_hp_space,
    compute_objective=my_objective,
    n_trials=None,
    pruner=optuna.pruners.NopPruner(),
    sampler=optuna.samplers.GridSampler(search_space),
    study_name=name,
    # change this to a local directory if you want to save to disk
    storage="sqlite:////content/drive/MyDrive/{}.db".format(name),
    # you can change this to true, for continuing the search
    load_if_exists=False,
)



In [None]:
# Display the result returned by the hyper-parameter search.
best_run

`hyperparameter_search` supports both Optuna and Ray Tune as backends; see https://huggingface.co/blog/ray-tune

# Regular Training

In [None]:
# Arguments for a single, regular fine-tuning run.
training_args = TrainingArguments("./train")
# `evaluate_during_training` is no longer assigned: setting it on an already
# constructed TrainingArguments has no effect. `evaluation_strategy` below is
# what enables evaluation.
training_args.adam_epsilon = 1e-6
training_args.learning_rate = 2e-5
training_args.weight_decay = 0.01
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 3


# Count optimizer updates the way Trainer does: batches per epoch (the last,
# partial batch included) floor-divided by the accumulation factor. Flooring
# len(train) / (batch * accumulation) directly undercounts by one step per
# epoch for this dataset.
# Assumes a single device, i.e. per_device_train_batch_size is the whole batch.
batches_per_epoch = -(-len(selected_dataset.train) // training_args.per_device_train_batch_size)
steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
# warmup_steps is a step count, so store a whole number (or set the warmup steps directly).
training_args.warmup_steps = int(total_steps * warmup_ratio)

# Evaluate once at the end of every epoch.
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
#training_args.load_best_model_at_end = True
#training_args.metric_for_best_model = 'accuracy'
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 25
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

56
168


In [None]:
# Trainer for the regular run: here a concrete model instance is created up front.
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    #callbacks = [EarlyStoppingCallback(early_stopping_patience=3,early_stopping_threshold=0.001)]
)
# The warning message below is fine, and it even says so itself: we start from a
# BERT checkpoint that has not been fine-tuned and instantiate it as
# BertForSequenceClassification, which adds its own classification weights.
# Those weights are new and are exactly what gets trained when fine-tuning on a
# downstream task.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Run the full training loop; per-epoch evaluation metrics are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
1,No log,0.436834,0.559729,0.364865,0.521062,0.615154,0.818985,2.788,162.483
2,No log,0.172677,0.926936,0.898512,0.92674,0.927605,0.953642,2.8363,159.713
3,No log,0.13881,0.928077,0.900252,0.922676,0.935547,0.953642,2.8673,157.991



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



TrainOutput(global_step=171, training_loss=0.42305313913445725, metrics={'train_runtime': 102.2328, 'train_samples_per_second': 1.673, 'total_flos': 913658179355136, 'epoch': 3.0})

You can also add custom loss/training step etc by subclassing and overriding the default Trainer functions. See https://huggingface.co/transformers/main_classes/trainer.html. 

Once the model is trained, we can save it to disk along with its corresponding tokenizer.

In [None]:
# Save the trained model into the TestModel directory.
trainer.save_model("TestModel")
# Load the tokenizer of `model_name` and save it into the same directory, so the
# directory holds both the model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("TestModel")

('TestModel/tokenizer_config.json',
 'TestModel/special_tokens_map.json',
 'TestModel/vocab.txt',
 'TestModel/added_tokens.json')

Now we can load the model back in and use HuggingFace's pipelines to get predictions, which saves us from having to write our own prediction functions. Note that you have to save the model together with its tokenizer; otherwise the `.from_pretrained` function will raise an error, because it won't find some of the files it expects to see in the directory.

In [None]:
from transformers import pipeline, AutoModel
# Use a dedicated name for the saved-model directory: reassigning the global
# `model_name` here would silently change what `model_init()` loads if it is
# called again later.
finetuned_model_dir = "TestModel"
num_labels=len(label_map)

model = AutoModelForSequenceClassification.from_pretrained(finetuned_model_dir, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)

# return_all_scores=True returns one score per class instead of only the top one.
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
# Score one example sentence with the fine-tuned model loaded from TestModel.
text = "We saw a big increase in profits last year"
classifier(text)

[[{'label': 'LABEL_0', 'score': 0.030508000403642654},
  {'label': 'LABEL_1', 'score': 0.902646005153656},
  {'label': 'LABEL_2', 'score': 0.06684602797031403}]]

In [None]:
# This is an example of what NOT to do: don't load a BERT model that isn't
# fine-tuned, or one that was fine-tuned on a different task, to perform
# sentiment analysis. The warning message below should be a clear indicator that
# something is wrong. It's normal to see this message during training, but not
# while using a pipeline to get predictions.
model_name = "bert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, return_all_scores=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Same sentence through the checkpoint that was NOT fine-tuned: its classification
# weights are newly initialized (see the warning above), so these scores carry no meaning.
text = "We saw a big increase in profits last year"
classifier(text)

[[{'label': 'NEGATIVE', 'score': 0.0019726064056158066},
  {'label': 'POSITIVE', 'score': 0.9980273842811584}]]

In [None]:
# A dedicated variable is used instead of overwriting the global `model_name`,
# which `model_init()` reads whenever it is called.
star_rating_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(star_rating_model_name)
tokenizer = AutoTokenizer.from_pretrained(star_rating_model_name)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
# Same sentence through the nlptown model, which outputs scores for '1 star' to '5 stars'.
text = "We saw a big increase in profits last year"
classifier(text)

[[{'label': '1 star', 'score': 0.15750762820243835},
  {'label': '2 stars', 'score': 0.18700267374515533},
  {'label': '3 stars', 'score': 0.2065645456314087},
  {'label': '4 stars', 'score': 0.2183638960123062},
  {'label': '5 stars', 'score': 0.2305612415075302}]]

For a collection of models see https://huggingface.co/models

One proposed method to (possibly) improve training is ULMFiT. To implement the ULMFiT method, we need three things: Slanted Triangular Learning Rates (SLTR), Discriminative Fine-tuning and Gradual Unfreezing. I would highly recommend you look at fastai for those implementations; however, they should be doable manually with HuggingFace.

# SLTR

In [None]:
# This is fairly easy to implement, and is actually a function of the optimizer. Since the training args
# default to an instance of the AdamW optimizer with a linear scheduler, all we have to change in our code
# is the scheduler type in the training args:

# training_args.lr_scheduler_type = 'cosine'   # omit this line from above ...
training_args.lr_scheduler_type = 'linear'     # ... and use this instead

# It's not recommended to implement SLTR on its own, as it has been shown to lead to worse performance when
# used alone; it is best used together with the rest of the ULMFiT methods.

# Discriminative Finetuning

In [None]:
# This is also a function of the optimizer. You can specify an optimizer in Trainer(), which accepts a native
# PyTorch optimizer. The following is adapted from the per-parameter-options example in the PyTorch docs.
# For BertForSequenceClassification the encoder lives in `model.bert` and the head in `model.classifier`
# (there is no `model.base` attribute).

optimizer = torch.optim.SGD([
    {'params': model.bert.parameters()},
    {'params': model.classifier.parameters(), 'lr': 1e-3}
], lr=1e-2, momentum=0.9)

# This means that model.bert's parameters will use the default learning rate of 1e-2 set above,
# model.classifier's parameters will use a learning rate of 1e-3, and a momentum of 0.9 will be used for all
# parameters.
# You should be able to extend this to further tune the learning rate for each group of encoder parameters.
# The same works for torch.optim.Adam.

# Gradual Unfreezing

In [None]:
# This is possibly the trickiest to implement, as trainer.train() runs the whole training and evaluation
# process in one go.
# The following freezes the BERT encoder layers, leaving only the classifier trainable.
# `model.bert` is the encoder of BertForSequenceClassification; `model.bert.bert` does not exist.
for param in model.bert.parameters():
    param.requires_grad = False
# Note that this works with a model initialized as BertForSequenceClassification. If you are using AutoModel
# or BertModel this would be slightly different, so please be wary of how your model is initialized.

# I am yet to test gradual unfreezing directly in HuggingFace, but you should be able to do the above, call
# trainer.training_step() to perform one training step, and then unfreeze each of the BERT layers gradually
# from the top down, spread over the total number of training steps printed above. Also note that
# .training_step() takes two inputs: the model and a dictionary of inputs and labels.


# As I said, look to the fastai documentation to implement ULMFiT. The docs are quite bad, but fastai was
# developed by the authors of ULMFiT, so it will likely perform better than your manual implementation of the
# ULMFiT method. It's somewhat tricky to get it working with HuggingFace pretrained models, but there are some
# examples online (on Kaggle etc.) of how to do it.

As I said, look to the fastai docs to implement ULMFiT. They're not the best, but fastai was developed by the authors of ULMFiT, so it will likely perform better than your manual implementation of the ULMFiT method. It's somewhat tricky to get it working with HuggingFace pretrained models, but there are some examples online (on Kaggle etc.) of how to do it.

Finally, it is not a given that the ULMFiT method will lead to better performance, since it was not specifically developed for BERT but rather for fine-tuning on NLP tasks in general. Dogu Araci showed that it allowed for slightly better performance when training FinBERT (a 2-3% accuracy increase); however, others have said that they did not experience a performance increase compared to a naive fine-tuning approach such as the one above. The following paper, https://arxiv.org/pdf/2006.04884.pdf, offers some insights into why ULMFiT might work for BERT and why it may not be the best method to go with, and proposes a very simple alternative fine-tuning approach.

My recommendation is: if you're trying to maximise your model's performance, try everything — naive fine-tuning, ULMFiT and the method proposed in the paper above — and then see which performs best. Run a hyperparameter search for each method, and run each of the hyperparameter searches 2-3 times to account for any randomness during training, then see what gives you the best final result in terms of validation accuracy and loss. Of course, depending on the size of your training data and the compute power available to you, this may not be realistically doable; you may have to run each hyperparameter search only once, or omit the hyperparameter search entirely (although I would really not recommend that second approach).

The differences will likely be small (in the range of 1-5%), which should not be surprising: given how good BERT already is at what it does (it performs close to the human benchmark, and in some cases better) and the size of its original training corpus, it makes sense that it would be hard to optimize it further.