# Convert MaskedLM model to a SequenceClassification model

In [2]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer
import os

In [3]:
# Base directory (a relative path, trailing slash included) from which the
# model directory paths used in this notebook are built by string concatenation.
local_model_base_dir = "../../local_models/"

In [4]:
# Source directory: the MaskedLM checkpoint that will be converted.
MaskedLM_model_path = f"{local_model_base_dir}bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained"
# Target directory: same name with an "_SC" suffix, where the converted model is written.
SeqClassification_model_path = f"{local_model_base_dir}bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained_SC"

In [4]:
# Refuse to overwrite a previous conversion: stop here, before the cells that
# load and save the model, if the target path already exists.
if os.path.exists(SeqClassification_model_path):
    # FileExistsError is the built-in exception for this condition; it is still an
    # Exception subclass, so anything catching Exception keeps working.
    raise FileExistsError(f"Error! Directory {SeqClassification_model_path} already exists!")

## Load the MaskedLM checkpoint and save it as a SequenceClassification model

In [5]:
# Announce the planned conversion: source directory and destination directory.
print(
    f"Going to load the MaskedLM model from {MaskedLM_model_path} "
    f"and convert it into a SequenceClassification "
    f"model and save it at {SeqClassification_model_path}"
)

Going to load the MaskedLM model from ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained and convert it into a SequenceClassification model and save it at ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained_SC


In [6]:
# Load the MaskedLM checkpoint with a sequence-classification architecture.
# The load log printed after this cell reports the masked-LM prediction weights
# (`cls.predictions.*`) as unused.
# NOTE(review): no `num_labels` is passed, so the classification head size comes
# from the checkpoint config / library default — confirm it matches the downstream task.
model = AutoModelForSequenceClassification.from_pretrained(MaskedLM_model_path)

# Load the matching tokenizer from the same directory (slow/Python implementation).
# NOTE(review): `truncation` is normally given when calling the tokenizer rather than
# to `from_pretrained` — confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    MaskedLM_model_path,
    use_fast=False,
    truncation=True,
)

Some weights of the model checkpoint at ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized f

In [7]:
# Write the converted model first, then its tokenizer, into the same target directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SeqClassification_model_path)
print(f"Saved the SequenceClassification model at {SeqClassification_model_path}")

Saved the SequenceClassification model at ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained_SC


## Test the tokenizer

In [20]:
# Test for MWE single-tokens.
# The tokenizer is reloaded from the SequenceClassification directory so that the
# check exercises the files written to disk. The in-memory `tokenizer` was loaded
# from the MaskedLM directory, so testing it would not show whether the saved
# model actually carries the added tokens that the success message reports.
saved_tokenizer = AutoTokenizer.from_pretrained(SeqClassification_model_path, use_fast=False)

test_tokens = ["IDoffthebeatentrackID", "IDchapterandverseID", "IDjointheclubID", "IDtietheknotID"]

for token in test_tokens:
    test_sent = f'This is a {token}'
    tokenized = saved_tokenizer.tokenize(test_sent)
    # Each MWE placeholder must come back as one lower-cased token; if it were split
    # into word pieces the last piece would not equal the whole placeholder.
    assert tokenized[-1] == token.lower(), (
        f"{token} was not kept as a single token by the tokenizer saved at "
        f"{SeqClassification_model_path}: {tokenized}"
    )
    print("Tokenized:", tokenized)
    print("Encoded:", saved_tokenizer.encode(test_sent.lower()))
    print()

print('\n')
print(f'SUCCESS!! The {SeqClassification_model_path} model has been updated with new tokens!!')

Tokenized: ['this', 'is', 'a', 'idoffthebeatentrackid']
Encoded: [101, 2023, 2003, 1037, 30522, 102]

Tokenized: ['this', 'is', 'a', 'idchapterandverseid']
Encoded: [101, 2023, 2003, 1037, 30530, 102]

Tokenized: ['this', 'is', 'a', 'idjointheclubid']
Encoded: [101, 2023, 2003, 1037, 30825, 102]

Tokenized: ['this', 'is', 'a', 'idtietheknotid']
Encoded: [101, 2023, 2003, 1037, 31366, 102]



SUCCESS!! The ../../local_models/bert-base-uncased_MaskedLM_STR_option1_3B1_pretrained_SC model has been updated with new tokens!!
