In [1]:
import argparse
import numpy as np
import pandas as pd
import torch
from transformers import BertForTokenClassification, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from utils import trim_entity_spans, convert_goldparse, ResumeDataset, tag2idx, idx2tag, get_hyperparameters, train_and_val_model

In [2]:
parser = argparse.ArgumentParser(description='Train Bert-NER')
parser.add_argument('-e', type=int, default=5, help='number of epochs')
parser.add_argument('-o', type=str, default='.',
                    help='output path to save model state')

_StoreAction(option_strings=['-o'], dest='o', nargs=None, const=None, default='.', type=<class 'str'>, choices=None, help='output path to save model state', metavar=None)

In [3]:
args = parser.parse_args().__dict__

usage: ipykernel_launcher.py [-h] [-e E] [-o O]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\TUF\AppData\Roaming\jupyter\runtime\kernel-42f730fc-3da6-4188-acfd-64e47fe625ae.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
MAX_LEN = 500
EPOCHS = 100
MAX_GRAD_NORM = 1.0
MODEL_NAME = 'bert-base-uncased'
TOKENIZER = BertTokenizerFast('./vocab/vocab.txt', lowercase=True) ## Important section
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
data = trim_entity_spans(convert_goldparse('./data/Resumes.json'))

In [6]:
total = len(data)
train_data, val_data = data[:180], data[180:]

In [7]:
train_d = ResumeDataset(train_data, TOKENIZER, tag2idx, MAX_LEN)
val_d = ResumeDataset(val_data, TOKENIZER, tag2idx, MAX_LEN)

val_dl = DataLoader(val_d, batch_size=4)

In [8]:
train_sampler = RandomSampler(train_d)
train_dl = DataLoader(train_d, sampler=train_sampler, batch_size=8)

In [10]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag2idx))
model.to(DEVICE)
optimizer_grouped_parameters = get_hyperparameters(model, True)
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_and_val_model(
    model,
    TOKENIZER,
    optimizer,
    EPOCHS,
    idx2tag,
    tag2idx,
    MAX_GRAD_NORM,
    DEVICE,
    train_dl,
    val_dl
)