In [2]:
import re, os, json, pickle, ast, time, random, requests
import pandas as pd
import numpy as np
import spacy
import scipy
import scipy.sparse as sp

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertForQuestionAnswering

from tqdm import tqdm
tqdm.pandas()

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GLOBAL_SEED = 1

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

GLOBAL_WORKER_ID = None
def _init_fn(worker_id):
    global GLOBAL_WORKER_ID
    GLOBAL_WORKER_ID = worker_id
    set_seed(GLOBAL_SEED + worker_id)

set_seed(GLOBAL_SEED)

In [4]:
def check_equal(actual, expected):
    assert actual == expected, actual

def check_approx(actual, expected):
    assert np.allclose(actual, expected), actual

In [5]:
df_squad = pd.read_csv(
    "cleaned_squad_data.csv",
    dtype = {
        "question" : str,
        "context_paragraph" : str,
        "answer" : str,
        "answer_start" : int,
        "answer_end" : int,
        "answer_sent_index" : int,
        "tokenized_context" : str
    },
    converters = {"context_sentences" : ast.literal_eval}
)

df_squad.head(5)

Unnamed: 0,question,context_paragraph,answer,answer_start,answer_end,context_sentences,answer_sent_index
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,286,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,226,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,530,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,180,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,286,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1


Since the cell contents are truncated, let's print out and examine one row in detail:

In [6]:
df_squad.iloc[3].to_dict()

{'question': 'In what city and state did Beyonce  grow up? ',
 'context_paragraph': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'answer': 'Houston, Texas',
 'answer_start': 166,
 'answer_end': 180,
 'context_sentences': ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, re

In [27]:
class SQuADDataset(Dataset):
    def __init__(self, df_squad):
        """
        Class constructor.
        """
        self.data = df_squad
        self.data_cols = ["question", "context_paragraph", "answer_start", "answer_end"]

    def __len__(self):
        """
        Get the dataset length.
        """
        return len(self.data)

    def __getitem__(self, index):
        """
        Get the question, context paragraph, answer start and answer end value
        at the row specified by the input index from the dataset.
        """
        return tuple(self.data.loc[index, self.data_cols])

In [28]:
class SQuADTokenizer:
    def __init__(self, tokenizer, max_len = 512):
        """
        Store the input BertTokenizer instance and the max length
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """
        Perform tokenization on a batch of data

        args:
            batch (Tuple[questions, contexts, answer_starts, answer_ends]):
                questions (List[str]) : a list of questions
                contexts (List[str]) : a list of context paragraphs
                answer_starts (List[int]) : a list of answer start indexes
                answer_ends (List[int]) : a list of answer end indexes

        returns:
            Tuple[encoding, token_starts, token_ends]
                encoding (dict[str, tensor]): the output of calling tokenizer on the questions and contexts
                token_starts (List[int]) : the list of indexes for the tokens that correspond to the first answer character
        """
        questions, contexts, answer_starts, answer_ends = batch
        encoding = self.tokenizer(questions, contexts, padding='longest', truncation=True, max_length=self.max_len, return_tensors='pt')

        token_starts, token_ends = [], []
        for i, (answer_start, answer_end) in enumerate(zip(torch.LongTensor(answer_starts), torch.LongTensor(answer_ends))):
          token_start = encoding.char_to_token(i, answer_start, sequence_index=1)
          if token_start is None:
              token_start = encoding.char_to_token(i, answer_start+1, sequence_index=1)
          if token_start is None:
              token_start = encoding.char_to_token(i, answer_start-1, sequence_index=1)
          if token_start is None:
              token_start = self.max_len
          token_starts.append(token_start)

          token_end = encoding.char_to_token(i, answer_end, sequence_index=1)
          if token_end is None:
              token_end = encoding.char_to_token(i, answer_end+1, sequence_index=1)
          if token_end is None:
              token_end = encoding.char_to_token(i, answer_end-1, sequence_index=1)
          if token_end is None:
              token_end = self.max_len
          token_ends.append(token_end)


        return encoding, token_starts, token_ends

In [23]:
def test_tokenizer():
    tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    batch_tokenizer = SQuADTokenizer(tokenizer)
    example_1 = (
        ['When did Beyonce start becoming popular?'],
        ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'],
        [269],
        [286]
    )
    encoding, token_starts, token_ends = batch_tokenizer(example_1)
    check_equal(list(encoding["input_ids"].shape), [1, 174])
    check_equal(encoding["input_ids"][0][:10].numpy().tolist(), [
        101, 2043, 2106, 20773, 2707, 3352, 2759, 1029, 102, 20773
    ])
    check_equal(encoding["token_type_ids"].numpy().tolist(), [[0]*9 + [1]*165])
    check_equal(encoding["attention_mask"].numpy().tolist(), [[1] * 174])
    check_equal(token_starts, [75])
    check_equal(token_ends, [79])

    example_2 = (
        ['When did Beyonce start becoming popular?', 'What score did the writer from the Chicago Tribune give to Spectre?'],
        ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'Critical appraisal of the film was mixed in the United States. In a lukewarm review for RogerEbert.com, Matt Zoller Seitz gave the film 2.5 stars out of 4, describing Spectre as inconsistent and unable to capitalise on its potential. Kenneth Turan, reviewing the film for Los Angeles Times, concluded that Spectre "comes off as exhausted and uninspired". Manohla Dargis of The New York Times panned the film as having "nothing surprising" and sacrificing its originality for the sake of box office returns. Forbes\' Scott Mendelson also heavily criticised the film, denouncing Spectre as "the worst 007 movie in 30 years". Darren Franich of Entertainment Weekly viewed Spectre as "an overreaction to our current blockbuster moment", aspiring "to be a serialized sequel" and proving "itself as a Saga". While noting that "[n]othing that happens in Spectre holds up to even minor logical scrutiny", he had "come not to bury Spectre, but to weirdly praise it. Because the final act of the movie is so strange, so willfully obtuse, that it deserves extra attention." In a positive review Rolling Stone, Peter Travers gave the film 3.5 stars out of 4, describing "The 24th movie about the British MI6 agent with a license to kill is party time for Bond fans, a fierce, funny, gorgeously produced valentine to the longest-running franchise in movies". Other positive reviews from Mick LaSalle from the San Francisco Chronicle, gave it a perfect 100 score, stating: “One of the great satisfactions of Spectre is that, in addition to all the stirring action, and all the timely references to a secret organization out to steal everyone’s personal information, we get to believe in Bond as a person.” Stephen Whitty from the New York Daily News, gave it an 80 grade, saying: “Craig is cruelly efficient. Dave Bautista makes a good, Oddjob-like assassin. And while Lea Seydoux doesn’t leave a huge impression as this film’s “Bond girl,” perhaps it’s because we’ve already met — far too briefly — the hypnotic Monica Bellucci, as the first real “Bond woman” since Diana Rigg.” Richard Roeper from the Chicago Sun-Times, gave it a 75 grade. He stated: “This is the 24th Bond film and it ranks solidly in the middle of the all-time rankings, which means it’s still a slick, beautifully photographed, action-packed, international thriller with a number of wonderfully, ludicrously entertaining set pieces, a sprinkling of dry wit, myriad gorgeous women and a classic psycho-villain who is clearly out of his mind but seems to like it that way.” Michael Phillips over at the Chicago Tribune, gave it a 75 grade. He stated: “For all its workmanlike devotion to out-of-control helicopters, “Spectre” works best when everyone’s on the ground, doing his or her job, driving expensive fast cars heedlessly, detonating the occasional wisecrack, enjoying themselves and their beautiful clothes.” Guy Lodge from Variety, gave it a 70 score, stating: “What’s missing is the unexpected emotional urgency of “Skyfall,” as the film sustains its predecessor’s nostalgia kick with a less sentimental bent.”'],
        [269, 2118],
        [286, 2120]
    )
    encoding, token_starts, token_ends = batch_tokenizer(example_2)
    check_equal(list(encoding["input_ids"].shape), [2, 512])
    check_equal(encoding["input_ids"][0][-10:].numpy().tolist(), [0]*10)
    check_equal(encoding["token_type_ids"].numpy().tolist()[0], [0]*9 + [1]*165 + [0]*338)
    check_equal(encoding["token_type_ids"].numpy().tolist()[1], [0]*16 + [1]*496)
    check_equal(encoding["attention_mask"][0].numpy().tolist(), [1]*174 + [0]*338)
    check_equal(token_starts, [75, 512])
    check_equal(token_ends, [79, 512])
    print("All tests passed!")

test_tokenizer()

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

All tests passed!


In [31]:
def fine_tune_bert(model, n_epochs, optimizer, dataloader, squad_tokenizer, device, verbose = False):
    """
    Fine-tune a pre-trained BERT model on the SQuAD dataset.

    args:
        model (BertForQuestionAnswering) : a pre-trained BERT model for QA tasks
        n_epochs (int) : the number of epochs to train
        dataloader (DataLoader) : a data loader that provides access to one batch of data at a time
        squad_tokenizer (SQuADTokenizer) : a tokenizer instance to be called on every batch of data from dataloader
        device (torch.device) : the device (CPU or Cuda) that the model and data should be moved to
        verbose (bool) : a flag that indicates whether debug messages should be printed out

    return:
        model (BertForQuestionAnswering) : the fine-tuned model
        avg_loss (float) : the average training loss across epochs
    """
    model.to(device)
    total_loss = 0.0

    for epoch in tqdm(range(n_epochs)):
      epoch_loss = 0.0
      for batch in dataloader:
        optimizer.zero_grad()
        encoding, start_positions, end_positions = squad_tokenizer(batch)
        encoding = {k: v.to(device) for k,v in encoding.items()}
        start_positions = torch.tensor(start_positions).to(device)
        end_positions = torch.tensor(end_positions).to(device)

        loss = model(**encoding, start_positions=start_positions, end_positions=end_positions).loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

      total_loss += epoch_loss
      if verbose:
        print(f'Epoch {epoch+1} Loss: {epoch_loss/len(dataloader)}')

    return model, total_loss / n_epochs

In [30]:
def test_fine_tune_bert():
    tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    squad_tokenizer = SQuADTokenizer(tokenizer)

    """Train on 8 data points"""
    train_dataset = SQuADDataset(df_squad.head(8)[["question", "context_paragraph", "answer_start", "answer_end"]])
    squad_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False, num_workers=6, worker_init_fn=_init_fn)
    model = BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    model, avg_train_loss = fine_tune_bert(model, 1, optimizer, squad_dataloader, squad_tokenizer, device)
    assert avg_train_loss < 5.0, avg_train_loss

    """Train on 100 data points"""
    train_dataset = SQuADDataset(df_squad.head(100)[["question", "context_paragraph", "answer_start", "answer_end"]])
    squad_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False, num_workers=6, worker_init_fn=_init_fn)
    model = BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    model, avg_train_loss = fine_tune_bert(model, 1, optimizer, squad_dataloader, squad_tokenizer, device)
    assert avg_train_loss < 38, avg_train_loss
    print("All tests passed!")

%time test_fine_tune_bert()

100%|██████████| 1/1 [00:16<00:00, 16.88s/it]
100%|██████████| 1/1 [00:10<00:00, 10.99s/it]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [32]:
model = BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
train_indexes, test_indexes = train_test_split(df_squad.index, train_size = 0.8, random_state = 0)
df_squad_train = df_squad.loc[train_indexes, ["question", "context_paragraph", "answer_start", "answer_end"]].reset_index()
df_squad_test = df_squad.loc[test_indexes].reset_index()

train_dataset = SQuADDataset(df_squad_train)
train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=False, num_workers=6, worker_init_fn=_init_fn, pin_memory = True)
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
squad_tokenizer = SQuADTokenizer(tokenizer)

%time tuned_model, avg_train_loss = fine_tune_bert(model, 1, optimizer, train_dataloader, squad_tokenizer, device)

100%|██████████| 1/1 [2:12:49<00:00, 7969.26s/it]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [33]:
tuned_model.cpu().save_pretrained("bert_fine_tuned_squad")

In [16]:
def get_bert_prediction(questions, contexts, device, model=None, tokenizer=None):
    '''
    Given a list of questions and a list of corresponding contexts, predict the answers using BERT.

    args:
        questions (List[string]): list of questions to be answered
        contexts (List[string]): list of context paragraphs, each for answering a question in the input questions
        device (torch.device) : the device (CPU or Cuda) that the model and data should be moved to
        model (BertForQuestionAnswering): BERT model to be used for question answering
            or None - if None, `bertserini-bert-base-squad` will be loaded
        tokenizer (BertTokenizerFast object): tokenizer to be used for encoding questions and contexts
            or None - if None, `bertserini-bert-base-squad` will be loaded
    return:
        outputs (List[string]): list of generated answers
    '''

    if model is None:
        model = BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    if tokenizer is None:
        tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')

    model.to(device)
    model.eval()

    outputs = []

    for question, context in tqdm(zip(questions, contexts), total=len(questions)):

        encoded_seq = tokenizer(question, context, padding="longest", truncation=True, max_length=512)

        tokens = tokenizer.convert_ids_to_tokens(encoded_seq["input_ids"])

        input_ids = torch.LongTensor([encoded_seq["input_ids"]]).to(device)
        token_type_ids = torch.LongTensor([encoded_seq["token_type_ids"]]).to(device)
        attention_mask = torch.FloatTensor([encoded_seq["attention_mask"]]).to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)
        logits_start, logits_end = output['start_logits'], output['end_logits']
        token_start = torch.argmax(logits_start)
        token_end = torch.argmax(logits_end)

        outputs.append(tokenizer.convert_tokens_to_string(tokens[token_start:token_end]))
    return outputs

In [17]:
print("question:\n{}\ncontext:\n{}\nanswer:\n{}\n".format(
    df_squad.loc[200, "question"],
    df_squad.loc[200, "context_paragraph"],
    df_squad.loc[200, "answer"]
))

print("Bert prediction:", get_bert_prediction(
    df_squad.loc[[200], "question"],
    df_squad.loc[[200], "context_paragraph"],
    device,
    model = tuned_model
))

question:
How many awards was Beyonce nominated for at the 52nd Grammy Awards?
context:
At the 52nd Annual Grammy Awards, Beyoncé received ten nominations, including Album of the Year for I Am... Sasha Fierce, Record of the Year for "Halo", and Song of the Year for "Single Ladies (Put a Ring on It)", among others. She tied with Lauryn Hill for most Grammy nominations in a single year by a female artist. In 2010, Beyoncé was featured on Lady Gaga's single "Telephone" and its music video. The song topped the US Pop Songs chart, becoming the sixth number-one for both Beyoncé and Gaga, tying them with Mariah Carey for most number-ones since the Nielsen Top 40 airplay chart launched in 1992. "Telephone" received a Grammy Award nomination for Best Pop Collaboration with Vocals.
answer:
ten



100%|██████████| 1/1 [00:00<00:00, 39.00it/s]

Bert prediction: ['ten']





In [18]:
pretrained_test_prediction = get_bert_prediction(
    df_squad_test["question"], df_squad_test["context_paragraph"],
    device, BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
)
print( (np.array(pretrained_test_prediction) == df_squad_test["answer"].str.lower().values ).mean() )

100%|██████████| 17365/17365 [04:53<00:00, 59.21it/s]


0.008407716671465592


In [19]:
tuned_test_predictions = get_bert_prediction(
    df_squad_test["question"], df_squad_test["context_paragraph"],
    device, tuned_model
)
print( (np.array(tuned_test_predictions) == df_squad_test["answer"].str.lower().values ).mean() )

100%|██████████| 17365/17365 [04:52<00:00, 59.46it/s]


0.5822055859487475


In [14]:
import azureml.core
from azureml.core.workspace import Workspace
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
from azureml.core.webservice import Webservice
from azureml.core.environment import Environment
from azureml.core.webservice import LocalWebservice
from azureml.core.conda_dependencies import CondaDependencies

In [None]:
# your code here
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

In [39]:
# your code here
register_qa_finetuned_model = Model.register(
    workspace = ws, model_path = "./bert_fine_tuned_squad",
    model_name = "bert_fine_tuned",
    description = "PyTorch Finetuned BERT SQuAD model"
)

Registering model bert_fine_tuned


**Create scoring script and API endpoint**


```py
{
    "questions" : [
        "How many parameters does BERT-large have?",
        "When did Beyonce start becoming popular?"
    ],

    "context_paragraphs" : [
        "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance.",
        'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'
    ]
}
```
Here `questions` is a list of `N` questions and `context_paragraphs` is a list of `N` context paragraphs. For each pair of question and context paragraph, you should tokenize and input them to the fine-tuned Bert model, and return a dictionary with the following format:
```py
{
    'predicted_ans' : ['340', 'late 1990s']
}
```
where the key `predicted_ans` maps to a list of `N` strings, with each string being the predicted answer for one input pair of question and context paragraph.

In [40]:
%%writefile score.py
import os, json, pickle
import numpy as np
import torch
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import BertForQuestionAnswering
from azureml.core.model import Model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def init():
    """
    Load the fine-tuned Bert model from file and store it in a global variable.
    Also initialize the pre-trained tokenizer.
    """
    model_path = Model.get_model_path("bert_fine_tuned")
    
    global qa_model, tokenizer
    qa_model = BertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')

def run(input_data):
    """
    Convert the input data from string to JSON, extract the questions and contexts,
    then perform inference with Bert and return the specified JSOn response
    """
    data = json.loads(input_data)
    questions = data["questions"]
    contexts = data["context_paragraphs"]

    return {
        'predicted_ans' : get_bert_prediction(questions, contexts, device, qa_model, tokenizer)
    }

def get_bert_prediction(questions, contexts, device, model=None, tokenizer=None):
    if model is None:
        model = BertForQuestionAnswering.from_pretrained('rsvp-ai/bertserini-bert-base-squad')
    if tokenizer is None:
        tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')

    model.to(device)
    model.eval()

    outputs = []

    for question, context in tqdm(zip(questions, contexts), total=len(questions)):

        encoded_seq = tokenizer(question, context, padding="longest", truncation=True, max_length=512)

        tokens = tokenizer.convert_ids_to_tokens(encoded_seq["input_ids"])

        input_ids = torch.LongTensor([encoded_seq["input_ids"]]).to(device)
        token_type_ids = torch.LongTensor([encoded_seq["token_type_ids"]]).to(device)
        attention_mask = torch.FloatTensor([encoded_seq["attention_mask"]]).to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)
        logits_start, logits_end = output['start_logits'], output['end_logits']
        token_start = torch.argmax(logits_start)
        token_end = torch.argmax(logits_end)

        outputs.append(tokenizer.convert_tokens_to_string(tokens[token_start:token_end]))
    return outputs

Overwriting score.py


In [41]:
!cat score.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
import os, json, pickle
import numpy as np
import torch
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import BertForQuestionAnswering
from azureml.core.model import Model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def init():
    """
    Load the fine-tuned Bert model from file and store it in a global variable.
    Also initialize the pre-trained tokenizer.
    """
    model_path = Model.get_model_path("bert_fine_tuned")
    
    global qa_model, tokenizer
    qa_model = BertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = BertTokenizerFast.from_pretrained('rsvp-ai/bertserini-bert-base-squad')

def run(inp

In [42]:
# your code here
environment_file = CondaDependencies.create(pip_packages=[
    'azureml-defaults', 'torch==2.0.1', 'transformers==4.25.0', 'numpy'
])

with open("myenv.yml","w") as f:
    f.write(environment_file.serialize_to_string())

print(environment_file.serialize_to_string())

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.8 and later.
- python=3.8.13

- pip:
  - azureml-defaults~=1.52.0
  - torch==2.0.1
  - transformers==4.25.0
  - numpy
channels:
- anaconda
- conda-forge



In [43]:
# your code here
# create environment variable
myenv = Environment.from_conda_specification(name = "myenv", file_path = "myenv.yml")
myenv.register(workspace = ws)

# provide the scoring script and environment variable to InferenceConfig
inference_config = InferenceConfig(source_directory = '.', entry_script = "score.py", environment = myenv)

# set deployment destination to localhost:8891
local_deployment_config = LocalWebservice.deploy_configuration(port = 8891)

# create a service instance
pytorch_local_service = Model.deploy(
    ws, "bertqa-local-service",
    [register_qa_finetuned_model], inference_config, 
    local_deployment_config
)
pytorch_local_service.wait_for_deployment(True)

To leverage new model deployment capabilities, AzureML recommends using CLI/SDK v2 to deploy models as online endpoint, 
please refer to respective documentations 
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints /
https://docs.microsoft.com/azure/machine-learning/how-to-attach-kubernetes-anywhere 
For more information on migration, see https://aka.ms/acimoemigration 
  pytorch_local_service = Model.deploy(


Downloading model bert_fine_tuned:3 to /tmp/azureml_9atv0043/bert_fine_tuned/3
Generating Docker build context.
Package creation Succeeded
Logging into Docker registry 3a4afbcfcde3410a8c01b0cb49834b91.azurecr.io
Logging into Docker registry 3a4afbcfcde3410a8c01b0cb49834b91.azurecr.io
Building Docker image from Dockerfile...
Step 1/5 : FROM 3a4afbcfcde3410a8c01b0cb49834b91.azurecr.io/azureml/azureml_dd46aeee7ee79528c366831c872e31d2
 ---> 5146892678c5
Step 2/5 : COPY azureml-app /var/azureml-app
 ---> 06c6a7528a24
Step 3/5 : RUN mkdir -p '/var/azureml-app' && echo eyJhY2NvdW50Q29udGV4dCI6eyJzdWJzY3JpcHRpb25JZCI6Ijk5ODI5MjZkLTRhZjMtNDZkMC1hMmE3LThlMzRiNjNiNTRjNyIsInJlc291cmNlR3JvdXBOYW1lIjoicDdfY211X2ZjZHMiLCJhY2NvdW50TmFtZSI6InByb2plY3Q3Iiwid29ya3NwYWNlSWQiOiIzYTRhZmJjZi1jZGUzLTQxMGEtOGMwMS1iMGNiNDk4MzRiOTEifSwibW9kZWxzIjp7fSwibW9kZWxzSW5mbyI6e319 | base64 --decode > /var/azureml-app/model_config_map.json
 ---> Running in 165505ee9be7
 ---> a683280efe7b
Step 4/5 : RUN mv '/var/azureml-ap

In [45]:
input_json = json.dumps({
    "questions" : [
        "How many parameters does BERT-large have?",
        "When did Beyonce start becoming popular?"
    ],

    "context_paragraphs" : [
        "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance.",
        'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'
    ]
})

input_json = bytes(input_json, encoding = "utf8")
output = pytorch_local_service.run(input_json)
print(output)

{'predicted_ans': ['340m', 'late 1990s']}


**Deploy to ACI container**

In [46]:
# your code here
aci_deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=3.8, memory_gb=15, description='Public endpoint for PyTorch BERT qa model',
    tags={'data' : 'question and context for QA', 'method' : 'bert', 'framework' : 'pytorch'},
)

# create a new service instance
pytorch_aci_service = Model.deploy(
    ws, "pytorch-aci-service",
    [register_qa_finetuned_model], inference_config, 
    aci_deployment_config
)

pytorch_aci_service.wait_for_deployment(True)

To leverage new model deployment capabilities, AzureML recommends using CLI/SDK v2 to deploy models as online endpoint, 
please refer to respective documentations 
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints /
https://docs.microsoft.com/azure/machine-learning/how-to-attach-kubernetes-anywhere 
For more information on migration, see https://aka.ms/acimoemigration 
  pytorch_aci_service = Model.deploy(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2023-08-05 21:39:25+00:00 Creating Container Registry if not exists.
2023-08-05 21:39:25+00:00 Registering the environment.
2023-08-05 21:39:25+00:00 Use the existing image.
2023-08-05 21:39:25+00:00 Generating deployment configuration.
2023-08-05 21:39:26+00:00 Submitting deployment to compute.
2023-08-05 21:39:31+00:00 Checking the status of deployment pytorch-aci-service..
2023-08-05 21:42:47+00:00 Checking the status of inference endpoint pytorch-aci-service.
Succeeded
ACI service creation operation finished, operation 

**Test the public service**

In [47]:
deployed_uri = pytorch_aci_service.scoring_uri
print(deployed_uri)

input_json = json.dumps({
    "questions" : [
        "How many parameters does BERT-large have?",
        "When did Beyonce start becoming popular?"
    ],

    "context_paragraphs" : [
        "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance.",
        'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'
    ]
})

response = requests.post(deployed_uri, input_json, headers = {'Content-Type' : 'application/json'})
print(response.status_code)
print(json.loads(response.content))

http://166144fe-28e0-41f8-ab08-2a329a967d92.eastus.azurecontainer.io/score
200
{'predicted_ans': ['340m', 'late 1990s']}


### Deploying Bert model

In [48]:
def test_deploy_bert():
    df_test_deploy = df_squad.sample(100, random_state = 100)
    input_json = json.dumps({
        "questions" : df_test_deploy["question"].tolist(),
        "context_paragraphs" : df_test_deploy["context_paragraph"].tolist()
    })
    response = requests.post(deployed_uri, input_json, headers = {'Content-Type' : 'application/json'})
    check_equal(response.status_code, 200)

    predicted_ans = np.array(json.loads(response.content)["predicted_ans"])
    ans = df_test_deploy["answer"].str.lower().values
    accuracy = (predicted_ans == ans).mean()
    assert accuracy >= 0.3, accuracy
    print("All tests passed!")

test_deploy_bert()

All tests passed!


In [49]:
with open('scoring_uri.txt', 'w') as f:
    f.write(deployed_uri)
    print("Saved scoring_uri to file!")

Saved scoring_uri to file!
