Source: https://github.com/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb

In [1]:
import transformers

In [2]:
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = False
# Name of the pretrained checkpoint; both the tokenizer and the model below are loaded from it.
model_checkpoint = "distilbert-base-uncased"
# NOTE(review): not used in the cells visible here -- presumably the training batch size; confirm.
batch_size = 16

In [3]:
from datasets import load_dataset, load_metric

In [4]:
# Load "squad_v2" when impossible answers are enabled, plain "squad" otherwise.
# The result is a DatasetDict holding the "train" and "validation" splits.
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

Reusing dataset squad (/Users/claudiufilip/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 281.97it/s]


In [5]:
# Display the available splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [6]:
# Look at a single training example to see the fields the preprocessing has to handle.
datasets["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

# Changes in my own dataset
1. We will need an id, a title, a context, a question and an answer.
The answer is given as its text together with its starting character position in the context (`answer_start`).

In [7]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_examples(dataset, num_examples=10):
    """Render a random sample of distinct rows of ``dataset`` as an HTML table.

    Columns whose feature type is a ``ClassLabel`` (or a ``Sequence`` of
    ``ClassLabel``) are displayed with their label names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and exposing a ``features`` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        ValueError: If ``num_examples`` exceeds the number of rows (the previous
            rejection-sampling loop would never terminate in that case), or if
            it is negative (raised by ``random.sample``).
    """
    if num_examples > len(dataset):
        raise ValueError("Can't pick more elements than there are in the dataset.")

    # Sampling without replacement yields distinct row indices in one call,
    # so no retry loop is needed to avoid duplicates.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids by their human-readable names.
            names = typ.names
            df[column] = df[column].transform(lambda i: names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same conversion, applied to every id of a list-valued column.
            names = typ.feature.names
            df[column] = df[column].transform(
                lambda x: [names[i] for i in x]
            )
    display(HTML(df.to_html()))

In [8]:
# Eyeball ten random training rows (the default sample size) to get a feel for the data.
show_random_examples(datasets['train'])

Unnamed: 0,id,title,context,question,answers
0,5734346a4776f41900661a70,"Tucson,_Arizona","At the University of Arizona, where records have been kept since 1894, the record maximum temperature was 115 °F (46 °C) on June 19, 1960, and July 28, 1995, and the record minimum temperature was 6 °F (−14 °C) on January 7, 1913. There are an average of 150.1 days annually with highs of 90 °F (32 °C) or higher and an average of 26.4 days with lows reaching or below the freezing mark. Average annual precipitation is 11.15 in (283 mm). There is an average of 49 days with measurable precipitation. The wettest year was 1905 with 24.17 in (614 mm) and the driest year was 1924 with 5.07 in (129 mm). The most precipitation in one month was 7.56 in (192 mm) in July 1984. The most precipitation in 24 hours was 4.16 in (106 mm) on October 1, 1983. Annual snowfall averages 0.7 in (1.8 cm). The most snow in one year was 7.2 in (18 cm) in 1987. The most snow in one month was 6.0 in (15 cm) in January 1898 and March 1922.",When was Tucson's record low?,"{'text': ['January 7, 1913'], 'answer_start': [214]}"
1,56e4c33039bdeb14003479bf,Architecture,"Architects such as Mies van der Rohe, Philip Johnson and Marcel Breuer worked to create beauty based on the inherent qualities of building materials and modern construction techniques, trading traditional historic forms for simplified geometric forms, celebrating the new means and methods made possible by the Industrial Revolution, including steel-frame construction, which gave birth to high-rise superstructures. By mid-century, Modernism had morphed into the International Style, an aesthetic epitomized in many ways by the Twin Towers of New York's World Trade Center designed by Minoru Yamasaki.",What is an example of International Style?,"{'text': ['Twin Towers'], 'answer_start': [529]}"
2,56ceeb94aab44d1400b88cb4,New_York_City,"The city and surrounding area suffered the bulk of the economic damage and largest loss of human life in the aftermath of the September 11, 2001 attacks when 10 of the 19 terrorists associated with Al-Qaeda piloted American Airlines Flight 11 into the North Tower of the World Trade Center and United Airlines Flight 175 into the South Tower of the World Trade Center, and later destroyed them, killing 2,192 civilians, 343 firefighters, and 71 law enforcement officers who were in the towers and in the surrounding area. The rebuilding of the area, has created a new One World Trade Center, and a 9/11 memorial and museum along with other new buildings and infrastructure. The World Trade Center PATH station, which opened on July 19, 1909 as the Hudson Terminal, was also destroyed in the attack. A temporary station was built and opened on November 23, 2003. A permanent station, the World Trade Center Transportation Hub, is currently under construction. The new One World Trade Center is the tallest skyscraper in the Western Hemisphere and the fourth-tallest building in the world by pinnacle height, with its spire reaching a symbolic 1,776 feet (541.3 m) in reference to the year of American independence.",On what date did the World Trade Center PATH begin operation?,"{'text': ['July 19, 1909'], 'answer_start': [727]}"
3,570e4f8e0b85d914000d7df9,Melbourne,"An influx of interstate and overseas migrants, particularly Irish, German and Chinese, saw the development of slums including a temporary ""tent city"" established on the southern banks of the Yarra. Chinese migrants founded the Melbourne Chinatown in 1851, which remains the longest continuous Chinese settlement in the Western World. In the aftermath of the Eureka Stockade, mass public support for the plight of the miners resulted in major political changes to the colony, including changes to working conditions across local industries including mining, agriculture and manufacturing. The nationalities involved in the Eureka revolt and Burke and Wills expedition gave an indication of immigration flows in the second half of the nineteenth century.",What term is given to the slums established on the souther banks of the Yarra?,"{'text': ['tent city'], 'answer_start': [139]}"
4,57268127f1498d1400e8e206,Queen_(band),"In late 1975, Queen recorded and released A Night at the Opera, taking its name from the popular Marx Brothers movie. At the time, it was the most expensive album ever produced. Like its predecessor, the album features diverse musical styles and experimentation with stereo sound. In ""The Prophet's Song"", an eight-minute epic, the middle section is a canon, with simple phrases layered to create a full-choral sound. The Mercury penned ballad, ""Love of My Life"", featured a harp and overdubbed vocal harmonies. The album was very successful in Britain, and went triple platinum in the United States. The British public voted it the 13th greatest album of all time in a 2004 Channel 4 poll. It has also ranked highly in international polls; in a worldwide Guinness poll, it was voted the 19th greatest of all time, while an ABC poll saw the Australian public vote it the 28th greatest of all time. A Night at the Opera has frequently appeared in ""greatest albums"" lists reflecting the opinions of critics. Among other accolades, it was ranked number 16 in Q Magazine's ""The 50 Best British Albums Ever"" in 2004, and number 11 in Rolling Stone's ""The 100 Greatest Albums of All Time"" as featured in their Mexican edition in 2004. It was also placed at No. 230 on Rolling Stone magazine's list of ""The 500 Greatest Albums of All Time"" in 2003. A Night at the Opera is the third and final Queen album to be featured in the book 1001 Albums You Must Hear Before You Die.",How long was Queen's The Prophet's Song?,"{'text': ['eight-minute'], 'answer_start': [309]}"
5,57273a5e708984140094db17,Affirmative_action_in_the_United_States,"Ricci v. DeStefano was heard by the United States Supreme Court in 2009. The case concerns White and Hispanic firefighters in New Haven, Connecticut, who upon passing their test for promotions to management were denied the promotions, allegedly because of a discriminatory or at least questionable test. The test gave 17 whites and two Hispanics the possibility of immediate promotion. Although 23% of those taking the test were African American, none scored high enough to qualify. Because of the possibility the tests were biased in violation of Title VII of the Civil Rights Act, no candidates were promoted pending outcome of the controversy. In a split 5-4 vote, the Supreme Court ruled that New Haven had engaged in impermissible racial discrimination against the White and Hispanic majority.",How many whites were given the possibility of immediate promotion as a result of taking the controversial test?,"{'text': ['17'], 'answer_start': [318]}"
6,57279800708984140094e1bc,Textual_criticism,"Whereas Greg had limited his illustrative examples to English Renaissance drama, where his expertise lay, Bowers argued that the rationale was ""the most workable editorial principle yet contrived to produce a critical text that is authoritative in the maximum of its details whether the author be Shakespeare, Dryden, Fielding, Nathaniel Hawthorne, or Stephen Crane. The principle is sound without regard for the literary period."" For works where an author's manuscript survived – a case Greg had not considered – Bowers concluded that the manuscript should generally serve as copy-text. Citing the example of Nathaniel Hawthorne, he noted:",What is the main criticism of Greg's work?,"{'text': ['works where an author's manuscript survived'], 'answer_start': [435]}"
7,56daebe7e7c41114004b4b15,American_Idol,"American Idol is an American singing competition series created by Simon Fuller and produced by 19 Entertainment, and is distributed by FremantleMedia North America. It began airing on Fox on June 11, 2002, as an addition to the Idols format based on the British series Pop Idol and has since become one of the most successful shows in the history of American television. The concept of the series is to find new solo recording artists, with the winner being determined by the viewers in America. Winners chosen by viewers through telephone, Internet, and SMS text voting were Kelly Clarkson, Ruben Studdard, Fantasia Barrino, Carrie Underwood, Taylor Hicks, Jordin Sparks, David Cook, Kris Allen, Lee DeWyze, Scotty McCreery, Phillip Phillips, Candice Glover, Caleb Johnson, and Nick Fradiani.",Who created American Idol?,"{'text': ['Simon Fuller'], 'answer_start': [67]}"
8,57277db4dd62a815002e9e96,Carnival,"Other regions host festivities of smaller extent, focused on the reenactment of traditional carnevalic customs, such as Tyrnavos (Thessaly), Kozani (West Macedonia), Rethymno (Crete) and in Xanthi (East Macedonia and Thrace). Tyrnavos holds an annual Phallus festival, a traditional ""phallkloric"" event in which giant, gaudily painted effigies of phalluses made of papier maché are paraded, and which women are asked to touch or kiss. Their reward for so doing is a shot of the famous local tsipouro alcohol spirit. Every year, from 1 to 8 January, mostly in regions of Western Macedonia, Carnival fiestas and festivals erupt. The best known is the Kastorian Carnival or ""Ragoutsaria"" (Gr. ""Ραγκουτσάρια"") [tags: Kastoria, Kastorian Carnival, Ragoutsaria, Ραγκουτσαρια, Καστοριά]. It takes place from 6 to 8 January with mass participation serenaded by brass bands, pipises, Macedonian and grand casa drums. It is an ancient celebration of nature's rebirth (fiestas for Dionysus (Dionysia) and Kronos (Saturnalia)), which ends the third day in a dance in the medieval square Ntoltso where the bands play at the same time.",In what square do the bands all play at once?,"{'text': ['Ntoltso'], 'answer_start': [1075]}"
9,56e7a08700c9c71400d77413,Arena_Football_League,"Following the suspension of the AFL's 2009 season, league officials and owners of af2 (which had played its season as scheduled) began discussing the future of arena football and the two leagues. With its 50.1 percent ownership of af2, the AFL's bankruptcy and dissolution prompted the dissolution of af2 as well. That league was formally considered disbanded on September 8, 2009, when no owner committed his or her team to the league's eleventh season by that deadline. For legal reasons, af2 league officials and owners agreed to form a new legal entity, Arena Football 1 (AF1), with former AFL teams the Arizona Rattlers and Orlando Predators joining the former af2.",What percent of af2 was owned by the AFL?,"{'text': ['50.1'], 'answer_start': [205]}"


# Preprocessing

This is kind of bad news because it means we can't really use our own dataset with this as-is. But we'll cross that bridge when we get there.

## Tokenizer

We need a specific tokenizer that matches our model's architecture.

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint as the model so that tokenization matches it.
# NOTE(review): the original author marked this line as "this will change" -- presumably when
# switching to another checkpoint; confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # tied to `model_checkpoint`

In [10]:
import transformers

# The preprocessing below asks the tokenizer for offset mappings and sequence ids;
# fail early here if the loaded tokenizer is not a "fast" one.
# NOTE(review): assumes those features are only available on PreTrainedTokenizerFast -- confirm.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [11]:
# Passing two texts encodes them together as one pair (a single input_ids list, see the output below).
tokenizer("What is your name?", "My name is Sylvain.")

{'input_ids': [101, 2054, 2003, 2115, 2171, 1029, 102, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
max_length = 384  # The maximum length of a feature (question and context)
doc_stride = 128  # The allowed overlap between two parts of the context when splitting is performed.

In [13]:
# True when the tokenizer pads on the right. prepare_train_features uses this flag to decide
# whether the question or the context is passed first, and which of the two gets truncated.
pad_on_right = tokenizer.padding_side == "right"

In [14]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Each (question, context) pair is tokenized with truncation applied to the
    context only; a context that does not fit in ``max_length`` tokens is split
    into several overlapping features (overlap controlled by ``doc_stride``).

    Args:
        examples: Batch in column form, with list-valued ``"question"``,
            ``"context"`` and ``"answers"`` entries. Each answers item is a dict
            with ``"text"`` and ``"answer_start"`` lists; only the first answer
            is used. The batch itself is not modified.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` and
        ``"offset_mapping"`` removed and ``"start_positions"`` /
        ``"end_positions"`` added (one token index per feature). Features whose
        example has no answer, or whose context window does not fully contain
        the answer, are labeled with the index of the CLS token.

    Relies on the notebook globals ``tokenizer``, ``pad_on_right``,
    ``max_length`` and ``doc_stride``.
    """
    # Sequence id of the context inside each tokenized pair: the context is the
    # second text when padding on the right, the first one otherwise.
    context_id = 1 if pad_on_right else 0

    # Leading whitespace in a question carries no information but still eats
    # into the token budget shared with the context (the upstream Hugging Face
    # example strips it for this reason). Build a stripped copy instead of
    # mutating the caller's batch.
    questions = [question.lstrip() for question in examples["question"]]
    contexts = examples["context"]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Start/end character index of the answer in the text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token index of the context window inside this feature.
        context_start = 0
        while sequence_ids[context_start] != context_id:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != context_id:
            context_end -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Move to the last context token that starts at or before the answer start.
        # The scan must stop at the end of the context window: tokens after it
        # (separator, padding) can report offsets that also compare <= start_char,
        # which previously dragged the start label past the end label whenever the
        # answer began in the last context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)

        # Symmetrically, move to the first context token that ends at or after the
        # answer end, never scanning below the start of the context window.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [15]:
# Smoke-test the preprocessing on the first five training examples before mapping it over everything.
features = prepare_train_features(datasets["train"][:5])

In [16]:
# Run the preprocessing over the dataset in batches. The original columns are removed because
# prepare_train_features can return more rows (features) than it receives (examples), so only
# the tokenizer outputs and the start/end labels are kept.
tokenized_datasets = datasets.map(
    prepare_train_features, batched=True, remove_columns=datasets["train"].column_names
)

Loading cached processed dataset at /Users/claudiufilip/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-54ef4baac8f990c5.arrow
Loading cached processed dataset at /Users/claudiufilip/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-5e93a0902a7408b5.arrow


In [17]:
from transformers import TFAutoModelForQuestionAnswering

# Load the TensorFlow question-answering model from the same checkpoint as the tokenizer.
# NOTE(review): the recorded output of this cell is a RuntimeError
# ("module 'keras.engine.base_layer' has no attribute 'BaseRandomLayer'"), which looks like
# mismatched TensorFlow / Keras versions in the environment -- align them and re-run; confirm.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

RuntimeError: Failed to import transformers.models.distilbert.modeling_tf_distilbert because of the following error (look up to see its traceback):
module 'keras.engine.base_layer' has no attribute 'BaseRandomLayer'