In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The FEVER claims and the Wikipedia dump are two configs of the same Hub repo.
fever_repo = "EleutherAI/fever"
v1 = load_dataset(fever_repo, name="v1.0")
wikipages = load_dataset(fever_repo, name="wiki_pages", split="wikipedia_pages")
v1

Found cached dataset fever (/mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6)
100%|██████████| 4/4 [00:00<00:00, 52.82it/s]
Found cached dataset fever (/mnt/ssd-2/hf_cache/EleutherAI___fever/wiki_pages/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6)


DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id'],
        num_rows: 263822
    })
    dev: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id'],
        num_rows: 28625
    })
    paper_dev: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id'],
        num_rows: 14475
    })
    paper_test: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id'],
        num_rows: 14150
    })
})

In [3]:
# Display the schema (column names) and row count of the Wikipedia pages dataset.
wikipages

Dataset({
    features: ['id', 'text', 'lines'],
    num_rows: 5416537
})

In [4]:
import pandas as pd
# Materialise the Wikipedia pages as a DataFrame indexed by page id, so a page
# can be fetched by title with `wiki_df.loc[title]`.
# `Dataset.to_pandas()` converts the underlying table in one go, rather than
# letting `pd.DataFrame(...)` iterate over the dataset row by row; chaining
# `set_index` (instead of `inplace=True`) keeps the cell free of in-place mutation.
subwiki = wikipages
wiki_df = subwiki.to_pandas().set_index('id')
wiki_df

Unnamed: 0_level_0,text,lines
id,Unnamed: 1_level_1,Unnamed: 2_level_1
,,
1928_in_association_football,The following are the football -LRB- soccer -R...,0\tThe following are the football -LRB- soccer...
1986_NBA_Finals,The 1986 NBA Finals was the championship round...,0\tThe 1986 NBA Finals was the championship ro...
1901_Villanova_Wildcats_football_team,The 1901 Villanova Wildcats football team repr...,0\tThe 1901 Villanova Wildcats football team r...
1992_Northwestern_Wildcats_football_team,The 1992 Northwestern Wildcats team represente...,0\tThe 1992 Northwestern Wildcats team represe...
...,...,...
Yuto_Agarie,Yuto Agarie -LRB- born 6 July 1993 -RRB- is a ...,0\tYuto Agarie -LRB- born 6 July 1993 -RRB- is...
Yume_1_Go,is the eleventh single by the Japanese rock ba...,0\tis the eleventh single by the Japanese rock...
Yada_Yada_-LRB-album-RRB-,Yada Yada is the eighth studio album by Dutch ...,0\tYada Yada is the eighth studio album by Dut...
Xylorycta_bipunctella,Xylorycta bipunctella is a moth in the Xyloryc...,0\tXylorycta bipunctella is a moth in the Xylo...


In [5]:
# join the fever data with the wiki data on the v1.evidence_wiki_url == wikipages.id using Dataset.map
import re
from datasets import DatasetDict

def convert_text(text):
    """Clean one tokenised wiki sentence into plain, readable text.

    Steps, in order:
      1. Keep only the part before the first tab (the sentence itself); any
         tab-separated trailing fields are dropped.
      2. Replace the bracket placeholders -LRB-/-RRB-/-LSB-/-RSB- with ( ) [ ].
      3. Turn the two-character quote tokens into a plain double quote.
      4. Remove the whitespace that tokenisation put before closing
         punctuation and after opening brackets, '$' and '-'.

    Args:
        text: A sentence, optionally followed by tab-separated extra fields.

    Returns:
        The cleaned sentence as a string.
    """
    # Split on the tab FIRST. The regexes below use `\s+`, which also matches
    # tabs; running them before the split could swallow the separator (e.g. when
    # the sentence ends in '-' or the first trailing field starts with "'")
    # and glue the trailing field onto the sentence.
    text = text.split('\t')[0]

    # Replace bracket placeholders with the real bracket characters
    text = text.replace('-LRB-', '(').replace('-RRB-', ')').replace('-LSB-', '[').replace('-RSB-', ']')

    # Opening / closing quote tokens -> plain double quotes
    text = re.sub(r"`` ", r'"', text)
    text = re.sub(r" ''", r'"', text)

    # Remove spaces before closing punctuation and after opening punctuation
    text = re.sub(r"\s+([-%)\],.!?;:'])", r'\1', text)
    text = re.sub(r"([\[($-])\s+", r'\1', text)

    return text

# Example usage: a sentence followed by tab-separated trailing fields.
# convert_text should print only the cleaned sentence (see the cell output).
text_to_convert = "Vincent Cassel -LRB- -LSB- vɛ̃sɑ̃ kasɛl -RSB- ; born Vincent Crochon , 23 November 1966 -RRB- is a French actor best known to English-speaking audiences through his film performances in Ocean 's Twelve and Ocean 's Thirteen , as well as Black Swan .	French	France	Black Swan	Black Swan (film)"
converted_text = convert_text(text_to_convert)
print(converted_text)

def get_evidence(example):
    """Look up the evidence sentence and the full page text for one example.

    Reads ``example["evidence_wiki_url"]`` (page title) and
    ``example["evidence_sentence_id"]``, fetches the page from the
    module-level ``wiki_df`` and cleans every line with ``convert_text``.

    Args:
        example: A mapping with the two keys named above.

    Returns:
        A dict with
          * ``ev_sentence``: the cleaned sentence whose line id equals the
            requested sentence id, or ``None`` if there is no such line;
          * ``ev_paragraph``: all cleaned sentences of the page joined by
            single spaces.
        Both values are ``None`` when the page title is not in ``wiki_df``.
    """
    ev_title = example["evidence_wiki_url"]
    sent_id = example["evidence_sentence_id"]
    target_id = str(sent_id)

    try:
        ev = wiki_df.loc[ev_title]
    except KeyError:
        print(f"Wiki page {ev_title} not found")
        return {"ev_sentence": None, "ev_paragraph": None}

    sents = []
    target_sent = None
    for line in ev["lines"].split("\n"):
        # Each line looks like "<line id>\t<sentence>[\t<extra fields>...]".
        # Split off the line's OWN id instead of slicing by the length of the
        # requested id and matching with startswith(): that approach left
        # stray digits in the text of multi-digit lines and let id 1 match
        # lines 10, 11, ... as well.
        line_id, _, rest = line.partition("\t")
        sent = convert_text(rest.strip())
        sents.append(sent)
        if line_id.strip() == target_id:
            target_sent = sent

    if target_sent is None:
        print(f"No sentence found for {ev_title} ({sent_id})")

    paragraph = " ".join(sents)
    return {"ev_sentence": target_sent, "ev_paragraph": paragraph}

# Row caps per output split, and which source split of `v1` feeds each one.
n_train, n_val, n_test = 50_000, 10_000, 14_150
split_plan = {
    "train": ("train", n_train),
    "validation": ("dev", n_val),
    "test": ("paper_test", n_test),
}
subv1 = DatasetDict({
    target: v1[source].select(range(limit))
    for target, (source, limit) in split_plan.items()
})

# Attach `ev_sentence` / `ev_paragraph` to every example, one row at a time.
mappedv1 = subv1.map(
    get_evidence,
    batched=False,
    num_proc=4,
    desc="Getting evidence sentences",
)
mappedv1

Vincent Cassel ([vɛ̃sɑ̃ kasɛl]; born Vincent Crochon, 23 November 1966) is a French actor best known to English-speaking audiences through his film performances in Ocean's Twelve and Ocean's Thirteen, as well as Black Swan.


Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-49e24f79768154dc_*_of_00004.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-e7a5f6d29a509f1c_*_of_00004.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-473bf5c97fb581d2_*_of_00004.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id', 'ev_sentence', 'ev_paragraph'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id', 'ev_sentence', 'ev_paragraph'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'label', 'claim', 'evidence_annotation_id', 'evidence_id', 'evidence_wiki_url', 'evidence_sentence_id', 'ev_sentence', 'ev_paragraph'],
        num_rows: 14150
    })
})

In [6]:
# List the training claims that do not end with a period.
[claim for claim in mappedv1["train"]["claim"] if not claim.endswith(".")]

['A block is also called a "swat."',
 "Frank Zappa's debut was Freak Out!",
 'Audi used to have the pet "Truth in Engineering."',
 'Imagine Dragons was named the "Breakthrough Band of 2014."',
 'Michael Schumacher is a professional writer',
 'Cloud Atlas has been described as "a sort of pointillist mosaic."',
 'George Best has been described as "the worst player to ever pull on the green shirt of Wales."',
 'President of the United States Ronald Reagan publicly describe the Soviet Union as an "evil empire."',
 'Los Angeles is nicknamed the "City of Angels."',
 'Meek Mill sings the track "Young & Gettin\' It."',
 'A block is also called a "bushed."',
 'Bali is an administrative division within Indonesia',
 'Bali is an administrative division within Indonesia',
 'Bali is an administrative division within Indonesia',
 'Bali is an administrative division within Indonesia',
 'Bruno Mars leads "The Hooligans."',
 'Rolling Stone book described Led Zeppelin as "the heaviest band of all time."'

In [7]:
def remove_last_punc(text):
    """Cut a trailing sentence terminator, and anything after it, off ``text``.

    Finds the last '.', '?' or '!' in ``text``. If that mark is within the
    last three characters, the text is truncated just before it, so a closing
    quote following the mark is removed as well. Otherwise, or if the text
    contains none of these marks, ``text`` is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        The (possibly shortened) string.
    """
    last_punc = max(text.rfind(mark) for mark in '.?!')
    # rfind() yields -1 when nothing is found. Without this guard, -1 passes
    # the "near the end" check for strings shorter than three characters and
    # text[:-1] would silently drop their last character ("No" -> "N").
    if last_punc == -1:
        return text
    if last_punc >= len(text) - 3:
        return text[:last_punc]
    return text

def strip_claim_periods(example):
    """Build the column update that replaces ``claim`` with its cleaned form.

    The claim is passed through ``remove_last_punc``; only the ``claim`` key
    is returned, the input mapping itself is not modified.
    """
    return {"claim": remove_last_punc(example['claim'])}

# Keep a reference to the dataset as it was before the claims are cleaned.
# NOTE: `mappedv1` is rebound below, so re-running this cell on its own makes
# `orig_mappedv1` point at the already-cleaned dataset.
orig_mappedv1 = mappedv1
mappedv1 = mappedv1.map(strip_claim_periods)

                                                                    

In [8]:
# Peek at the first three training rows (returned as a dict of column lists).
mappedv1["train"][:3]

{'id': [75397, 75397, 150448],
 'label': [1, 1, 1],
 'claim': ['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company',
  'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company',
  'Roman Atwood is a content creator'],
 'evidence_annotation_id': [92206, 92206, 174271],
 'evidence_id': [104971, 104971, 187498],
 'evidence_wiki_url': ['Nikolaj_Coster-Waldau',
  'Fox_Broadcasting_Company',
  'Roman_Atwood'],
 'evidence_sentence_id': [7, -1, 1],
 'ev_sentence': ['He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam (2008), as well as appearing as Frank Pike in the 2009 Fox television film Virtuality, originally intended as a pilot.',
  None,
  'He is best known for his vlogs, where he posts updates about his life on a daily basis.'],
 'ev_paragraph': ["Nikolaj Coster-Waldau ([neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯]; born 27 July 1970) is a Danish actor, producer and screenwriter. He graduated from Danish National School of Theatre in Copenha

In [10]:
def _has_evidence(row):
    """Truthy when the row has a non-empty evidence sentence."""
    return row["ev_sentence"]


def _evidence_not_numeric(row):
    """True unless the evidence sentence consists only of numeric characters."""
    return not row["ev_sentence"].isnumeric()


# Drop rows without an evidence sentence first, then rows whose evidence
# sentence is purely numeric (the second check needs a non-None string).
with_evidence = mappedv1.filter(_has_evidence)
mappedv1_filt = with_evidence.filter(_evidence_not_numeric)
mappedv1_filt["train"][:10]

Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-f8c5c0b34584fa86.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-bd34b5a39f743d10.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/EleutherAI___fever/v1.0/1.0.0/aaf78d795e7eb335dcf2edfd8b2eae7b6030e171263fc0205db77ccedc5ffdf6/cache-0c153fc12a547d68.arrow
                                                                       

{'id': [75397,
  150448,
  150448,
  214861,
  33078,
  33078,
  6744,
  226034,
  76253,
  188923],
 'label': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'claim': ['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company',
  'Roman Atwood is a content creator',
  'Roman Atwood is a content creator',
  'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts',
  'The Boston Celtics play their home games at TD Garden',
  'The Boston Celtics play their home games at TD Garden',
  'The Ten Commandments is an epic film',
  'Tetris has sold millions of physical copies',
  'There is a movie called The Hunger Games',
  'Ryan Gosling has been to a country in Africa'],
 'evidence_annotation_id': [92206,
  174271,
  174271,
  255136,
  49158,
  49159,
  23513,
  269479,
  93100,
  220565],
 'evidence_id': [104971,
  187498,
  187499,
  254645,
  58489,
  58490,
  28978,
  265800,
  106007,
  226318],
 'evidenc

In [11]:
# Upload every split of the filtered dataset to the Hub repository "fever".
mappedv1_filt.push_to_hub("fever")

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:02<00:00, 13.32ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  5.25it/s]
Pushing split validation to the Hub.
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 13.93ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  6.69it/s]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 13.56ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
D

# glue:mnli

In [30]:
# Load the MNLI config of GLUE and display its splits and columns.
orig_mnli = load_dataset("glue", "mnli")
orig_mnli

Found cached dataset glue (/mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 5/5 [00:00<00:00, 461.62it/s]


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [31]:
from datasets import concatenate_datasets
# remove neutral examples
# change label 0 to 1, and 2 to 0
# clean the hypothesis from ending punctuation

# "There are matched dev/test sets which are derived
# from the same sources as those in the training set, 
# and mismatched sets which do not closely resemble any seen at training time."
# TODO: we might use the union of the matched and mismatched validation sets for training to improve diversity
# mnli = DatasetDict({
#     'train': concatenate_datasets([orig_mnli['validation_matched'], orig_mnli['validation_mismatched']]),
#     'validation': concatenate_datasets([orig_mnli['validation_matched'], orig_mnli['validation_mismatched']])
# })
# Output split name -> GLUE split it is taken from.
mnli_split_sources = {
    'train': 'train',
    'validation': 'validation_matched',
    'test': 'validation_mismatched',
}
mnli = DatasetDict({
    split_name: orig_mnli[source_name]
    for split_name, source_name in mnli_split_sources.items()
})

# Drop every row labelled 1 (the neutral examples), leaving labels 0 and 2.
mnli = mnli.filter(lambda row: row['label'] != 1)

def change_label(example):
    """Recode the label in place: 0 becomes 1 and 2 becomes 0.

    Raises AssertionError for any other label. The same (mutated) mapping
    is returned.
    """
    old_label = example['label']
    assert old_label in [0, 2]
    if old_label == 0:
        example['label'] = 1
    else:
        example['label'] = 0
    return example

def clean_hypothesis(example):
    """Replace ``example['hypothesis']`` with its ``remove_last_punc`` result.

    The mapping is modified in place and returned.
    """
    cleaned = remove_last_punc(example['hypothesis'])
    example['hypothesis'] = cleaned
    return example

# Recode the labels, then strip trailing punctuation from every hypothesis.
# NOTE: `mnli` is rebound to its own transform, so this line is not re-runnable
# on its own: after one pass, rows that had label 0 carry label 1, which
# `change_label` rejects with an AssertionError.
mnli = mnli.map(change_label).map(clean_hypothesis)

Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-753a9b776e1b1473.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1f99e44ad09eb0b0.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a324d18df4cca9e7.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-0c8f9a68724aaf5a.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9c78f9abbda0a05c.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-bf205754b46a4124.arrow
    

In [36]:
# Peek at the first ten processed training rows (returned as a dict of column lists).
mnli["train"][:10]

{'premise': ['you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him',
  'One of our number will carry out your instructions minutely.',
  'How do you know? All this is their information again.',
  "my walkman broke so i'm upset now i just have to turn the stereo up real loud",
  "(Read  for Slate 's take on Jackson's findings.)",
  'Gays and lesbians.',
  "At the end of Rue des Francs-Bourgeois is what many consider to be the city's most handsome residential square, the Place des Vosges, with its stone and red brick facades.",
  'I burst through a set of cabin doors, and fell to the ground-',
  'Fun for adults and children.',
  "I don't mean to be glib about your concerns, but if I were you, I might be more concerned about the near-term rate implications of this $