In [1]:
import pandas as pd

### Explore:

In [5]:
train_df = pd.read_csv("data/raw/train.csv")

In [6]:
train_df.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,15661,"""The money the Clinton Foundation took from fr...","April 26, 2015","""Gingrich said the Clinton Foundation """"took m...",Katie Sanders,"""Hillary Clinton is in the political crosshair...",https://www.wsj.com/articles/clinton-foundatio...,0,"Foreign Policy, PunditFact, Newt Gingrich,"
1,9893,Annual Mammograms May Have More False-Positives,"October 18, 2011",This article reports on the results of a study...,,While the financial costs of screening mammogr...,,1,"Screening,WebMD,women's health"
2,11358,SBRT Offers Prostate Cancer Patients High Canc...,"September 28, 2016",This news release describes five-year outcomes...,"Mary Chris Jaklevic,Steven J. Atlas, MD, MPH,K...",The news release quotes lead researcher Robert...,https://www.healthnewsreview.org/wp-content/up...,1,"Association/Society news release,Cancer"
3,10166,"Study: Vaccine for Breast, Ovarian Cancer Has ...","November 8, 2011","While the story does many things well, the ove...",,"The story does discuss costs, but the framing ...",http://clinicaltrials.gov/ct2/results?term=can...,2,"Cancer,WebMD,women's health"
4,11276,Some appendicitis cases may not require ’emerg...,"September 20, 2010",We really don’t understand why only a handful ...,,"""Although the story didn’t cite the cost of ap...",,2,


In [None]:
import os
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer


def preprocess_dataset(input_dir: str, output_dir: str, model_name: str, max_length: int) -> None:

    print(f"Loading raw dataset from {input_dir}.")
    dataset = DatasetDict({
        "train": load_dataset("csv", data_files=os.path.join(input_dir, "train.csv"))["train"],
        "validation": load_dataset("csv", data_files=os.path.join(input_dir, "validation.csv"))["train"],
        "test": load_dataset("csv", data_files=os.path.join(input_dir, "test.csv"))["train"]
    })

    print(f"Initializing tokenizer for {model_name}.")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    
    def preprocess_function(examples):
        inputs = [
            (claim if claim else "") + " " + (explanation if explanation else "")
            for claim, explanation in zip(examples["claim"], examples["explanation"])
        ]
        return tokenizer(inputs, padding="max_length", truncation=True, max_length=max_length)


    print("Tokenizing dataset")
    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    print("Adding labels to tokenized dataset")
    def add_labels(examples):
        return {"labels": examples["label"]}

    tokenized_dataset = tokenized_dataset.map(add_labels, batched=True)

    print("Formatting dataset")
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    os.makedirs(output_dir, exist_ok=True)

    print(f"Saving processed features to {output_dir}")
    tokenized_dataset.save_to_disk(output_dir)
    print(f"Features saved successfully in {output_dir}.")


if __name__ == "__main__":
    INPUT_DIR = "./data/raw"
    OUTPUT_DIR = "./data/features"
    MODEL_NAME = "nbroad/bigbird-base-health-fact"
    MAX_LENGTH = 512

    preprocess_dataset(INPUT_DIR, OUTPUT_DIR, MODEL_NAME, MAX_LENGTH)


Loading raw dataset from ./data/raw...
Initializing tokenizer for nbroad/bigbird-base-health-fact...
Tokenizing dataset...


Map: 100%|██████████| 9832/9832 [00:01<00:00, 7898.11 examples/s]
Map: 100%|██████████| 1225/1225 [00:00<00:00, 7738.43 examples/s]
Map: 100%|██████████| 1235/1235 [00:00<00:00, 8040.42 examples/s]


Adding labels to tokenized dataset...


Map: 100%|██████████| 9832/9832 [00:00<00:00, 276552.46 examples/s]
Map: 100%|██████████| 1225/1225 [00:00<00:00, 134232.63 examples/s]
Map: 100%|██████████| 1235/1235 [00:00<00:00, 197603.02 examples/s]


Formatting dataset for PyTorch...
Saving processed features to ./data/features...


Saving the dataset (1/1 shards): 100%|██████████| 9832/9832 [00:00<00:00, 408309.05 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1225/1225 [00:00<00:00, 244714.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1235/1235 [00:00<00:00, 269173.01 examples/s]

Features saved successfully in ./data/features.





In [9]:
import os
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

INPUT_DIR = "./../data/raw"
input_dir = INPUT_DIR
print(f"Loading raw dataset from {input_dir}...")
dataset = DatasetDict({
    "train": load_dataset("csv", data_files=os.path.join(input_dir, "train.csv"))["train"],
    "validation": load_dataset("csv", data_files=os.path.join(input_dir, "validation.csv"))["validation"],
    "test": load_dataset("csv", data_files=os.path.join(input_dir, "test.csv"))["test"]
})

Loading raw dataset from ./../data/raw...


KeyError: 'validation'

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 9832
    })
    validation: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1225
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1235
    })
})