# Datasets

## How to load a custom dataset


In [None]:
import html

from datasets import load_dataset, load_from_disk

# Extra keyword arguments (e.g. `sep`) are passed on to pandas.read_csv().
# `data_files` can also be a URL instead of a local path.

# csv
local_csv = load_dataset("csv", data_files="path-to-file.csv", sep=",")

# json, single file; `field` selects the key that holds the records
local_csv = load_dataset("json", data_files="path-to-file.json", field="data")

# json, multiple files (one per split)
# NOTE: `url` is not defined in this snippet; it is expected to be a base URL
# that the file names can be appended to.
data_files = {"train": f"{url}train.json", "test": f"{url}test.json"}
local_csv = load_dataset("json", data_files=data_files, field="data")

In [None]:
# Split into train and test sets (test_size=0.1).
dataset = squad.train_test_split(test_size=0.1)

# Shuffle, then pick rows by position.
indices = [0, 10, 20, 40, 15]
squad.shuffle().select(indices)

# Keep only the rows whose title starts with "L".
squad_filtered = squad.filter(lambda example: example["title"].startswith("L"))

# Flatten nested columns: `answers` is nested into `text` and `answer_start`;
# flatten() brings them out as ['answers.text', 'answers.answer_start'].
squad.flatten()

# map 
def lower_case(ex):
    """Return the lower-cased "title" of a single example or of a batch.

    Args:
        ex: Mapping with a "title" entry that is either one string (a single
            example) or a list of strings (a batch, which is what
            ``Dataset.map(..., batched=True)`` passes to the function).

    Returns:
        A dict ``{"title": ...}`` shaped like the input title: a lower-cased
        string, or a list of lower-cased strings.
    """
    title = ex["title"]
    if isinstance(title, str):
        return {"title": title.lower()}
    # Batched call: "title" holds one string per example in the batch.
    return {"title": [t.lower() for t in title]}

# Apply lower_case over the dataset in batches. With batched=True the mapped
# function receives each column as a list of values rather than a single value.
# NOTE: Dataset.map() with batched=True is essential to unlock the speed of
# the "fast" tokenizers.
squad_lower = squad.map(lower_case, batched=True)

# Renaming and filtering
# Drop the rows that have no condition.
drug_dataset = drug_dataset.filter(lambda row: row["condition"] is not None)
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name="patient_id",
)
# NOTE: `lowercase_condition` is not defined in this snippet; it must already exist.
drug_dataset = drug_dataset.map(lowercase_condition)

# Create new columns
def compute_review_length(example):
    """Count the whitespace-separated words in an example's review.

    Args:
        example: Mapping with a "review" string.

    Returns:
        A dict ``{"review_length": n}`` where ``n`` is the number of words.
    """
    words = example["review"].split()
    word_count = len(words)
    return {"review_length": word_count}

drug_dataset = drug_dataset.map(compute_review_length)

# Sort by review length and look at the first three rows
drug_dataset["train"].sort("review_length")[:3]

# Unescape HTML character references in the reviews (e.g. "&#039;" -> "'");
# this does not remove emoji.
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})


## Pandas integration

In [None]:
# Switch the dataset's output format to pandas
dataset.set_format("pandas")
# Easier way: convert to a pandas DataFrame directly
dataset.to_pandas()
# Back to the original output format
dataset.reset_format()
# Or build a new Dataset from a pandas object.
# NOTE(review): `frequencies` is not defined in this file — presumably a DataFrame; confirm.
from datasets import Dataset
freq_dataset = Dataset.from_pandas(frequencies)

In [None]:
# Build train / validation / test splits.

# Split the original training data (train_size=0.8, fixed seed).
drug_dataset_clean = drug_dataset["train"].train_test_split(
    train_size=0.8,
    seed=42,
)
# The split-off part is stored under "test"; move it to "validation".
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Reuse the original "test" split as the test set of the `DatasetDict`.
drug_dataset_clean["test"] = drug_dataset["test"]

In [None]:
# save and load datasets

# arrow format
drug_dataset_clean.save_to_disk("path")  # save
drug_arrow_load = load_from_disk("path")  # load

# csv: write one file per split
# NOTE(review): `raw_dataset` is not defined in this file — presumably a
# DatasetDict (it is iterated with .items()); confirm.
# The loop variable is not called `dataset`, so it does not overwrite the
# `dataset` object created earlier in the notebook.
for split, split_dataset in raw_dataset.items():
    split_dataset.to_csv(f"myDataset-{split}.csv", index=False)  # save

data_files = {"train": "myDataset-train.csv", "test": "myDataset-test.csv"}
load_dataset("csv", data_files=data_files)