In [106]:
import os
from pathlib import Path
from datasets import load_dataset
import pandas as pd
import html
from transformers import AutoTokenizer
from itertools import islice
from datasets import interleave_datasets

### Load Dataset Not on Hub

#### Local Dataset

In [13]:
if Path("./Data/SQuAD_it-test.json.gz").is_file() and Path("./Data/SQuAD_it-train.json.gz").is_file():
	print("Files already exist")
 
else:
	!wget2 https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
	!wget2 https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
	# Move files to data folder
	os.makedirs("Data", exist_ok=True)
	os.replace("SQuAD_it-train.json.gz", "Data/SQuAD_it-train.json.gz")
	os.replace("SQuAD_it-test.json.gz", "Data/SQuAD_it-test.json.gz")

Files already exist


In [None]:
squad_it_dataset = load_dataset(
    "json", 
    data_files={
        "train": "./Data/SQuAD_it-train.json",
        "test": "./Data/SQuAD_it-test.json"
	}, 
    field="data"
)

squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

#### Remote Dataset

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

### Dataset Preparation

In [2]:
if not Path("./Data/drugsComTrain_raw.tsv").is_file() and not Path("./Data/drugsComTest_raw.tsv").is_file():
	!wget2 "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
	os.makedirs("Data", exist_ok=True)
	os.replace("drugsCom_raw.zip", "Data/drugsCom_raw.zip")

else:
	print("File already exists")

File already exists


In [3]:
drug_dataset = load_dataset(
    "csv", 
    data_files={
        "train": "./Data/drugsComTrain_raw.tsv", 
		"test": "./Data/drugsComTest_raw.tsv"
	}, 
    delimiter="\t"
)

print(drug_dataset)

drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Visualize data with pandas
drug_sample.to_pandas()

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,87571,Naproxen,"Gout, Acute","""like the previous person mention, I&#039;m a ...",9.0,"September 2, 2015",36
1,178045,Duloxetine,ibromyalgia,"""I have taken Cymbalta for about a year and a ...",3.0,"November 7, 2011",13
2,80482,Mobic,Inflammatory Conditions,"""I have been taking Mobic for over a year with...",10.0,"June 5, 2013",128
3,159268,TriNessa,Birth Control,"""I am now 21 and have been on TriNessa since I...",10.0,"October 21, 2010",81
4,205477,Pristiq,Depression,"""I was originally on Prozac 20mg for my depres...",10.0,"July 13, 2013",65
...,...,...,...,...,...,...,...
995,188381,ParaGard,Birth Control,"""Always do research before putting anything in...",1.0,"May 22, 2017",9
996,47072,Thyroid desiccated,Hashimoto's disease,"""Been diagnosed with Hashimoto&#039;s for over...",9.0,"March 30, 2017",33
997,174171,Valium,Anxiety,"""Unlike some benzodiazepines like lorazepam, w...",10.0,"June 12, 2010",122
998,80072,Benzonatate,Cough,"""benzonatate medicine is useless for suppressi...",2.0,"February 10, 2017",16


In [27]:
drug_dataset_cpy = drug_dataset

# Rename first column "Unnamed: 0" to "patient_id"
drug_dataset_cpy = drug_dataset_cpy.rename_column("Unnamed: 0", "patient_id")

# Filter out data where condition is None
drug_dataset_cpy = drug_dataset_cpy.filter(lambda x: x["condition"] is not None)

# Make "condition" lowercase
drug_dataset_cpy = drug_dataset_cpy.map(
	lambda x: {"condition": [o.lower() for o in x["condition"]]},
 	batched=True,
)

# Create new column "review_length"
drug_dataset_cpy = drug_dataset_cpy.map(
	lambda x: {"review_length": [len(o.split()) for o in x["review"]]},
 	batched=True,
)

# Only keep reviews larger than 30
drug_dataset_cpy = drug_dataset_cpy.filter(lambda x: x["review_length"] > 30)

# Convert html codes in reviews to text
drug_dataset_cpy = drug_dataset_cpy.map(
	lambda x: {"review": [html.unescape(o) for o in x["review"]]},
	batched=True,
)

# Tokenize data
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_split(example):
    return tokenizer(
		example["review"],
		truncation=True,
		max_length=128,
		return_overflowing_tokens=True,
	)
    
# Since we return overflowing tokens, 
# there may be more tokens created than there are tokens
# So we remove all other columns and only keep token_ids
drug_dataset_cpy = drug_dataset_cpy.map(
    tokenize_and_split, 
	batched=True, 
	remove_columns=drug_dataset_cpy["train"].column_names,
)

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

#### Train/Val/Test Split

In [76]:
drug_dataset_clean = drug_dataset_cpy["train"].train_test_split(test_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add test to the dataset
drug_dataset_clean["test"] = drug_dataset_cpy["test"]

drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 41354
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 165418
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 68876
    })
})

#### Save/Load Dataset

In [79]:
drug_dataset_clean.save_to_disk("./Data/drug_reviews")
# load_dataset("./Data/drug_reviews")

Saving the dataset (0/1 shards):   0%|          | 0/41354 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/165418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/68876 [00:00<?, ? examples/s]

### Large Datasets

#### Streaming

In [None]:
data_files = "https://huggingface.co/datasets/casinca/PUBMED_title_abstracts_2019_baseline/resolve/main/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset(
	"json",
	data_files=data_files,
	split="train",
	streaming=True # Avoid downloading the entire dataset
)

# Streamed datasets are iterable objects
next(iter(pubmed_dataset))

{'meta': {'pmid': 11409574, 'language': 'eng'},
 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that i

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Since dataset is streamed, map will be run on the fly
tokenized_dataset = pubmed_dataset.map(lambda x: tokenizer(x["text"]))

shuffled_dataset = pubmed_dataset.shuffle(buffer_size=10_000, seed=42)

In [102]:
# Create train / validation splits
train_dataset = pubmed_dataset.skip(1000) # Skip first 1000 and take the rest
validation_dataset = pubmed_dataset.take(1000)

#### Combine datasets

In [105]:
law_dataset = load_dataset(
    "timaeus/pile-freelaw",
    split="train",
    streaming=True,
)

next(iter(law_dataset))

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


{'text': '     The summaries of the Colorado Court of Appeals published opinions\n  constitute no part of the opinion of the division but have been prepared by\n  the division for the convenience of the reader. The summaries may not be\n    cited or relied upon as they are not the official language of the division.\n  Any discrepancy between the language in the summary and in the opinion\n           should be resolved in favor of the language in the opinion.\n\n\n                                                                  SUMMARY\n                                                            February 8, 2018\n\n                                2018COA12\n\nNo. 14CA0144, People v. Trujillo — Criminal Law — Sentencing\n— Probation — Indeterminate Sentence\n\n     A division of the court of appeals considers whether a\n\nColorado statute authorizes imposition of a sentence to an\n\nindeterminate term of probation and whether the defendant was\n\nentitled to the benefit of amendments to

In [113]:
combined_dataset = interleave_datasets([pubmed_dataset, law_dataset])
# Will give error because each dataset has a different schema
# Make sure that all datasets have the same schema or process them separately
# list(islice(combined_dataset, 2))