In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [2]:
squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")

In [3]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [2]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

## Slicing a dataset

In [5]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2024-06-11 08:09:02--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.2’

drugsCom_raw.zip.2      [               <=>  ]  41.00M  10.2MB/s    in 4.3s    

2024-06-11 08:09:07 (9.48 MB/s) - ‘drugsCom_raw.zip.2’ saved [42989872]

Archive:  drugsCom_raw.zip
replace drugsComTest_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Print a random sample

In [4]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [5]:
for split in drug_dataset.keys(): # check if all ids are unique in both test and train splits
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [6]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [7]:
def filter_nones(x):
    return x["condition"] is not None

In [8]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [9]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset.map(lowercase_condition)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

`map` function modifies one of the column. It should return a dict of column name and a column values as a dict value. 

In [10]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

In [11]:
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [12]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


Removing HTMS codes

In [13]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [14]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Speeding up with `batched=True`

In [15]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [17]:
result = tokenize_function(drug_dataset["train"][0])
result.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [18]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: user 8.43 ms, sys: 107 ms, total: 115 ms
Wall time: 2.4 s


In [19]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

CPU times: user 28.6 ms, sys: 88.8 ms, total: 117 ms
Wall time: 2.42 s


In [20]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=4)

CPU times: user 76.2 ms, sys: 98.5 ms, total: 175 ms
Wall time: 3.42 s


In [21]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=8)

CPU times: user 185 ms, sys: 119 ms, total: 303 ms
Wall time: 3.69 s


In [22]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [23]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map: 100%|██████████| 46108/46108 [00:15<00:00, 2917.87 examples/s]


Other way to deal with the mismatched length problem by making the old columns the same size as the new ones.

In [24]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [25]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

### Switching to pandas

In [26]:
drug_dataset.set_format("pandas")

In [27]:
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


Under the hood we still keep  drug_dataset in Dataset format, we only display pandas. 

In [28]:
type(drug_dataset)

datasets.dataset_dict.DatasetDict

In [30]:
train_df = drug_dataset["train"][:]

Now we can access all pandas functionalities. 

In [31]:
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,Birth Control,27655
1,Depression,8023
2,Acne,5209
3,Anxiety,4991
4,Pain,4744


Now we can bring new pandas table back to Dataset format. 

In [33]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['frequency', 'count'],
    num_rows: 819
})

In [34]:
train_df.columns

Index(['patient_id', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'review_length'],
      dtype='object')

In [37]:
average_rating = (
    train_df
    .groupby("drugName")["rating"]
    .mean()
    .reset_index()
)
average_rating.head()


Unnamed: 0,drugName,rating
0,A + D Cracked Skin Relief,10.0
1,A / B Otic,10.0
2,Abacavir / dolutegravir / lamivudine,7.953488
3,Abacavir / lamivudine / zidovudine,9.0
4,Abatacept,7.3125


In [38]:
average_rating_dataset = Dataset.from_pandas(average_rating)

In [39]:
drug_dataset.reset_format()

### Creating a validation set

In [45]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

This saves dataset in an arrow format, so it creates directory with a bunch of json files with dictionaries. 

In [46]:
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:00<00:00, 119254.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 127280.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 560369.99 examples/s]


In [47]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [48]:
for split, dataset in drug_dataset_clean.items():
    dataset.to_csv(f"drug-reviews-{split}.csv")

Creating CSV from Arrow format: 100%|██████████| 111/111 [00:01<00:00, 55.59ba/s]
Creating CSV from Arrow format: 100%|██████████| 28/28 [00:00<00:00, 60.43ba/s]
Creating CSV from Arrow format: 100%|██████████| 47/47 [00:00<00:00, 80.67ba/s]


In [49]:
data_files = {
    "train": "drug-reviews-train.csv",
    "validation": "drug-reviews-validation.csv",
    "test": "drug-reviews-test.csv",
}
drug_dataset_reloaded = load_dataset("csv", data_files=data_files)

Generating train split: 110811 examples [00:00, 146912.78 examples/s]
Generating validation split: 27703 examples [00:00, 153433.80 examples/s]
Generating test split: 46108 examples [00:00, 138543.92 examples/s]
