**The "Practice" sections are a record of my own attempts at the exercises.**

# What if my dataset isn't on the Hub?

## Practice

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")

In [None]:
squad_it_dataset

In [None]:
squad_it_dataset["train"][0]

In [None]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz", 
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

### Try it out! Load any dataset you want!

In [None]:
# Load the Iris dataset from a remote CSV file. It gets its own variable so the
# SQuAD-it dataset loaded above is not overwritten by unrelated data.
data_files = {
    "train": "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv", 
}
iris_dataset = load_dataset("csv", data_files=data_files)
iris_dataset

## Example

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")

In [None]:
squad_it_dataset

In [None]:
squad_it_dataset["train"][0]

In [None]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# Time to slice and dice

## Practice

In [None]:
! pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [1]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}

drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [2]:
drug_sample = drug_dataset["train"].shuffle(seed=12).select(range(1000))

drug_sample[:3]

{'Unnamed: 0': [109112, 113915, 138686],
 'drugName': ['Nexplanon', 'Tegretol', 'Phentermine'],
 'condition': ['Birth Control', 'Bipolar Disorde', 'Weight Loss'],
 'review': ['"This birth control has done me wonder. Yes it effects people entirely different. Some people bleed the entire time, some people&#039;s mental health can be affected by it, but it&#039;s rare for that. Just like any birth control you can lose weight or gain weight. It can make you Moody or not. IT ALL DEPENDS ON THE PERSON. For me I had no issues. August 2017 will be three years and I still have yet to get pregnant on this birth control.  If I was back on the pill, I would have been a teen Mom already. I recommend it but it&#039;s not for everyone."',
  '"I liked it more than valproate since it doesn&#039;t affect your weight."',
  '"I used Adipex for 4 months and went from a size 20 to a size 12/14. I stopped for 5 months and went back to a 17, I plan on taking it again. It made me want to clean everything spotl

In [3]:
# Sanity check: in every split, "Unnamed: 0" holds one distinct value per row.
for split in drug_dataset.keys():
    split_rows = drug_dataset[split]
    distinct_ids = split_rows.unique("Unnamed: 0")
    assert len(split_rows) == len(distinct_ids)

In [4]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
'''
    Try it out!
    Use the Dataset.unique() function to find the number of unique drugs and conditions in the training and test sets.
'''

# Report the number of distinct values in every column. Each line is labelled
# with its split so the train and test figures can be told apart, and the column
# names are read from the dataset itself rather than from its first row.
for split, dataset in drug_dataset.items():
    for feature in dataset.column_names:
        print(f"split: {split}, feature: {feature}, num_unique: {len(dataset.unique(feature))}")

feature: patient_id, num_unique: 161297
feature: drugName, num_unique: 3436
feature: condition, num_unique: 885
feature: review, num_unique: 112329
feature: rating, num_unique: 10
feature: date, num_unique: 3579
feature: usefulCount, num_unique: 389
feature: patient_id, num_unique: 53766
feature: drugName, num_unique: 2637
feature: condition, num_unique: 709
feature: review, num_unique: 48280
feature: rating, num_unique: 10
feature: date, num_unique: 3566
feature: usefulCount, num_unique: 325


In [5]:
def lowercase_condition(example):
    """Return a one-key update holding the lower-cased "condition" of `example`.

    Calls `.lower()` on the value directly, so a row whose "condition" is None
    raises AttributeError.
    """
    lowered = example["condition"].lower()
    return {"condition": lowered}

# drug_dataset.map(lowercase_condition) # AttributeError

In [7]:
def filter_nones(x):
    """Predicate for `filter`: True when the row's "condition" is not None."""
    condition = x["condition"]
    return condition is not None

In [11]:
(lambda x: x * x)(3)

9

In [12]:
(lambda base, height: 0.5 * base * height)(4, 8)

16.0

In [6]:
# drug_dataset = drug_dataset.filter(filter_nones)
# # or 
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [7]:
# drug_dataset = drug_dataset.map(lambda x: {"condition": x["condition"].lower()})
## or
drug_dataset = drug_dataset.map(lowercase_condition)

drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [8]:
def compute_review_length(example):
    """Return {"review_length": n}, where n is the number of whitespace-separated words in "review"."""
    words = example["review"].split()
    return {"review_length": len(words)}

In [9]:
drug_dataset = drug_dataset.map(compute_review_length)

drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [10]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [11]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


In [33]:
'''
    Try it out! 
    Use the Dataset.sort() function to inspect the reviews with the largest numbers of words (descending order). 
'''

# Note: running the full sort right after the `review_length > 30` filter above appeared to hang for me (presumably it just takes a very long time).
# drug_dataset["train"].sort("review_length", reverse=True)[:3]

# Evidence that it is slow rather than broken: the same sort on a 1,000-row subset works.
# Caveat: this only ranks the first 1,000 rows, not the whole training split.
drug_dataset["train"].select(range(1000)).sort("review_length", reverse=True)[:3]


{'patient_id': [102449, 174197, 137926],
 'drugName': ['Aripiprazole', 'Versed', 'Ortho Evra'],
 'condition': ['schizophrenia', 'light anesthesia', 'birth control'],
 'review': ['"Abilify 20 mg.\r\nI am a patient diagnosed with disorganized schizophrenia, depression,  schizoaffective disorder, bipolar.  I have experienced a sensitivity to my emotions, as well as how I react to my feelings.  I really don&#039;t feel out of the normal with any &quot;sexual frustration&quot;...and I wouldn&#039;t say this has increased/decreased.  I feel less anxious on the medication, and seemingly more at ease with myself when I take this medication.  It is a hard step to have people telling you that you actually do better on this pill, when half your life ago, you didn&#039;t have &quot;mental illness&quot;, and suddenly you become someone else.   My weight did fluxuate when on this drug.  But I feel it was due to stress factors outside of a regular environment.  I feel if you place yourself in good su

In [12]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [13]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

In [14]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Run the module-level `tokenizer` on the "review" field with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [16]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 21.8 s, sys: 157 ms, total: 21.9 s
Wall time: 6.76 s


In [17]:
'''
    Try it out! 
    Execute the same instruction with and without batched=True, then try it with a slow tokenizer (add use_fast=False in the AutoTokenizer.from_pretrained() method) so you can see what numbers you get on your hardware.
'''

slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def slow_tokenize_function(example):
    return slow_tokenizer(example["review"], truncation=True)

print("fast, batched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=8)
print()

print("fast, unbatched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False, num_proc=8)
print()

print("fast, batched")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
print()

print("fast and unbatched")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)
print()

print("slow, batched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)
print()

print("slow, unbatched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=False, num_proc=8)
print()

print("slow and batched")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True)
print()

print("slow and unbatched")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=False)

# The reason why use_fast is fast: They use fast Rust-based tokenizer not Python-based tokenizer if it is supported for a given model. (Ref. https://huggingface.co/docs/transformers/model_doc/auto)

fast, batched, and multiprocessing (8)
CPU times: user 79.4 ms, sys: 21 µs, total: 79.4 ms
Wall time: 79.3 ms

fast, unbatched, and multiprocessing (8)
CPU times: user 76.9 ms, sys: 3.87 ms, total: 80.8 ms
Wall time: 80.8 ms

fast, batched
CPU times: user 27.8 ms, sys: 3.95 ms, total: 31.7 ms
Wall time: 31.8 ms

fast and unbatched
CPU times: user 24.1 ms, sys: 17 µs, total: 24.1 ms
Wall time: 23.9 ms

slow, batched, and multiprocessing (8)


Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 1.25 s, sys: 252 ms, total: 1.5 s
Wall time: 1min 5s

slow, unbatched, and multiprocessing (8)


Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 3.61 s, sys: 491 ms, total: 4.11 s
Wall time: 1min 15s

slow and batched


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 3min 55s, sys: 769 ms, total: 3min 55s
Wall time: 3min 55s

slow and unbatched


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 4min 41s, sys: 2.51 s, total: 4min 44s
Wall time: 4min 41s


In [99]:
def tokenize_and_split(examples):
    """Tokenize the "review" field, truncating at 128 tokens and also returning the overflowing tokens."""
    reviews = examples["review"]
    encoded = tokenizer(
        reviews, truncation=True, max_length=128, return_overflowing_tokens=True
    )
    return encoded

In [100]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

[128, 49]

In [None]:
# Raises ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463
# tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [25]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [26]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(206772, 138514)

In [102]:
def tokenize_and_split(examples):
    """Tokenize "review" into chunks of at most 128 tokens and repeat the input columns per chunk.

    The tokenizer's "overflow_to_sample_mapping" entry is removed from the result
    and used to index every input column, so each existing column ends up with
    one value per produced chunk.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

    # One entry per chunk: the position of the input example it came from.
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column_name, column_values in examples.items():
        repeated = [column_values[row] for row in source_rows]
        encoded[column_name] = repeated
    return encoded

In [105]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True) # You can use num_proc argument.
tokenized_dataset

Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

In [127]:
drug_dataset.set_format("pandas")

In [128]:
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [205]:
train_df = drug_dataset["train"][:]

In [210]:
# I changed the original code a bit.
# value_counts() labels its output differently across pandas versions (from
# pandas 2.0 on the counts are named "count"; before that they were named after
# the source column and the index came out as "index"). Naming the index and the
# counts explicitly gives the same two columns on either version.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [211]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [204]:
'''
    Try it out! 
    Compute the average rating per drug and store the result in a new Dataset.
'''
from datasets import Dataset

# Mean rating for each drug, with "drugName" kept as a regular column.
average_rating_per_drug = train_df.groupby("drugName", as_index=False)["rating"].mean()
print(average_rating_per_drug.head())

# Store the aggregated frame in a new Dataset.
average_rating_per_drug_dataset = Dataset.from_pandas(average_rating_per_drug)
print(average_rating_per_drug_dataset)

                               drugName     rating
0             A + D Cracked Skin Relief  10.000000
1                            A / B Otic  10.000000
2  Abacavir / dolutegravir / lamivudine   7.953488
3    Abacavir / lamivudine / zidovudine   9.000000
4                             Abatacept   7.312500
Dataset({
    features: ['drugName', 'rating'],
    num_rows: 3052
})


In [212]:
drug_dataset.reset_format()

## Example

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

In [None]:
# Check that "Unnamed: 0" has as many distinct values as there are rows in each split.
for split in drug_dataset.keys():
    row_count = len(drug_dataset[split])
    id_count = len(drug_dataset[split].unique("Unnamed: 0"))
    assert row_count == id_count

In [None]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

In [None]:
def lowercase_condition(example):
    """Build the update dict that replaces "condition" with its lower-cased form.

    `.lower()` is called on the value as-is, so a None condition raises
    AttributeError.
    """
    condition = example["condition"]
    return dict(condition=condition.lower())


# Demonstration only: some rows have condition=None, so `.lower()` fails on them.
# Catch and show the error so that "Run All" can continue to the filtering step
# below, which removes those rows.
try:
    drug_dataset.map(lowercase_condition)
except AttributeError as err:
    print(f"AttributeError: {err}")

In [None]:
def filter_nones(x):
    """Keep-row predicate: False only when the row's "condition" is None."""
    return not (x["condition"] is None)

In [None]:
(lambda x: x * x)(3)

In [None]:
(lambda base, height: 0.5 * base * height)(4, 8)

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [None]:
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

In [None]:
def compute_review_length(example):
    """Count the whitespace-separated words of "review" and return them under "review_length"."""
    review = example["review"]
    word_count = len(review.split())
    return {"review_length": word_count}

In [None]:
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

In [None]:
drug_dataset["train"].sort("review_length")[:3]

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

In [None]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

In [None]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Apply the module-level `tokenizer` to the "review" field, truncating long inputs."""
    review_text = examples["review"]
    return tokenizer(review_text, truncation=True)

In [None]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

In [None]:
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    """Apply `slow_tokenizer` (loaded with use_fast=False) to the "review" field, truncating long inputs."""
    review_text = examples["review"]
    return slow_tokenizer(review_text, truncation=True)


tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
def tokenize_and_split(examples):
    """Tokenize "review" with a 128-token limit, also returning the tokens that overflow it."""
    tokenizer_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["review"], **tokenizer_options)

In [None]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

In [None]:
# Demonstration only: one review can produce several chunks, so the new columns
# come out longer than the existing ones and Arrow rejects the batch. pyarrow's
# ArrowInvalid is a ValueError subclass; catch and show it so that "Run All" can
# continue to the working versions below.
try:
    tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

In [None]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

In [None]:
def tokenize_and_split(examples):
    """Split each review into chunks of at most 128 tokens, carrying the input columns along.

    The tokenizer's "overflow_to_sample_mapping" entry is popped from the result
    and used to pick, for every chunk, the matching value from each input column.
    """
    chunks = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Mapping between new (chunk) and old (example) indices.
    old_index_of_chunk = chunks.pop("overflow_to_sample_mapping")
    for name, column in examples.items():
        chunks[name] = [column[old_index] for old_index in old_index_of_chunk]
    return chunks

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

In [None]:
drug_dataset.set_format("pandas")

In [None]:
drug_dataset["train"][:3]

In [None]:
train_df = drug_dataset["train"][:]

In [None]:
# Renaming the value_counts() output by label only works on one side of the
# pandas 2.0 change (the counts used to be named after the source column and the
# index came out as "index"; since 2.0 the counts are named "count"). Naming the
# index and the counts explicitly gives "condition"/"frequency" on either version.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

In [None]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

In [None]:
drug_dataset.reset_format()

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

In [None]:
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

In [None]:
# Write each split to its own JSON Lines file, named after the split.
for split, dataset in drug_dataset_clean.items():
    output_path = f"drug-reviews-{split}.jsonl"
    dataset.to_json(output_path)

In [None]:
!head -n 1 drug-reviews-train.jsonl

In [None]:
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)