**The practice parts are the records of my practice**

# What if my dataset isn't on the Hub?

## Practice

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")

In [None]:
squad_it_dataset

In [None]:
squad_it_dataset["train"][0]

In [None]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz", 
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

### Try it out! Load any dataset you want!

In [None]:
# I loaded iris dataset. 
data_files = {
    "train": "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv", 
}
squad_it_dataset = load_dataset("csv", data_files=data_files)
squad_it_dataset

## Example

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json", field="data")

In [None]:
squad_it_dataset

In [None]:
squad_it_dataset["train"][0]

In [None]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

In [None]:
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# Time to slice and dice

## Practice

In [None]:
! pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [1]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}

drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [2]:
drug_sample = drug_dataset["train"].shuffle(seed=12).select(range(1000))

drug_sample[:3]

{'Unnamed: 0': [109112, 113915, 138686],
 'drugName': ['Nexplanon', 'Tegretol', 'Phentermine'],
 'condition': ['Birth Control', 'Bipolar Disorde', 'Weight Loss'],
 'review': ['"This birth control has done me wonder. Yes it effects people entirely different. Some people bleed the entire time, some people&#039;s mental health can be affected by it, but it&#039;s rare for that. Just like any birth control you can lose weight or gain weight. It can make you Moody or not. IT ALL DEPENDS ON THE PERSON. For me I had no issues. August 2017 will be three years and I still have yet to get pregnant on this birth control.  If I was back on the pill, I would have been a teen Mom already. I recommend it but it&#039;s not for everyone."',
  '"I liked it more than valproate since it doesn&#039;t affect your weight."',
  '"I used Adipex for 4 months and went from a size 20 to a size 12/14. I stopped for 5 months and went back to a 17, I plan on taking it again. It made me want to clean everything spotl

In [3]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [4]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
'''
    Try it out!
    Use the Dataset.unique() function to find the number of unique drugs and conditions in the training and test sets.
'''

for split in drug_dataset.keys():
    features = drug_dataset[split][0].keys()
    for feature in features:
        print(f"feature: {feature}, num_unique: {len(drug_dataset[split].unique(feature))}")

feature: patient_id, num_unique: 161297
feature: drugName, num_unique: 3436
feature: condition, num_unique: 885
feature: review, num_unique: 112329
feature: rating, num_unique: 10
feature: date, num_unique: 3579
feature: usefulCount, num_unique: 389
feature: patient_id, num_unique: 53766
feature: drugName, num_unique: 2637
feature: condition, num_unique: 709
feature: review, num_unique: 48280
feature: rating, num_unique: 10
feature: date, num_unique: 3566
feature: usefulCount, num_unique: 325


In [5]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

# drug_dataset.map(lowercase_condition) # AttributeError

In [7]:
def filter_nones(x):
    return x["condition"] is not None

In [11]:
(lambda x: x * x)(3)

9

In [12]:
(lambda base, height: 0.5 * base * height)(4, 8)

16.0

In [6]:
# drug_dataset = drug_dataset.filter(filter_nones)
# # or 
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [7]:
# drug_dataset = drug_dataset.map(lambda x: {"condition": x["condition"].lower()})
## or
drug_dataset = drug_dataset.map(lowercase_condition)

drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [8]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

In [9]:
drug_dataset = drug_dataset.map(compute_review_length)

drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [10]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [11]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


In [33]:
'''
    Try it out! 
    Use the Dataset.sort() function to inspect the reviews with the largest numbers of words (descending order). 
'''

## Note: If you run this code after executing the code immediately above (filter ( > 30)), it will stuck (I presume it takes too much time!). 
# drug_dataset["train"].sort("review_length", reverse=True)[:3]

# The proof is this code! (It takes too much time.) (The code below works!)
drug_dataset["train"].select(range(1000)).sort("review_length", reverse=True)[:3]


{'patient_id': [102449, 174197, 137926],
 'drugName': ['Aripiprazole', 'Versed', 'Ortho Evra'],
 'condition': ['schizophrenia', 'light anesthesia', 'birth control'],
 'review': ['"Abilify 20 mg.\r\nI am a patient diagnosed with disorganized schizophrenia, depression,  schizoaffective disorder, bipolar.  I have experienced a sensitivity to my emotions, as well as how I react to my feelings.  I really don&#039;t feel out of the normal with any &quot;sexual frustration&quot;...and I wouldn&#039;t say this has increased/decreased.  I feel less anxious on the medication, and seemingly more at ease with myself when I take this medication.  It is a hard step to have people telling you that you actually do better on this pill, when half your life ago, you didn&#039;t have &quot;mental illness&quot;, and suddenly you become someone else.   My weight did fluxuate when on this drug.  But I feel it was due to stress factors outside of a regular environment.  I feel if you place yourself in good su

In [12]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [13]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

In [14]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [16]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 21.8 s, sys: 157 ms, total: 21.9 s
Wall time: 6.76 s


In [17]:
'''
    Try it out! 
    Execute the same instruction with and without batched=True, then try it with a slow tokenizer (add use_fast=False in the AutoTokenizer.from_pretrained() method) so you can see what numbers you get on your hardware.
'''

slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def slow_tokenize_function(example):
    return slow_tokenizer(example["review"], truncation=True)

print("fast, batched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=8)
print()

print("fast, unbatched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False, num_proc=8)
print()

print("fast, batched")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
print()

print("fast and unbatched")
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)
print()

print("slow, batched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)
print()

print("slow, unbatched, and multiprocessing (8)")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=False, num_proc=8)
print()

print("slow and batched")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True)
print()

print("slow and unbatched")
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=False)

# The reason why use_fast is fast: They use fast Rust-based tokenizer not Python-based tokenizer if it is supported for a given model. (Ref. https://huggingface.co/docs/transformers/model_doc/auto)

fast, batched, and multiprocessing (8)
CPU times: user 79.4 ms, sys: 21 µs, total: 79.4 ms
Wall time: 79.3 ms

fast, unbatched, and multiprocessing (8)
CPU times: user 76.9 ms, sys: 3.87 ms, total: 80.8 ms
Wall time: 80.8 ms

fast, batched
CPU times: user 27.8 ms, sys: 3.95 ms, total: 31.7 ms
Wall time: 31.8 ms

fast and unbatched
CPU times: user 24.1 ms, sys: 17 µs, total: 24.1 ms
Wall time: 23.9 ms

slow, batched, and multiprocessing (8)


Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 1.25 s, sys: 252 ms, total: 1.5 s
Wall time: 1min 5s

slow, unbatched, and multiprocessing (8)


Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 3.61 s, sys: 491 ms, total: 4.11 s
Wall time: 1min 15s

slow and batched


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 3min 55s, sys: 769 ms, total: 3min 55s
Wall time: 3min 55s

slow and unbatched


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 4min 41s, sys: 2.51 s, total: 4min 44s
Wall time: 4min 41s


In [99]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [100]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

[128, 49]

In [None]:
# # Occur ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463
# tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [25]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [26]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(206772, 138514)

In [102]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [105]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True) # You can use num_proc argument.
tokenized_dataset

Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

In [127]:
drug_dataset.set_format("pandas")

In [128]:
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [205]:
train_df = drug_dataset["train"][:]

In [210]:
# I changed the origianl code a bit.
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"count": "frequency"}) # this part! I don't know why they used columns={"index": "condition", "condition": "frequency"}
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [211]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [None]:
'''
    Try it out! 
    Compute the average rating per drug and store the result in a new Dataset.
'''
from datasets import Dataset

average_rating_per_drug = (
    train_df
    .groupby("drugName")
    .agg({"rating": "mean"})
    .reset_index()
)
print(average_rating_per_drug.head())

average_rating_per_drug_dataset = Dataset.from_pandas(average_rating_per_drug)
print(average_rating_per_drug_dataset)

In [None]:
drug_dataset.reset_format()

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=12)

drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")

drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

In [2]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

In [None]:
!head -n 1 drug-reviews-train.jsonl

{"patient_id":8515,"drugName":"Zolpidem","condition":"insomnia","review":"\"Ambien has been the only insomnia medicine that has worked consistently for the 7 years I have been taking it. I was on a combination of Ambien and Xanax but lost my healthcare coverage so am on Ambien only. I totally have to thank the Ambien makers because my life was a messy blur of nonsleep. Now, I sleep and sleep well. \"","rating":9.0,"date":"June 12, 2011","usefulCount":9,"review_length":63}


In [None]:
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)
drug_dataset_reloaded

### Try it out!
1. Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.
2. Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.

**1.**

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [1]:
from datasets import load_dataset
data_files = {
    "train": "drugsComTrain_raw.tsv",
    "test": "drugsComTest_raw.tsv",
}
my_drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [2]:
for split in my_drug_dataset.keys():
    assert len(my_drug_dataset[split]) == len(my_drug_dataset[split].unique("Unnamed: 0"))

In [3]:
my_drug_dataset = my_drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name="patient_id"
)
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [4]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

In [5]:
my_drug_dataset = my_drug_dataset.filter(lambda x: x["condition"] is not None)
my_drug_dataset = my_drug_dataset.map(lowercase_condition, num_proc=8)
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [6]:
def compute_review_length(example):
    return {"review_length": [len(r.split()) for r in example["review"]]}

In [7]:
my_drug_dataset = my_drug_dataset.map(compute_review_length, batched=True, num_proc=8)
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [8]:
import html
my_drug_dataset = my_drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]},
    batched=True,
    num_proc=8,
)
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_and_split(example):
    result = tokenizer(
        example["review"], 
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in example.items():
        result[key] = [values[i] for i in sample_map]
    return result

my_drug_dataset = my_drug_dataset.map(tokenize_and_split, batched=True, num_proc=8,)
my_drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask'],
        num_rows: 224076
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask'],
        num_rows: 74735
    })
})

In [10]:
SEED = 12
my_drug_dataset_with_valid = my_drug_dataset["train"].shuffle(seed=SEED).train_test_split(train_size=0.8, seed=SEED)
my_drug_dataset_with_valid["validation"] = my_drug_dataset_with_valid.pop("test")
my_drug_dataset_with_valid["test"] = my_drug_dataset["test"]
my_drug_dataset_with_valid

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask'],
        num_rows: 179260
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask'],
        num_rows: 44816
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask'],
        num_rows: 74735
    })
})

In [11]:
train_condition = my_drug_dataset_with_valid["train"].unique("condition")
validation_condition = my_drug_dataset_with_valid["validation"].unique("condition")
test_condition = my_drug_dataset_with_valid["test"].unique("condition")
conditions = [*train_condition, *validation_condition, *test_condition]

labels = sorted(list(set(conditions)))
num_labels = len(labels)
print(num_labels)

916


In [12]:
# make a labels column
from datasets import Features, Sequence, Value, ClassLabel

class_label = ClassLabel(num_classes=num_labels, names=labels)

def map_condition2label(example):
    return {"labels": [class_label.str2int(e) for e in example["condition"]]}

my_drug_dataset_with_valid = my_drug_dataset_with_valid.map(map_condition2label, batched=True, num_proc=8)
my_drug_dataset_with_valid["train"]

Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 179260
})

In [13]:
# remove and remain columns for training
my_drug_dataset_with_valid = my_drug_dataset_with_valid.remove_columns(['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'])
my_drug_dataset_with_valid.set_format("torch")
my_drug_dataset_with_valid["train"].column_names

['input_ids', 'attention_mask', 'labels']

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    my_drug_dataset_with_valid["train"], shuffle=True, batch_size=256, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    my_drug_dataset_with_valid["validation"], batch_size=256, collate_fn=data_collator
)

In [22]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([256, 128]),
 'attention_mask': torch.Size([256, 128]),
 'labels': torch.Size([256])}

In [23]:
# Model setting
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW


model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=3e-5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Evaluate model
from tqdm.auto import tqdm
import evaluate

def eval(model, eval_dataloader):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
        f1_metric.add_batch(predictions=predictions, references=batch["labels"])

    result = {"Accuracy": accuracy_metric.compute(), "F1": f1_metric.compute(average="weighted")}    
    print(result)
    return result


In [25]:
# Train model
from accelerate.utils import set_seed

def train(model, optimizer, train_dataloader, eval_dataloader,
          num_epochs:int=3, mixed_precision:str="fp16", seed:int=42):
    set_seed(seed)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    accelerator = Accelerator(mixed_precision=mixed_precision)

    train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    # progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in tqdm(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            # progress_bar.update(1)

    result = eval(model, eval_dataloader)

In [29]:
num_epochs = 5

In [30]:
from accelerate import notebook_launcher

args = (
    model,
    optimizer,
    train_dataloader, 
    eval_dataloader, 
    num_epochs,
    "fp16", 
    SEED,
)
notebook_launcher(train, args, num_processes=1)

Launching training on one GPU.


  0%|          | 0/701 [00:00<?, ?it/s]

  0%|          | 0/701 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

{'Accuracy': {'accuracy': 0.6287932881113888}, 'F1': {'f1': 0.5797806051250725}}


In [36]:
# Save model
# model.save_pretrained("distilroberta-base-condition-classifier") # model_distilroberta-base_text-classification
# tokenizer.save_pretrained("distilroberta-base-condition-classifier") # tokenizer_distilroberta-base

('distilroberta-base-condition-classifier/tokenizer_config.json',
 'distilroberta-base-condition-classifier/special_tokens_map.json',
 'distilroberta-base-condition-classifier/vocab.json',
 'distilroberta-base-condition-classifier/merges.txt',
 'distilroberta-base-condition-classifier/added_tokens.json',
 'distilroberta-base-condition-classifier/tokenizer.json')

**2.**

In [241]:
from transformers import pipeline

test_dataset = drug_dataset_clean["test"]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

print(test_dataset[0]["review"])
print(summarizer(test_dataset[0]["review"], max_length=130, min_length=30, do_sample=False))

"I've tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia & anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I've actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me."
[{'summary_text': '"I\'ve tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline),"'}]


## Example

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

In [None]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [None]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

In [None]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}


drug_dataset.map(lowercase_condition)

In [None]:
def filter_nones(x):
    return x["condition"] is not None

In [None]:
(lambda x: x * x)(3)

In [None]:
(lambda base, height: 0.5 * base * height)(4, 8)

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [None]:
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

In [None]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

In [None]:
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

In [None]:
drug_dataset["train"].sort("review_length")[:3]

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

In [None]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

In [None]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [None]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

In [None]:
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)


tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [None]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

In [None]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

In [None]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

In [None]:
drug_dataset.set_format("pandas")

In [None]:
drug_dataset["train"][:3]

In [None]:
train_df = drug_dataset["train"][:]

In [None]:
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

In [None]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

In [None]:
drug_dataset.reset_format()

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

In [None]:
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

In [None]:
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

In [None]:
!head -n 1 drug-reviews-train.jsonl

In [None]:
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

# Big data? 🤗 Datasets to the rescue! 
(18.12.2023.Mon, I skipped the exercise of this part as the URL link of dataset didn't exist! I'll finish this part after they fix it.)

## Practice

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!pip install zstandard

In [6]:
from datasets import load_dataset

data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

FileNotFoundError: Unable to find 'https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst'

## Example

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!pip install zstandard

In [None]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

In [None]:
pubmed_dataset[0]

In [None]:
!pip install psutil

In [None]:
import psutil

# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

In [None]:
print(f"Number of files in dataset : {pubmed_dataset.dataset_size}")
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [None]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

In [None]:
pubmed_dataset_streamed = load_dataset(
    "json", data_files=data_files, split="train", streaming=True
)

In [None]:
next(iter(pubmed_dataset_streamed))

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [None]:
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

In [None]:
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [None]:
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))

In [None]:
from itertools import islice
from datasets import interleave_datasets

combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
list(islice(combined_dataset, 2))

In [None]:
base_url = "https://the-eye.eu/public/AI/pile/"
data_files = {
    "train": [base_url + "train/" + f"{idx:02d}.jsonl.zst" for idx in range(30)],
    "validation": base_url + "val.jsonl.zst",
    "test": base_url + "test.jsonl.zst",
}
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))

# Creating your own dataset

## Practice

In [None]:
!pip install requests

In [1]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [2]:
response.status_code

200

In [3]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6510',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6510/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6510/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6510/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/6510',
  'id': 2046928742,
  'node_id': 'PR_kwDODunzps5iRyiV',
  'number': 6510,
  'title': 'Replace `list_files_info` with `list_repo_tree` in `push_to_hub`',
  'user': {'login': 'mariosasko',
   'id': 47462742,
   'node_id': 'MDQ6VXNlcjQ3NDYyNzQy',
   'avatar_url': 'https://avatars.githubusercontent.com/u/47462742?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/mariosasko',
   'html_url': 'https://github.com/mariosasko',
   'followers_url': 'https://api.github.com/users/mariosasko/followers'

In [4]:
# !pip install  python-dotenv

In [3]:
import os
from dotenv import load_dotenv

load_dotenv(verbose=True)

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [25]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored ad {issues_path}/{repo}-issues.jsonl"
    )

In [26]:
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for datasets! Dataset stored ad ./datasets-issues.jsonl


In [4]:
import pandas as pd
from datasets import Dataset 

# 19.12.2023 I don't know why...
# The original example occurs the error. (DatasetGenerationError: An error occurred while generating the dataset)
# issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")

# The code below can be a solution using pd.read_json and Dataset.from_pandas instead of load_dataset.
df = pd.read_json('datasets-issues.jsonl', orient='records', lines=True)
issues_dataset = Dataset.from_pandas(df, split='train')
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 6481
})

In [5]:
sample = issues_dataset.shuffle(seed=333).select(range(3))

for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/2894
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/2894.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/2894', 'merged_at': '2021-09-10T16:27:44Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/2894.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/2894'}

>> URL: https://github.com/huggingface/datasets/pull/5307
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/5307.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/5307', 'merged_at': '2022-11-28T15:27:26Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/5307.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/5307'}

>> URL: https://github.com/huggingface/datasets/issues/5498
>> Pull request: None



In [6]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

Map:   0%|          | 0/6481 [00:00<?, ? examples/s]

In [154]:
'''
    Try it out! 
    Calculate the average time it takes to close issues in 🤗 Datasets.
    For bonus points, calculate the average time it takes to close pull requests.
'''
import pandas as pd

closed_issues_dataset = issues_dataset.filter(lambda x: x["closed_at"] is not None)
closed_issues_dataset.set_format(type='pandas')
print(closed_issues_dataset["closed_at"].mean())

# --- For bonus points!
merged_pull_request_dataset = issues_dataset.filter(lambda x: x["is_pull_request"] and x["pull_request"]["merged_at"] is not None)
merged_pull_request_dataset.set_format(type='pandas')

average_merged_at = []
for m in merged_pull_request_dataset["pull_request"]:
    m["merged_at"] = pd.Timestamp(m["merged_at"])
    average_merged_at.append(m["merged_at"])
series_average_merged_at = pd.Series(average_merged_at)
print(series_average_merged_at.mean())

Filter:   0%|          | 0/6481 [00:00<?, ? examples/s]

2021-12-17 19:29:44.670792704+00:00


Filter:   0%|          | 0/6481 [00:00<?, ? examples/s]

2021-10-27 12:45:44.869551616+00:00


In [13]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

In [14]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]

get_comments(2792)

["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

In [15]:

# The original code does not work because it exceeds GitHub API rate limit.
# To deal with this problem, I added the code to slice the dataset into a size of 4990. 
issues_dataset_for_comments = issues_dataset.shuffle(seed=444).select(range(4990))
issues_with_comments_dataset = issues_dataset_for_comments.map(
    lambda x: {"comments": get_comments(x["number"])},
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/4990 [00:00<?, ? examples/s]

In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
issues_with_comments_dataset.push_to_hub("github-issues_chapter5_section4")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

In [19]:
from datasets import load_dataset

remote_dataset = load_dataset("BanUrsus/github-issues_chapter5_section4", split="train")
remote_dataset

Downloading readme:   0%|          | 0.00/6.07k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.70M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/4990 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 4990
})

## Example

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

You will need to setup git, adapt your email and name in the following cell.

In [None]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install requests

In [None]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [None]:
response.status_code

In [None]:
response.json()

In [None]:
GITHUB_TOKEN = xxx  # Copy your GitHub token here
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

In [None]:
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

In [None]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

In [None]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

In [None]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(2792)

In [None]:
# Depending on your internet connection, this can take a few minutes...
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
issues_with_comments_dataset.push_to_hub("github-issues")

In [None]:
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset

# Semantic search with FAISS (PyTorch)

## Practice

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install faiss-gpu

In [1]:
from datasets import load_dataset

issues_dataset = load_dataset("BanUrsus/github-issues_chapter5_section4", split="train")
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 4990
})

In [2]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'draft', 'pull_request', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 1683
})

In [3]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "comments", "html_url"]
columns_to_remove= set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 1683
})

In [4]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [5]:
df["comments"][0].tolist()

['Gently pinging @lhoestq',
 'Hi ! Interresting :)\r\n\r\nCould you give more details on what kind of separators you would like to use instead ?',
 'In my case, I just want to use `\\n` but not `U+2028`.',
 "Ok I see, maybe there can be a `sep` parameter to allow users to specify what line/paragraph separator they'd like to use",
 'Related to:\r\n- #3729 \r\n- #3910',
 'Thanks for requesting this enhancement. We have recently found a somehow related issue with another dataset:\r\n- #3704\r\n\r\nLet me make a PR proposal.']

In [6]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Text builder with custom separator line bounda...,Gently pinging @lhoestq,**Is your feature request related to a problem...
1,https://github.com/huggingface/datasets/issues...,Text builder with custom separator line bounda...,Hi ! Interresting :)\r\n\r\nCould you give mor...,**Is your feature request related to a problem...
2,https://github.com/huggingface/datasets/issues...,Text builder with custom separator line bounda...,"In my case, I just want to use `\n` but not `U...",**Is your feature request related to a problem...
3,https://github.com/huggingface/datasets/issues...,Text builder with custom separator line bounda...,"Ok I see, maybe there can be a `sep` parameter...",**Is your feature request related to a problem...


In [7]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 6274
})

In [64]:
'''
    Try it out! 
    See if you can use Dataset.map() to explode the comments column of issues_dataset without resorting to the use of Pandas.

    Ref. https://discuss.huggingface.co/t/mapping-1-multi-element-column-of-a-dataset-to-multi-row-dataset-with-1-element-per-row-duplicating-other-features/14481
'''
issues_dataset.set_format()

def explode(batch):
    # result = {}

    # result["html_url"] =  [html_url for i, html_url in enumerate(batch["html_url"]) for _ in batch["comments"][i]]
    # result["title"] = [title for i, title in enumerate(batch["title"]) for _ in batch["comments"][i]]
    # result["comments"] = [comment for comments in batch["comments"] for comment in comments] 
    # result["body"] = [body for i, body in enumerate(batch["body"]) for _ in batch["comments"][i]]

    # return result

    return {
        "html_url":  [html_url for i, html_url in enumerate(batch["html_url"]) for _ in batch["comments"][i]],
        "title": [title for i, title in enumerate(batch["title"]) for _ in batch["comments"][i]],
        "comments": [comment for comments in batch["comments"] for comment in comments],
        "body": [body for i, body in enumerate(batch["body"]) for _ in batch["comments"][i]],
    }

comments_dataset_from_map = issues_dataset.map(
    explode,
    batched=True,
)
comments_dataset_from_map

Map:   0%|          | 0/1683 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 6274
})

In [63]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/6274 [00:00<?, ? examples/s]

In [65]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

Filter:   0%|          | 0/6274 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 4546
})

In [98]:
# To solve NoneType error, I added a code below
comments_dataset = comments_dataset.filter(lambda x: x["body"] is not None)

def concatenate_text(batch):
    return {
        "text": [
            batch["title"][i]
            + " \n "
            + batch["body"][i]
            + " \n "
            + batch["comments"][i] for i in range(len(batch["title"]))
        ]
    }

comments_dataset = comments_dataset.map(concatenate_text, batched=True)

Filter:   0%|          | 0/4543 [00:00<?, ? examples/s]

Map:   0%|          | 0/4543 [00:00<?, ? examples/s]

In [179]:
from transformers import AutoTokenizer, AutoModel

# model_ckpt = "sentence-transformers/all-mpnet-base-v2" # test another model!
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # but the results(outputs) of this model suit me!
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [180]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0): MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_features

In [181]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [182]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [183]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [184]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/4543 [00:00<?, ? examples/s]

In [185]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 4543
})

In [186]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [187]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [188]:
import pandas as pd

sample_df = pd.DataFrame.from_dict(samples)
sample_df["scores"] = scores
sample_df.sort_values("scores", ascending=False, inplace=True)

In [190]:
for _, row in sample_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Right now the recommended way is to create the dataset on a server with internet connection and then to save it and copy the serialized dataset to the server without internet connection.
SCORE: 33.753211975097656
TITLE: Cannot download dataset_info.json
URL: https://github.com/huggingface/datasets/issues/647

COMMENT: Hi, we are building an offline dataset viewer: https://github.com/Renumics/spotlight
It supports many HF datasets, but currently you have to use it via Pandas:
df=ds.to_pandas()
spotlight.show(df)

Would love to hear from you if that works for your use case. If not, feel free to open an issue on the repo: https://github.com/Renumics/spotlight/issues
SCORE: 33.19186019897461
TITLE: Offline dataset viewer
URL: https://github.com/huggingface/datasets/issues/6139

COMMENT: The download manager supports local directories. You can specify a local directory instead of a url and it should work.
SCORE: 32.753379821777344
TITLE: [Question] Using/adding a local dataset
URL:

In [200]:
'''
    Try it out! 
    Create your own query and see whether you can find an answer in the retrieved documents. 
    You might have to increase the k parameter in Dataset.get_nearest_examples() to broaden the search.
'''

my_question = "How to convert pandas to dataset"
my_question_embedding = get_embeddings([my_question]).cpu().detach().numpy()

my_scores, my_samples = embeddings_dataset.get_nearest_examples(
    "embeddings", my_question_embedding, k=10
)

my_samples_df = pd.DataFrame.from_dict(my_samples)
my_samples_df["scores"] = my_scores
my_samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in my_samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Yes I can do that thank you!

Do you think that conceptually my example should work? If not, I'm happy to close this issue. 

If yes, I can start working on it.
SCORE: 31.777238845825195
TITLE: `to_pandas` doesn't take into account format.
URL: https://github.com/huggingface/datasets/issues/4476

COMMENT: Hi! Instead of `with_format(columns=['a', 'b']).to_pandas()`, use `with_format("pandas", columns=["a", "b"])` for easy conversion of the parts of the dataset to pandas via indexing/slicing.

The full code:
```python
from datasets import Dataset

ds = Dataset.from_dict({'a': [1,2,3], 'b': [5,6,7], 'c': [8,9,10]})
pandas_df = ds.with_format("pandas", columns=['a', 'b'])[:]
```
SCORE: 31.27754783630371
TITLE: `to_pandas` doesn't take into account format.
URL: https://github.com/huggingface/datasets/issues/4476

COMMENT: You can use the `remove_columns` parameter in `map` to avoid duplicating the columns (and save disk space) and then concatenate the original dataset with the map

## Example

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install faiss-gpu

In [None]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

In [None]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_dataset

In [None]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

In [None]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [None]:
df["comments"][0].tolist()

In [None]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

In [None]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

In [None]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

In [None]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

In [None]:
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


comments_dataset = comments_dataset.map(concatenate_text)

In [None]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
import torch

device = torch.device("cuda")
model.to(device)

In [None]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [None]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

In [None]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

In [None]:
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

In [None]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [None]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [None]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()