In [None]:
from transformers import pipeline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import tqdm

In [None]:
# Pick the best available torch backend instead of hard-coding Apple's MPS,
# so the notebook also runs on CUDA or CPU-only machines.
import torch

_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def batch_encoding_to_device(batch):
    """Return a plain dict with every value of ``batch`` moved to ``device``.

    Each value is moved by calling its ``.to(device)`` method, where ``device``
    is the notebook-level global.
    """
    moved = {}
    for name, value in batch.items():
        moved[name] = value.to(device)
    return moved

## 5. Datasets

### 5.1 What if my dataset isn't on the Hub?

In [None]:
from datasets import load_dataset

In [None]:
squad_it_dataset = load_dataset('json', data_files='data/SQuAD_it-train.json', field='data')

In [None]:
squad_it_dataset

In [None]:
squad_it_dataset['train'][0]['paragraphs'][0]

In [None]:
data_files = {'train': 'data/SQuAD_it-train.json', 'test': 'data/SQuAD_it-test.json'}

In [None]:
squad_it_dataset = load_dataset('json', data_files=data_files, field='data')

In [None]:
squad_it_dataset

In [None]:
# Automatically unzip
#data_files = {"train": "data/SQuAD_it-train.json.gz", "test": "data/SQuAD_it-test.json.gz"}
#squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [None]:
squad_it_dataset

### 5.2 Time to slice and dice

In [None]:
from datasets import load_dataset

In [None]:
data_files = {'train': 'data/drugsComTrain_raw.tsv', 'test': 'data/drugsComTest_raw.tsv'}

In [None]:
drug_dataset = load_dataset('csv', data_files=data_files, delimiter='\t')

In [None]:
drug_dataset

In [None]:
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))

In [None]:
drug_sample[:3]

In [None]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique('Unnamed: 0'))

In [None]:
drug_dataset = drug_dataset.rename_column(
    original_column_name='Unnamed: 0', new_column_name='patient_id'
)

In [None]:
drug_dataset

In [None]:
def lowercase_condition(example):
    """Return a ``condition`` update holding the lower-cased condition of ``example``."""
    lowered = example['condition'].lower()
    return dict(condition=lowered)

In [None]:
# Raises attribute error since some samples are None
#drug_dataset.map(lowercase_condition)

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)

In [None]:
drug_dataset = drug_dataset.map(lowercase_condition)

In [None]:
drug_dataset['train']['condition'][:3]

In [None]:
def compute_review_length(example):
    """Return a ``review_length`` update: the number of whitespace-separated words in the review."""
    words = example['review'].split()
    return dict(review_length=len(words))

In [None]:
drug_dataset = drug_dataset.map(compute_review_length)

In [None]:
drug_dataset['train'][0]

In [None]:
drug_dataset['train'].sort('review_length')[:3]

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)

In [None]:
drug_dataset.num_rows

In [None]:
sorted(drug_dataset['train']['review_length'])[-3:]

In [None]:
import html

In [None]:
text = "I&#039;m a transformer called BERT"

In [None]:
html.unescape(text)

In [None]:
drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})

In [None]:
new_drug_dataset = drug_dataset.map(
    lambda x: {'review': [html.unescape(o) for o in x['review']]}, batched=True
)

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def tokenize_function(examples):
    """Tokenize the ``review`` field with the notebook-level ``tokenizer``, with truncation enabled."""
    reviews = examples['review']
    return tokenizer(reviews, truncation=True)

In [None]:
%time tokenized_data = drug_dataset.map(tokenize_function, batched=True)

In [None]:
%time tokenized_data = drug_dataset.map(tokenize_function, batched=False)

In [None]:
%time tokenized_data = drug_dataset.map(tokenize_function, batched=True, num_proc=10)

In [None]:
def tokenize_and_split(examples):
    """Tokenize reviews with a 128-token limit, also returning the overflowing tokens as extra entries."""
    split_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples['review'], **split_options)

In [None]:
result = tokenize_and_split(drug_dataset['train'][0])

In [None]:
[len(inp) for inp in result['input_ids']]

In [None]:
# Raises arrow invalid for mismatched column lengths 
#tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset['train'].column_names
)

In [None]:
len(tokenized_dataset['train']), len(drug_dataset['train'])

In [None]:
drug_dataset['train'].features

In [None]:
tokenized_dataset['train'].features

In [None]:
def tokenize_and_split(examples):
    """Tokenize reviews with a 128-token limit and keep the input columns aligned.

    Overflowing tokens are returned as additional entries, so the tokenizer
    output can be longer than the input batch. Every input column is
    therefore re-indexed through ``overflow_to_sample_mapping`` so that all
    returned columns have the same length.
    """
    encoded = tokenizer(
        examples['review'], truncation=True, max_length=128, return_overflowing_tokens=True
    )
    # For each produced entry, the index of the input example it came from.
    source_rows = encoded.pop('overflow_to_sample_mapping')
    # Repeat each original value once per entry derived from its example.
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
tokenized_dataset

In [None]:
print(tokenized_dataset['train'][:2])

In [None]:
drug_dataset.set_format('pandas')

In [None]:
drug_dataset['train'][:3]

In [None]:
train_df = drug_dataset['train'][:]

In [None]:
# Count how often each condition occurs. The index and the counts are named
# explicitly so the result has `condition` / `frequency` columns on both
# pandas 1.x and 2.x; renaming "index"/"condition" after reset_index() relied
# on the names value_counts() produced in 1.x and mislabels the columns in 2.x.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)

In [None]:
frequencies.head()

In [None]:
from datasets import Dataset

In [None]:
freq_dataset = Dataset.from_pandas(frequencies)

In [None]:
freq_dataset

In [None]:
drug_mean_rating = (
    train_df[['drugName', 'rating']]
    .groupby(['drugName'])
    .mean()
    .sort_values(by='rating')
    .reset_index()
    .rename(columns={'rating':'mean rating'})
)

In [None]:
drug_mean_rating.tail()

In [None]:
drug_dataset.reset_format()

In [None]:
drug_dataset['train'][:3]

In [None]:
drug_dataset_clean = drug_dataset['train'].train_test_split(train_size=0.8, seed=42)

In [None]:
drug_dataset_clean['val'] = drug_dataset_clean.pop('test')

In [None]:
drug_dataset_clean['test'] = drug_dataset['test']

In [None]:
drug_dataset_clean

In [None]:
drug_dataset_clean.save_to_disk('data/drug-reviews')

In [None]:
from datasets import load_from_disk

In [None]:
drug_dataset_reloaded = load_from_disk('data/drug-reviews')

In [None]:
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f'data/drug-reviews-{split}.jsonl')

In [None]:
data_files = {
    'train': 'data/drug-reviews-train.jsonl',
    'val': 'data/drug-reviews-val.jsonl',
    'test': 'data/drug-reviews-test.jsonl',
}

In [None]:
drug_dataset_reloaded = load_dataset('json', data_files=data_files)

In [None]:
drug_dataset_reloaded

### 5.3 Big data

In [None]:
from datasets import load_dataset, DownloadConfig

In [None]:
data_files = "https://huggingface.co/datasets/casinca/PUBMED_title_abstracts_2019_baseline/resolve/main/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset(
    "json",
    data_files=data_files,
    split="train",
    download_config=DownloadConfig(delete_extracted=True),  # (optional arg)using DownloadConfig to save HD space
)

In [None]:
pubmed_dataset

In [None]:
print (pubmed_dataset[10000000]['text'])

In [None]:
import psutil

In [None]:
f'RAM used: {psutil.Process().memory_info().rss / 1024 ** 2 :.2f} MB'

In [None]:
# `dataset_size` is a size in bytes (it is converted to GB below), not a file count.
dataset_size_bytes = pubmed_dataset.dataset_size
print(f'Dataset size in bytes : {dataset_size_bytes}')
size_gb = dataset_size_bytes / 1024 ** 3
print(f'Dataset size (cache file) : {size_gb:.2f} GB')

In [None]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

# Keep the timing under a name that does not shadow the stdlib `time` module,
# which a later cell imports and uses for `time.sleep`; re-running this cell
# afterwards would otherwise replace the module with a float.
elapsed = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{elapsed:.1f}s, i.e. {size_gb/elapsed:.3f} GB/s"
)

In [None]:
pubmed_dataset_streamed = load_dataset(
    "json",
    data_files=data_files,
    split="train",
    download_config=DownloadConfig(delete_extracted=True),  # (optional arg)using DownloadConfig to save HD space
    streaming=True,
)

In [None]:
pubmed_dataset_streamed

In [None]:
next(iter(pubmed_dataset_streamed))

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x['text']))

In [None]:
tokenized_dataset

In [None]:
print(next(iter(tokenized_dataset)))

In [None]:
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10000, seed=42)

In [None]:
print(next(iter(shuffled_dataset)))

In [None]:
dataset_head = pubmed_dataset_streamed.take(5)

In [None]:
list(dataset_head)

In [None]:
train_dataset = shuffled_dataset.skip(1000)
val_dataset = shuffled_dataset.take(1000)

In [None]:
from itertools import islice
from datasets import interleave_datasets

In [None]:
combined_dataset = interleave_datasets([val_dataset, val_dataset])

In [None]:
list(islice(combined_dataset, 2))

### 5.4 Creating your own dataset

In [None]:
import requests

In [None]:
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [None]:
response.status_code

In [None]:
response.json()

In [None]:
# Read an optional GitHub personal access token from the environment instead
# of hard-coding it in the notebook. `headers` has to exist because later
# cells pass it to requests.get(); without a token it is empty and the
# requests are sent unauthenticated.
import os

github_token = os.environ.get('GITHUB_TOKEN')
headers = {'Authorization': f'token {github_token}'} if github_token else {}

In [None]:
import time
import math
from pathlib import Path

def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
    headers=None,
):
    """Download the issues of a GitHub repository into a JSON Lines file.

    Issues are requested page by page (100 per page, ``state=all`` so open and
    closed ones are both included) and written to
    ``{issues_path}/{repo}-issues.jsonl``.

    Args:
        owner: Owner of the repository.
        repo: Repository name.
        num_issues: Number of issues to request; rounded up to whole pages.
        rate_limit: Once more than this many issues have been collected since
            the last pause, sleep for one hour before continuing.
        issues_path: Directory (a ``Path``) that receives the output file;
            created if it does not exist.
        headers: Optional HTTP headers for the requests (e.g. an
            ``Authorization`` token). When omitted, the notebook-level
            ``headers`` variable is used if it is defined, otherwise no extra
            headers are sent.

    Raises:
        requests.HTTPError: If GitHub answers a page request with an error status.
    """
    if headers is None:
        # Backward compatible: earlier versions read the global `headers`.
        headers = globals().get("headers") or {}

    # parents=True so a nested output directory can be created as well.
    issues_path.mkdir(parents=True, exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub's `page` parameter is 1-based (default 1); starting the loop at 0
    # does not request a distinct first page.
    for page in tqdm.tqdm(range(1, num_pages + 1)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        # An error payload is a dict; extending the batch with it would
        # silently add its keys as bogus "issues".
        response.raise_for_status()
        page_issues = response.json()
        if not page_issues:
            break  # The repository has fewer issues than requested
        batch.extend(page_issues)

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    output_file = f"{issues_path}/{repo}-issues.jsonl"
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(output_file, orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {output_file}"
    )

In [None]:
fetch_issues(num_issues=3000, issues_path=Path('data'))

In [None]:
from datasets import load_dataset

In [None]:
issues_dataset = load_dataset("json", data_files="data/datasets-issues.jsonl", split="train")

In [None]:
issues_dataset

In [None]:
sample = issues_dataset.shuffle(seed=42).select(range(10))

In [None]:
for url, pr in zip(sample['html_url'], sample['pull_request']):
    print (f'>> URL: {url}')
    print (f'>> PR: {pr}\n')

In [None]:
issues_dataset = issues_dataset.map(
    lambda x: {'is_pull_request': x['pull_request'] is not None}
)

In [None]:
issues_dataset['is_pull_request'][:6]

In [None]:
issues_dataset['html_url'][:6]

In [None]:
issues_dataset_df = issues_dataset.filter(lambda x: x['is_pull_request'])

In [None]:
issues_dataset_df.set_format('pandas')

In [None]:
issues_dataset_df = issues_dataset_df[:]

In [None]:
(issues_dataset_df['closed_at'] - issues_dataset_df['created_at']).mean()

In [None]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
def get_comments(issue_number):
    """Return the ``body`` of every comment on a huggingface/datasets issue.

    Sends one GET request to the GitHub comments endpoint for
    ``issue_number``, using the notebook-level ``headers``.
    """
    endpoint = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    payload = requests.get(endpoint, headers=headers).json()
    bodies = []
    for comment in payload:
        bodies.append(comment["body"])
    return bodies

In [None]:
get_comments(issue_number)

In [None]:
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {'comments': get_comments(x['number'])}
)

In [None]:
issues_with_comments_dataset.to_json('data/datasets-issues-comments.jsonl')

### 5.5 Semantic search with FAISS

In [None]:
from datasets import load_dataset

In [None]:
# specify train so it returns a dataset instead of datasetdict
#issues_dataset = load_dataset("json", data_files="data/datasets-issues-comments.jsonl", split="train")

In [None]:
issues_dataset = load_dataset("lewtun/github-issues", split="train")

In [None]:
issues_dataset

In [None]:
issues_dataset = issues_dataset.filter(
    lambda x: (x['is_pull_request'] == False and len(x['comments']) > 0)
)

In [None]:
issues_dataset

In [None]:
columns = issues_dataset.column_names

In [None]:
columns_to_keep = ['title', 'body', 'html_url', 'comments']

In [None]:
# Remove every existing column that is not explicitly kept. Unlike a symmetric
# difference, this never lists a name from `columns_to_keep` for removal when
# that name is absent from the dataset, and it preserves the column order.
columns_to_remove = [name for name in columns if name not in columns_to_keep]

In [None]:
issues_dataset = issues_dataset.remove_columns(columns_to_remove)

In [None]:
issues_dataset.set_format('pandas')

In [None]:
df = issues_dataset[:]

In [None]:
df['comments'][0].tolist()

In [None]:
comments_df = df.explode('comments', ignore_index=True)

In [None]:
comments_df.head()

In [None]:
from datasets import Dataset

In [None]:
# can also make a dataset from dict
comments_dataset = Dataset.from_pandas(comments_df)

In [None]:
comments_dataset

In [None]:
# Explode without using pandas
#issues_dataset.reset_format()

# Example function to explode the comments
def explode_comments(batch):
    """Expand a batch so that each comment gets its own row.

    ``batch`` maps column names to equally long lists, with ``comments``
    holding a list of comments per row. The returned dict has the same
    columns; every other column's value is repeated once per comment of its
    row, and rows without comments are dropped.
    """
    exploded = {column: [] for column in batch}
    for row, row_comments in enumerate(batch['comments']):
        for single_comment in row_comments:
            for column in batch:
                value = single_comment if column == 'comments' else batch[column][row]
                exploded[column].append(value)
    return exploded

#batch = issues_dataset[0:1]
#exploded_batch = explode_comments(batch)

# Use map with batched=True and keep the structure consistent
#exploded_dataset = issues_dataset.map(
#    explode_comments,
#    batched=True,  # Process examples in batches
#)

In [None]:
comments_dataset = comments_dataset.map(
    lambda x: {'comment_length': len(x['comments'].split())}
)

In [None]:
comments_dataset = comments_dataset.filter(
    lambda x: x['comment_length'] > 15
)

In [None]:
comments_dataset

In [None]:
def concatenate_text(examples):
    """Build a ``text`` field from title, body and comment, separated by ``' \\n '``."""
    separator = ' \n '
    joined = examples['title'] + separator + examples['body'] + separator + examples['comments']
    return {'text': joined}

In [None]:
comments_dataset = comments_dataset.map(concatenate_text)

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
#checkpoint = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
checkpoint = 'sentence-transformers/multi-qa-distilbert-cos-v1'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
model = AutoModel.from_pretrained(checkpoint, device_map=device)

In [None]:
model.device

In [None]:
# Total number of elements across all model parameters, shown as log10.
total = sum(p.numel() for p in model.parameters())
np.log10(total)

In [None]:
# for mpnet
def cls_pooling(model_output):
    """Return the hidden state at position 0 of every sequence in ``last_hidden_state``."""
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_embeddings(text_list):
    """Embed ``text_list`` with the notebook-level model, using CLS pooling.

    The texts are tokenized (padded, truncated, PyTorch tensors), moved to
    ``device`` and run through ``model``; the pooled output is returned.
    """
    inputs = batch_encoding_to_device(
        tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')
    )
    return cls_pooling(model(**inputs))

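- `unsqueeze`: insert a new dimension of size 1 at the given position
- `expand`: broadcast size-1 dimensions to a larger size (returns a view; no data is copied)
- `clamp`: clip values to a `[min, max]` range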

In [None]:
# for distilbert
import torch
import torch.nn.functional as F

# take mean of all tokens
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the attention mask.

    The mask is broadcast to the shape of ``last_hidden_state`` and used as a
    weight, so positions with mask 0 do not contribute. The divisor is clamped
    to at least 1e-9 to avoid dividing by zero when a mask is all zeros.
    """
    hidden_states = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total

def get_embeddings(text_list):
    """Embed ``text_list`` with the notebook-level model, using mean pooling.

    Returns one L2-normalised embedding per input, computed without gradient
    tracking on ``device``.
    """
    # Tokenize (padded, truncated) and move the batch to the target device.
    batch = tokenizer(
        text_list, padding=True, truncation=True, return_tensors='pt'
    ).to(device)

    # Forward pass only: no autograd graph is recorded.
    with torch.no_grad():
        outputs = model(**batch, return_dict=True)

    # Mask-weighted mean over tokens, then scale each row to unit L2 norm.
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

In [None]:
embeddings = get_embeddings(comments_dataset['text'][0])

In [None]:
embeddings.shape

In [None]:
embeddings_dataset = comments_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['text']).detach().cpu().numpy()[0]}
)

In [None]:
embeddings_dataset

In [None]:
embeddings_dataset.add_faiss_index(column='embeddings')

In [None]:
question = 'How can I load a dataset offline?'

In [None]:
question_embedding = get_embeddings([question]).detach().cpu().numpy()

In [None]:
question_embedding.shape

In [None]:
scores, samples = embeddings_dataset.get_nearest_examples(
    'embeddings', question_embedding, k=5
)

In [None]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The FAISS index was added without a `metric_type`, i.e. with the default L2
# metric, so `scores` are distances: smaller means closer. Sort ascending to
# list the best matches first, and reassign instead of sorting in place so the
# cell gives the same result when re-run.
samples_df = samples_df.sort_values("scores", ascending=True)

In [None]:
#for _, row in samples_df.iterrows():
#    print(f"COMMENT: {row.comments}")
#    print(f"SCORE: {row.scores}")
#    print(f"TITLE: {row.title}")
#    print(f"URL: {row.html_url}")
#    print("=" * 50)
#    print()

#### Using dot product

In [None]:
query_emb = torch.Tensor(question_embedding).to(device)

In [None]:
query_emb.shape

In [None]:
doc_emb = torch.Tensor(embeddings_dataset['embeddings']).to(device)

In [None]:
doc_emb.shape

In [None]:
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()

In [None]:
doc_score_pairs = list(zip(embeddings_dataset['text'], scores))

In [None]:
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

In [None]:
for doc, score in doc_score_pairs[:5]:
    print(f"DOC: {doc}")
    print(f"SCORE: {score}")
    print("=" * 50)
    print()