In [1]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [2]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 3019
})

In [3]:
issues_dataset[:5]

{'html_url': ['https://github.com/huggingface/datasets/pull/2955',
  'https://github.com/huggingface/datasets/pull/2954',
  'https://github.com/huggingface/datasets/pull/2952',
  'https://github.com/huggingface/datasets/pull/2951',
  'https://github.com/huggingface/datasets/pull/2950'],
 'title': ['Update legacy Python image for CI tests in Linux',
  'Run tests in parallel',
  'Fix missing conda deps',
  'Dummy labels no longer on by default in `to_tf_dataset`',
  'Fix fn kwargs in filter'],
 'comments': [[],
  ['There is a speed up in Windows machines:\r\n- From `13m 52s` to `11m 10s`\r\n\r\nIn Linux machines, some workers crash with error message:\r\n```\r\nOSError: [Errno 12] Cannot allocate memory\r\n```',
   'There is also a speed up in Linux machines:\r\n- From `7m 30s` to `5m 32s`'],
  [],
  ["@lhoestq Let me make sure we never need it, and if not then I'll remove it entirely in a follow-up PR.",
   'Thanks ;) it will be less confusing and easier to maintain to not keep unused h

In [4]:
issues_dataset = issues_dataset.filter(lambda x: len(x["comments"]) >0)
issues_dataset = issues_dataset.filter(lambda x: type(x["body"]) is not None)
issues_dataset = issues_dataset.filter(lambda x: len(x["title"]) >0)

In [5]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [6]:
df["comments"][0].tolist()

['There is a speed up in Windows machines:\r\n- From `13m 52s` to `11m 10s`\r\n\r\nIn Linux machines, some workers crash with error message:\r\n```\r\nOSError: [Errno 12] Cannot allocate memory\r\n```',
 'There is also a speed up in Linux machines:\r\n- From `7m 30s` to `5m 32s`']

In [7]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/pull/2954,Run tests in parallel,There is a speed up in Windows machines:\r\n- ...,Run CI tests in parallel to speed up the test ...
1,https://github.com/huggingface/datasets/pull/2954,Run tests in parallel,There is also a speed up in Linux machines:\r\...,Run CI tests in parallel to speed up the test ...
2,https://github.com/huggingface/datasets/pull/2951,Dummy labels no longer on by default in `to_tf...,"@lhoestq Let me make sure we never need it, an...","After more experimentation, I think I have a w..."
3,https://github.com/huggingface/datasets/pull/2951,Dummy labels no longer on by default in `to_tf...,Thanks ;) it will be less confusing and easier...,"After more experimentation, I think I have a w..."


In [8]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 6061
})

In [9]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/6061 [00:00<?, ? examples/s]

In [10]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 37)
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] < 40)
comments_dataset

Filter:   0%|          | 0/6061 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2274 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 107
})

In [11]:
def concatenate_text(examples):
    if examples["comments"] is None:
        examples["comments"] = ""
    if examples["body"] is None:
        examples["body"] = ""
    if examples["title"] is None:
        examples["title"] = ""
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [13]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [14]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [15]:
import torch

device = torch.device("cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [16]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [17]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Notice that we’ve converted the embeddings to NumPy arrays — that’s because 🤗 Datasets requires this format when we try to index them with FAISS, which we’ll do next.

In [18]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 107
})

In [19]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [20]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [21]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [22]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Hi @kswamy15, thanks for reporting.

We are fixing this critical issue and making an urgent patch release of the `datasets` library today.

In the meantime, you can circumvent this issue by updating the `tqdm` library: `!pip install -U tqdm`
SCORE: 42.29082489013672
TITLE: from datasets import Dataset is failing 
URL: https://github.com/huggingface/datasets/issues/2700

COMMENT: Followup: 
From the info in https://github.com/huggingface/datasets/pull/722, I probably should load the videos as array of frames directly into the database. 
This will make the dataset generation time very long, but will make working with the dataset much easier.
SCORE: 41.797019958496094
TITLE: dataset(wlasl): initial loading script
URL: https://github.com/huggingface/datasets/pull/732

COMMENT: Hello @thomwolf 
Thanks for the feedback and for this invitation, indeed I would be glad to join you guys (you can add me). 
I will see if I have the time to implement a couple of datasets. 
Cheers! 
SCORE: 