In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

In [2]:
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [3]:
# Remove pull requests and remove issues with no comments
issues_dataset = issues_dataset.filter(lambda x: x["is_pull_request"] == False and len(x["comments"]) > 0)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [4]:
df = issues_dataset.to_pandas()
df = df[["title", "body", "html_url", "comments"]]
issues_dataset = Dataset.from_pandas(df)

print(issues_dataset)
df["comments"][0].tolist()

Dataset({
    features: ['title', 'body', 'html_url', 'comments'],
    num_rows: 808
})


['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [5]:
# Explode data to make a seperate row for each comment in the issue
comments_df = df.explode("comments", ignore_index=True)
comments_df

Unnamed: 0,title,body,html_url,comments
0,Protect master branch,After accidental merge commit (91c55355b634d0d...,https://github.com/huggingface/datasets/issues...,"Cool, I think we can do both :)"
1,Protect master branch,After accidental merge commit (91c55355b634d0d...,https://github.com/huggingface/datasets/issues...,@lhoestq now the 2 are implemented.\r\n\r\nPle...
2,Backwards compatibility broken for cached data...,## Describe the bug\r\nAfter upgrading to data...,https://github.com/huggingface/datasets/issues...,Hi ! I guess the caching mechanism should have...
3,Backwards compatibility broken for cached data...,## Describe the bug\r\nAfter upgrading to data...,https://github.com/huggingface/datasets/issues...,"If it's easy enough to implement, then yes ple..."
4,Backwards compatibility broken for cached data...,## Describe the bug\r\nAfter upgrading to data...,https://github.com/huggingface/datasets/issues...,Well it can cause issue with anyone that updat...
...,...,...,...,...
2959,Issue to read a local dataset,"Hello,\r\n\r\nAs proposed by @thomwolf, I open...",https://github.com/huggingface/datasets/issues/2,My first bug report ❤️\r\nLooking into this ri...
2960,Issue to read a local dataset,"Hello,\r\n\r\nAs proposed by @thomwolf, I open...",https://github.com/huggingface/datasets/issues/2,"Ok, there are some news, most good than bad :l..."
2961,Issue to read a local dataset,"Hello,\r\n\r\nAs proposed by @thomwolf, I open...",https://github.com/huggingface/datasets/issues/2,"Ok great, so as discussed today, let's:\r\n- h..."
2962,Issue to read a local dataset,"Hello,\r\n\r\nAs proposed by @thomwolf, I open...",https://github.com/huggingface/datasets/issues/2,Good plan!\r\n\r\nYes I do use `builder_kwargs...


In [6]:
comments_dataset = Dataset.from_pandas(comments_df)

# Filter out short comments
comments_dataset = comments_dataset.filter(lambda x: len(x["comments"]) > 15)
comments_dataset

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'body', 'html_url', 'comments'],
    num_rows: 2898
})

In [7]:
# Create text embeddings
checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [8]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    # Tokenize text
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    
    # Move all tensors in encoded_input to GPU (i.e. "input_ids", "token_type_ids", "attention_mask")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    # Get model output (AKA embeddings)
    model_output = model(**encoded_input)
    
    # We only need the embeddings from the CLS token 
    # which is the first token in the sequence and
    # holds the meaning of the entire sequence
    return cls_pooling(model_output)

# Add text column (combined title, body, comments)
text_dataset = comments_dataset.map(lambda x: {
	"text": x["title"] + "\n" + x["body"] + "\n" + x["comments"]
})

embeddings_dataset = text_dataset.map(
    lambda x: {"embeddings": [get_embeddings(o).detach().cpu().numpy()[0] for o in x["text"]]},
    batched=True,
	batch_size=128,
)

Map:   0%|          | 0/2898 [00:00<?, ? examples/s]

Map:   0%|          | 0/2898 [00:00<?, ? examples/s]

In [9]:
# FAISS search
embeddings_dataset.add_faiss_index(column="embeddings")
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

  0%|          | 0/3 [00:00<?, ?it/s]

(1, 768)

In [10]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)
samples_df

Unnamed: 0,title,body,html_url,comments,text,embeddings,scores
4,Discussion using datasets in offline mode,"`datasets.load_dataset(""csv"", ...)` breaks if ...",https://github.com/huggingface/datasets/issues...,Requiring online connection is a deal breaker ...,Discussion using datasets in offline mode\n`da...,"[-0.47318094968795776, 0.24578382074832916, -0...",25.505016
3,Discussion using datasets in offline mode,"`datasets.load_dataset(""csv"", ...)` breaks if ...",https://github.com/huggingface/datasets/issues...,"The local dataset builders (csv, text , json a...",Discussion using datasets in offline mode\n`da...,"[-0.44908520579338074, 0.2095070332288742, -0....",24.555531
2,Discussion using datasets in offline mode,"`datasets.load_dataset(""csv"", ...)` breaks if ...",https://github.com/huggingface/datasets/issues...,I opened a PR that allows to reload modules th...,Discussion using datasets in offline mode\n`da...,"[-0.47164809703826904, 0.2902272641658783, -0....",24.148987
1,Discussion using datasets in offline mode,"`datasets.load_dataset(""csv"", ...)` breaks if ...",https://github.com/huggingface/datasets/issues...,"> here is my way to load a dataset offline, bu...",Discussion using datasets in offline mode\n`da...,"[-0.4992601275444031, 0.22699803113937378, -0....",22.894005
0,Discussion using datasets in offline mode,"`datasets.load_dataset(""csv"", ...)` breaks if ...",https://github.com/huggingface/datasets/issues...,"here is my way to load a dataset offline, but ...",Discussion using datasets in offline mode\n`da...,"[-0.49025753140449524, 0.22889599204063416, -0...",22.406654
