In [None]:
from datasets import load_dataset

# Load the "train" split of the GitHub issues dataset and display its summary.
issues_dataset = load_dataset(
    "lewtun/github-issues",
    split="train",
)
issues_dataset

In [31]:
# Keep only rows that are not flagged as pull requests and that carry at
# least one comment.
issues_dataset = issues_dataset.filter(
    lambda issue: (issue["is_pull_request"] == False) and len(issue["comments"]) > 0
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [32]:
# Keep only the fields listed in `columns_to_keep`; drop every other column.
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
# Plain difference (dataset columns minus the keep-list), in dataset order.
# A symmetric difference would also pick up any keep-list name that is absent
# from the dataset, and a set would hand the columns over in arbitrary order.
columns_to_remove = [col for col in columns if col not in columns_to_keep]
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [77]:
issues_dataset[0:2]

{'html_url': ['https://github.com/huggingface/datasets/issues/2945',
  'https://github.com/huggingface/datasets/issues/2943'],
 'title': ['Protect master branch',
  'Backwards compatibility broken for cached datasets that use `.filter()`'],
 'comments': [['Cool, I think we can do both :)',
   '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).'],
  ["Hi ! I guess the caching mechanism should have considered the new `filter` to be different from the old one, and don't use cached results from the old `filter`.\r\nTo avoid other users from having this issue we could make the caching differentiate the two, what do you think ?",
   "If

In [84]:
from datasets import Dataset

def explode(batch):
    """Flatten a batch so that every comment gets its own row.

    Args:
        batch: Mapping of column name -> list of values, one entry per row.
            The ``'comments'`` column must hold a list of comments per row.

    Returns:
        A dict with the same keys as ``batch``. For each comment of each row
        one output row is produced: ``'comments'`` holds that single comment
        and every other column repeats the value of the source row. Rows
        without comments produce no output; if no row has a comment, every
        column is an empty list.
    """
    # Read each column once up front so the inner loops only index plain values.
    columns = {key: batch[key] for key in batch}
    exploded = {key: [] for key in columns}
    for row, comments in enumerate(columns["comments"]):
        for comment in comments:
            for key, values in columns.items():
                exploded[key].append(comment if key == "comments" else values[row])
    return exploded

# Apply `explode` batch-wise so each comment becomes its own row.
comments_dataset = issues_dataset.map(function=explode, batched=True)


Map:   0%|          | 0/808 [00:00<?, ? examples/s]

In [76]:
# Very short comments are rarely helpful: record each comment's
# whitespace-separated word count, then keep only rows with more than 15 words.
comments_dataset = comments_dataset.map(
    lambda row: {"comment_length": len(row["comments"].split())}
).filter(lambda row: row["comment_length"] > 15)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [None]:
def concatenate_text(examples):
    """Build the combined ``text`` field for a single example.

    Args:
        examples: Mapping with ``"title"``, ``"body"`` and ``"comments"``
            entries, each a string or ``None``.

    Returns:
        ``{"text": ...}`` where the value is title, body and comment joined
        with ``" \\n "``. A ``None`` field is treated as an empty string
        instead of raising ``TypeError`` during concatenation.
    """
    parts = (examples["title"], examples["body"], examples["comments"])
    return {"text": " \n ".join(part or "" for part in parts)}


# Add the combined "text" column to every row.
comments_dataset = comments_dataset.map(function=concatenate_text)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [86]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the model from the same checkpoint name.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_ckpt),
    AutoModel.from_pretrained(model_ckpt),
)

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def cls_pooling(model_output):
    """Return the first-position hidden state of every sequence.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing (presumably shaped
            ``(batch, seq_len, hidden)`` - confirm against the model used).

    Returns:
        ``model_output.last_hidden_state[:, 0]``: index 0 along the second
        axis for each item of the first axis.
    """
    # The attribute is `last_hidden_state` (singular); the plural spelling
    # raises AttributeError on Transformers model outputs.
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    """Embed text with the notebook's global ``tokenizer`` and ``model``.

    Args:
        text_list: Text input passed straight to ``tokenizer`` (the notebook
            calls this with both a single string and a list of strings).

    Returns:
        The result of ``cls_pooling`` applied to the model output, located on
        ``device``. No autograd graph is attached to it; calling ``.detach()``
        on the result remains valid.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Inputs must live on the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip gradient tracking so no graph is kept per call.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)


In [None]:
# Embed each row's "text" on its own; `get_embeddings` returns one row per
# input, so take row 0 after moving the result to a NumPy array on the CPU.
embeddings_dataset = comments_dataset.map(
    lambda example: {
        "embeddings": get_embeddings(example["text"]).detach().cpu().numpy()[0]
    }
)


In [None]:
# Index the "embeddings" column with FAISS so it can be searched.
embeddings_dataset.add_faiss_index(column="embeddings")

# Embed the query the same way as the corpus and show the resulting shape.
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).detach().cpu().numpy()
question_embedding.shape

In [None]:
# Fetch the 5 nearest rows to the question embedding, with their scores.
scores, samples = embeddings_dataset.get_nearest_examples(
    index_name="embeddings",
    query=question_embedding,
    k=5,
)

In [None]:
import pandas as pd

# Collect the retrieved rows in a DataFrame, attach their scores and order the
# rows by score, highest first.
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=False)
)

# Print one block per retrieved comment, separated by a rule and a blank line.
separator = "=" * 50
for _, hit in samples_df.iterrows():
    print(f"COMMENT: {hit.comments}")
    print(f"SCORE: {hit.scores}")
    print(f"TITLE: {hit.title}")
    print(f"URL: {hit.html_url}")
    print(separator)
    print()
