In [None]:
!pip install datasets
!pip install transformers
!pip install faiss-gpu

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from datasets import Dataset
import faiss

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load two slices of the "train" split: the first 10% and the last 5%.
train_ds, test_ds = load_dataset("lewtun/github-issues", split=['train[0:10%]', 'train[95%:100%]'])

# Keep only rows that are not pull requests and that have at least one comment.
train_ds = train_ds.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)

# Drop every column except the ones used downstream. A plain set difference
# (rather than a symmetric difference) guarantees that only names actually
# present in the dataset are passed to remove_columns, even if a name in
# columns_to_keep were ever missing from the dataset.
columns = train_ds.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns).difference(columns_to_keep)
train_ds = train_ds.remove_columns(columns_to_remove)

# Switch the output format to pandas and pull all rows out as `df`.
train_ds.set_format("pandas")
df = train_ds[:]
# Each row holds a list of comments; explode it so there is one row per comment.
comments_df = df.explode("comments", ignore_index=True)
comments_dataset = Dataset.from_pandas(comments_df)
# Record the whitespace-delimited word count of every comment ...
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)
# ... and keep only comments longer than 15 words.
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading and preparing dataset json/lewtun--github-issues to /root/.cache/huggingface/datasets/lewtun___json/lewtun--github-issues-cff5093ecc410ea2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/lewtun___json/lewtun--github-issues-cff5093ecc410ea2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/302 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196 [00:00<?, ? examples/s]

In [None]:
def concatenate_text(examples):
    """Join an issue's title, body and comment into a single ``text`` field.

    Args:
        examples: Mapping with ``"title"``, ``"body"`` and ``"comments"``
            entries. An entry that is ``None`` (for example an issue opened
            without a description) is treated as empty text instead of
            raising ``TypeError``.

    Returns:
        A dict ``{"text": ...}`` with the three fields joined by ``" \\n "``,
        in the order title, body, comments.
    """
    # `or ""` maps None (and the already-empty string) to "", so string inputs
    # produce exactly the same result as plain `+` concatenation would.
    parts = [examples[key] or "" for key in ("title", "body", "comments")]
    return {"text": " \n ".join(parts)}


# Add the combined "text" column to every example.
comments_dataset = comments_dataset.map(function=concatenate_text)

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

In [None]:
# Model declaration: load the tokenizer and encoder for the checkpoint and
# place the encoder on the selected device.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)
print("Model loaded")

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Model loaded


In [None]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing.

    Returns:
        ``last_hidden_state`` with its second axis fixed at index 0.
    """
    hidden_states = model_output.last_hidden_state
    first_token = hidden_states[:, 0]
    return first_token

def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: Text handed straight to ``tokenizer``. Callers in this
            notebook pass both a single string and a list of strings.

    Returns:
        The result of ``cls_pooling`` applied to the model output, computed
        on ``device``. The tensor carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor to the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disable gradient tracking so no autograd graph is built
    # and kept alive for each forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Create an embeddings dataset
def _embed_example(example):
    """Compute the embedding for one example's ``text`` field."""
    pooled = get_embeddings(example["text"])
    # Convert to NumPy on the CPU and keep the first row of the pooled output.
    return {"embeddings": pooled.detach().cpu().numpy()[0]}


embeddings_dataset = comments_dataset.map(_embed_example)

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

In [None]:
# FAISS for efficient similarity search
# The checkpoint in use ("...-dot-v1") is a dot-product model, so build an
# inner-product index instead of relying on the default L2 (distance) metric.
# Scores returned by nearest-neighbour queries are then similarities, where
# higher means a better match; this agrees with the descending sort applied
# to the scores further down in the notebook.
embeddings_dataset.add_faiss_index(
    column="embeddings", metric_type=faiss.METRIC_INNER_PRODUCT
)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 132
})

In [None]:
# Embed the query as a one-element batch and convert it to a NumPy array.
question = "How can I load a dataset offline?"
question_tensor = get_embeddings([question])
question_embedding = question_tensor.cpu().detach().numpy()
# Display the array's shape as the cell output.
question_embedding.shape

(1, 768)

In [None]:
# Query the "embeddings" index for the 5 nearest examples to the question.
nearest = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)
scores, samples = nearest

In [None]:
# Collect the retrieved examples in a DataFrame, attach their scores and
# order the rows from highest to lowest score.
samples_df = pd.DataFrame.from_dict(samples).assign(scores=scores)
samples_df = samples_df.sort_values("scores", ascending=False)

In [None]:
# Print a short report for every retrieved row, followed by a separator line.
separator = "=" * 50
for _, hit in samples_df.iterrows():
    report_lines = [
        f"COMMENT: {hit['comments']}",
        f"SCORE: {hit['scores']}",
        f"TITLE: {hit['title']}",
        f"URL: {hit['html_url']}",
        separator,
    ]
    print("\n".join(report_lines))


COMMENT: Thanks for reporting ! #2852 fixed this error

We'll do a new release of `datasets` soon :)
SCORE: 43.938777923583984
TITLE: Cannot load linnaeus dataset
URL: https://github.com/huggingface/datasets/issues/2821
COMMENT: > * For the platform, we need to know the operating system of your machine. Could you please run the command `datasets-cli env` and copy-and-paste its output below?
> * In relation with the error, you just gave us the error type and message (`TypeError: 'NoneType' object is not callable`). Could you please copy-paste the complete stack trace, so that we know exactly which part of the code threw the error?

1. For the platform, here are the output:
        - datasets` version: 1.11.0
        - Platform: Windows-10-10.0.19041-SP0
        - Python version: 3.7.10
        - PyArrow version: 5.0.0
2. For the code and error：
     ```python
     from datasets import load_dataset, load_metric
     dataset = load_dataset("glue", "cola")
    ```
    ```pyt