In [1]:
!pip install -qU \
  datasets==2.14.6 \
  openai==1.2.2 \
  pinecone-client==3.0.1


[notice] A new release of pip available: 22.2.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Dataset Download

We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset.

In [2]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 153M/153M [00:25<00:00, 6.03MB/s]
Downloading data files: 100%|██████████| 1/1 [00:25<00:00, 25.39s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 23.06it/s]
Generating train split: 41584 examples [00:01, 24394.06 examples/s]


Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [3]:
data = data.map(lambda x: {
    "id": f'{x["id"]}-{x["chunk-id"]}',
    "text": x["chunk"],
    "metadata": {
        "title": x["title"],
        "url": x["source"],
        "primary_category": x["primary_category"],
        "published": x["published"],
        "updated": x["updated"],
        "text": x["chunk"],
    }
})
# drop uneeded columns
data = data.remove_columns([
    "title", "summary", "source",
    "authors", "categories", "comment",
    "journal_ref", "primary_category",
    "published", "updated", "references",
    "doi", "chunk-id",
    "chunk"
])
data

Map:   0%|          | 0/41584 [00:00<?, ? examples/s]

Map: 100%|██████████| 41584/41584 [00:09<00:00, 4290.30 examples/s]


Dataset({
    features: ['id', 'text', 'metadata'],
    num_rows: 41584
})

First we define our embedding function.

In [8]:
import openai

openai.api_key = "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"

def embed(docs: list[str], name: str) -> list[list[float]]:
    res = openai.embeddings.create(
        input=docs, model=name
    )
    doc_embeds = [r.embedding for r in res.data]
    return doc_embeds

Initialize connection to Pinecone with free API key + sign up to serverless (for free, you get $100 in credits which will last _forever_)

In [5]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="028a60c0-8fd2-40be-b76a-445eb24a152e")

Setup x2 Pinecone Serverless indices:

In [6]:
import time
from pinecone import ServerlessSpec

spec = ServerlessSpec(cloud='aws', region='us-west-2')

indexes = [
    {
        "name": "text-embedding-ada-002",
        "dimension": 1536
    },
    {
        "name": "text-embedding-3-small",
        "dimension": 1536  # 512
    },
    {
        "name": "text-embedding-3-large",
        "dimension": 3072  # 256
    }
]
# get existing indexes
existing_indexes = pc.list_indexes().names()

# check if index already exists (it shouldn't if this is first time)
for i, index in enumerate(indexes):
    if index["name"] not in existing_indexes:
        # if does not exist, create index
        pc.create_index(
            index["name"],
            dimension=index["dimension"],  # dimensionality of minilm
            metric='dotproduct',
            spec=spec
        )
        # wait for index to be initialized
        while not pc.describe_index(index["name"]).status['ready']:
            time.sleep(1)

    # connect to index
    indexes[i]["index"] = pc.Index(index["name"])
    time.sleep(1)
    # view index stats
    print(indexes[i]["index"].describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


Index everything

In [10]:
from tqdm.auto import tqdm

batch_size = 10000  # how many embeddings we create and insert at once

for index in indexes:
    print(f"Indexing for {index['name']}")
    for i in tqdm(range(0, len(data), batch_size)):
        passed = False
        # find end of batch
        i_end = min(len(data), i+batch_size)
        # create batch
        batch = data[i:i_end]
        embeds = embed(batch["text"], name=index["name"])
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        # upsert to Pinecone
        index["index"].upsert(vectors=to_upsert)

Indexing for text-embedding-ada-002


  0%|          | 0/5 [00:22<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-ada-002 in organization org-YVlre9siViE9z5bI9RUkbYcu on tokens per min (TPM): Limit 1000000, Requested 2474179. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

Let's create a `get_docs` function so we can quickly compare results

In [18]:
def get_docs(query: str, top_k: int, model: int) -> list[str]:
    print(f"Getting docs with {indexes[model]['name']}")
    # encode query
    xq = embed([query], name=indexes[model]["name"])[0]
    # search pinecone index
    res = indexes[model]["index"].query(vector=xq, top_k=top_k, include_metadata=True)
    # get doc text
    docs = [x["metadata"]['text'] for x in res["matches"]]
    return docs

Try different embedding models by switching the `model` parameter:

* `0` is `text-embedding-ada-002`
* `1` is `text-embedding-3-small`
* `2` is `text-embedding-3-large`

In [35]:
doc = get_docs(
    query="what is the difference between llama and gpt-4?",
    top_k=5,
    model=2
)
print(">>>")
for d in doc:
    print(d)
    print(">>>")

Getting docs with text-embedding-3-large
>>>
to GPT-3 corresponds to the Stanford Alpaca model. From Figure 3(a), we observe that ( i) For the
“Helpfulness” criterion, GPT-4 is the clear winner with 54.12% of the votes. GPT-3 only wins 19.74%
of the time. ( ii) For the “Honesty” and “Harmlessness” criteria, the largest portion of votes goes
to the tie category, which is substantially higher than the winning categories but GPT-3 (Alpaca) is
slightly superior.
Second, we compare GPT-4-instruction-tuned LLaMA models against the teacher model GPT-4 in
Figure 3(b). The observations are quite consistent over the three criteria: GPT-4-instruction-tuned
LLaMA performs similarly to the original GPT-4. We conclude that learning from GPT-4 generated
5
60% 70% 80% 90% 100%12345BRanking Group 94% 624 : 66792% 614 : 67091% 623 : 68289% 597 : 66989% 605 : 67891% 609 : 666
>>>
based on the Transformer architecture [VSP+17] and trained on massive corpora of web-text data, using at its
core a self-super