In [2]:
from datasets import load_dataset

# Load the "quora" dataset through the `datasets` library; later cells read
# the question pairs from dataset["train"]["questions"].
dataset = load_dataset("quora")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd


def flatten_data(dataset):
    """Yield one ``{"id": ..., "text": ...}`` record per question.

    Each entry of ``dataset["train"]["questions"]`` holds parallel ``"id"``
    and ``"text"`` sequences; they are walked together so every question
    becomes its own flat record.
    """
    for question_pair in dataset["train"]["questions"]:
        question_ids = question_pair["id"]
        question_texts = question_pair["text"]
        yield from (
            {"id": question_id, "text": question_text}
            for question_id, question_text in zip(question_ids, question_texts)
        )


# Flatten every question into its own row and report the raw row count.
df = pd.DataFrame(flatten_data(dataset))
print(f"Found {len(df)} training examples")

# Keep a single row per question id (first occurrence wins), then persist.
df = df.drop_duplicates(subset="id")
print(f"Found {len(df)} unique training examples")
df.to_csv("quora_train.csv", index=False)

Found 808580 training examples
Found 537933 unique training examples


In [4]:
# Preview the de-duplicated question texts: this is the set of texts we
# intend to pass to the embedding model and use for tuning embeddings.
df.head()

Unnamed: 0,id,text
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


In [7]:
import chunk
import itertools
from openai import AsyncOpenAI, OpenAI
from typing import AsyncGenerator, List, Literal, Tuple
import asyncio


def batched(iterable, n=1):
    """
    Split ``iterable`` into consecutive lists of at most ``n`` items.

    Batches are produced lazily; the final batch may be shorter than ``n``,
    and nothing is yielded once the input is exhausted.
    """
    source = iter(iterable)

    def take_next_batch():
        return list(itertools.islice(source, n))

    # iter(callable, sentinel) keeps calling until an empty list comes back.
    for group in iter(take_next_batch, []):
        yield group


class Embedder:
    """Helpers for batching ``(id, text)`` pairs and embedding them with OpenAI."""

    @classmethod
    def batch_text(cls, texts: List[Tuple[int, str]], batch_size: int = 32):
        """Yield successive lists of at most ``batch_size`` ``(id, text)`` pairs."""
        yield from batched(texts, batch_size)

    @classmethod
    async def embed_openai(
        cls,
        chunks: List[Tuple[int, str]],
        model: Literal[
            "text-embedding-3-small", "text-embedding-3-large"
        ] = "text-embedding-3-small",
        max_concurrency: int = 32,
    ) -> List[Tuple[int, List[float]]]:
        """Embed every text in ``chunks`` and return ``(id, embedding)`` pairs.

        One embeddings request is issued per text; at most ``max_concurrency``
        requests are in flight at any time.

        Args:
            chunks: ``(id, text)`` pairs. Any iterable works (including a
                one-shot ``zip``), because it is materialized up front.
            model: Name of the OpenAI embedding model to use.
            max_concurrency: Upper bound on simultaneous requests.

        Returns:
            A list of ``(id, embedding)`` tuples in the same order as the input.

        Raises:
            ValueError: If ``max_concurrency`` is smaller than 1.
        """
        if max_concurrency < 1:
            # Semaphore(0) would make every request wait forever.
            raise ValueError("max_concurrency must be at least 1")

        pairs = list(chunks)
        if not pairs:
            # Nothing to embed: skip creating a client (which needs credentials).
            return []

        sem = asyncio.Semaphore(max_concurrency)

        # The context manager closes the client's HTTP connections on exit,
        # even when a request fails.
        async with AsyncOpenAI() as client:

            async def fetch_embedding(idz: int, text_str: str):
                async with sem:
                    response = await client.embeddings.create(
                        input=text_str, model=model
                    )
                    return (idz, response.data[0].embedding)

            # gather() returns results in submission order, so ids stay
            # aligned with the input order.
            results = await asyncio.gather(
                *(fetch_embedding(idz, text_str) for (idz, text_str) in pairs)
            )
        return results


In [10]:
sample = df.sample(100)
sample_tuples = zip(sample["id"], sample["text"])

results = await Embedder.embed_openai(
    chunks=sample_tuples, model="text-embedding-3-small"
)

pd.DataFrame(results, columns=["id", "embedding"]).to_csv("quora_train_embeddings.csv", index=False)

In [11]:
# Reload the saved embeddings to inspect them.
# NOTE(review): CSV stores each embedding as the text form of a list, so the
# "embedding" column presumably comes back as strings, not lists of floats;
# parse it (e.g. with json.loads) before numeric use. Confirm against dtypes.
pd.read_csv("quora_train_embeddings.csv")

Unnamed: 0,id,embedding
0,424100,"[0.013506991788744926, 0.0046718236990273, 0.0..."
1,151107,"[0.013491247780621052, -0.024952838197350502, ..."
2,18093,"[-0.011101662181317806, -0.012250323779881, 0...."
3,39252,"[-0.03395606949925423, -0.002453284105286002, ..."
4,162055,"[0.01155170425772667, -0.010641014203429222, -..."
...,...,...
95,88271,"[0.023234102874994278, -0.04114900156855583, -..."
96,486676,"[0.0038834703154861927, -0.017274843528866768,..."
97,215334,"[0.05282963812351227, -0.0063212220557034016, ..."
98,394546,"[0.012760530225932598, 0.04238694906234741, 0...."
