In [1]:
import os
import shutil
import warnings
import polars as pl
import lance

from datasets import Dataset, load_dataset, IterableDataset

In [2]:
Q_SEP = "Q:\n\n"
A_SEP = "\n\nA:\n\n"

def process_qna(series: pl.Series):
    """Split raw ``Q: ... A: ...`` strings into ``question`` and ``answer`` columns.

    The leading ``Q_SEP`` marker is stripped from every value, then each value
    is split on ``A_SEP`` into at most three pieces. The first two pieces are
    named ``question`` and ``answer``; the third piece (whatever follows a
    second ``A_SEP``) is named ``_`` and dropped from the result.
    """
    pieces = series.str.strip_prefix(Q_SEP).str.splitn(A_SEP, 3)
    named = pieces.struct.rename_fields(["question", "answer", "_"])
    return named.struct.unnest().drop("_")



STREAM_BATCH_BYTES = 1 << 27 # ~128MB

def process_batch(path: str, batch_iter, i: int):
    """Pull one batch from ``batch_iter``, split it into Q/A columns and write it with Lance.

    path: base location of the dataset; batch 0 is written there directly,
        any later batch ``i`` is written to ``f"{path}_batch_{i}"``.
    batch_iter: iterator whose items expose ``"text"`` and ``"meta"`` entries.
    i: index of the batch, used for logging and for choosing the target path.

    ``next(batch_iter)`` is not guarded, so ``StopIteration`` propagates to the
    caller once the iterator is exhausted.
    """
    print(f"Processing Batch: {i}")
    raw = next(batch_iter)
    qna = process_qna(pl.Series("text", raw["text"]))
    frame = qna.with_columns(pl.Series("meta", raw["meta"]))
    print(f"Writing Batch: {i}")
    target = path if i <= 0 else f"{path}_batch_{i}"
    lance.write_dataset(frame.to_arrow(), target)

def merge_batches(path: str, batch_count: int):
    """Fold the auxiliary batch datasets into the dataset stored at ``path``.

    For every batch index from 1 up to ``batch_count - 1`` the dataset at
    ``f"{path}_batch_{index}"`` is inserted into the base dataset and its
    directory is then removed from disk.
    """
    target = lance.dataset(path)
    for batch_no in range(1, batch_count):
        print(f"Appending Batch: {batch_no}")
        staged_path = f"{path}_batch_{batch_no}"
        staged = lance.dataset(staged_path)
        target.insert(staged)
        # The staged copy is no longer needed once its rows are in the base dataset.
        shutil.rmtree(staged_path)
    

def get_lance_dataset(path: str, batch_limit: int | None=None, force_download=False):
    """
    Open the Lance dataset at `path`, streaming and building it first if needed.

    **Parameters**\n
    path: str - make sure to make it an absolute path\n
    batch_limit: int - (optional) if included, only up to x batches, else downloads whole dataset\n
    force_download: bool - whether to redownload the dataset if one exists

    **Returns**\n
    -> LanceDataset - Lance dataset (folder)

    **Raises**\n
    AssertionError - if not a single batch could be processed
    """
    if os.path.exists(path) and not force_download:
        return lance.dataset(path)

    if os.path.exists(path):
        # force_download: clear the old copy, otherwise batch 0 would be written
        # (in Lance's default create mode) onto an already existing dataset.
        shutil.rmtree(path)

    ds = load_dataset("bigscience-data/roots_code_stackexchange", streaming=True)["train"]
    train_info = ds.info.splits["train"]
    # Convert the byte budget into a row count using the average row size;
    # clamp to 1 so a tiny budget can never produce a zero-sized batch.
    rows_per_byte = train_info.num_examples / train_info.num_bytes
    stream_batch_size = max(1, int(STREAM_BATCH_BYTES * rows_per_byte))
    batch_iter = iter(ds.batch(stream_batch_size))

    # `c` counts successfully written batches and doubles as the index of the
    # next batch, so limited and unlimited runs share one loop.
    c = 0
    while batch_limit is None or c < batch_limit:
        try:
            process_batch(path, batch_iter, c)
        except StopIteration:
            # Stream exhausted: normal termination, nothing to report.
            break
        except Exception as e:
            # Best effort: keep the batches written so far and stop here.
            warnings.warn(f"Caught exception in iterator: {e}")
            break
        c += 1

    print(f"Processed Batches: {c}")
    assert c > 0, f"Empty or error-ridden dataset: {path}"

    print("\nSaving Dataset...")
    if c > 1:
        merge_batches(path, c)

    # Open the dataset only after merging so the returned handle reflects the
    # appended batches rather than the state before the merge.
    base_dataset = lance.dataset(path)
    print("Saved Dataset")

    return base_dataset

# Build (or reuse) the dataset under <cwd>/cache, limited to the first 5 batches.
db_name = "stackexchange_base_db_lance"
lance_dataset = get_lance_dataset(
    os.sep.join((os.getcwd(), "cache", db_name)),
    batch_limit=5,
)

In [9]:
# Batch size passed to the Lance reader when iterating over the dataset.
LOAD_BATCH_SIZE = 2 ** 16

# Batch-wise reader over the dataset; the recorded output of the next cell
# shows that it yields pyarrow.RecordBatch objects.
lance_batches = lance_dataset.to_batches(batch_size=LOAD_BATCH_SIZE)

In [12]:
# Take one batch from the reader and display it.
# NOTE(review): if `lance_batches` is a one-shot iterator, re-running this cell
# returns the following batch instead of the same one — confirm.
batch = next(iter(lance_batches))
batch

pyarrow.RecordBatch
question: large_string
answer: large_string
meta: large_string
----
question: ["Does image on the page improves SEO rankings?

My friend once told that I should include big image on the main page of my website. And it will be good for SEO.
So does image on the page improves SEO rankings?","What happens with WHO IS privacy protection when the domain is not active?

I'm running small Wordpress site (about 15k visitors per month) and I want to buy a custom domain and monetize it. During the registration I'm asked for personal info, I know I have to provide it. I also want to buy WHO IS privacy protection, but I have a question - it's for one year, the domain is too. What happens if I won't pay for my domain after one year - will all my personal information be visible, or will it dissapear cause the domain is no longer active?
Thanks for your answers. ","Using established domain names and phrases in your sentences as a keywords to boost your rank?

Let's imagine a situa

In [15]:
# Convert the Arrow batch into a polars frame and preview its first rows.
pl_batch = pl.from_arrow(batch)
pl_batch.head()

question,answer,meta
str,str,str
"""Does image on the page improve…","""Yes and no. It is content that…","""{'file': 'webmasters.stackexch…"
"""What happens with WHO IS priva…","""The short answer is that it de…","""{'file': 'webmasters.stackexch…"
"""Using established domain names…","""There are a few things you sho…","""{'file': 'webmasters.stackexch…"
"""PostmarkApp inbound domain cre…","""You need to add the MX record …","""{'file': 'webmasters.stackexch…"
"""does google webmaster track cl…","""Yes you can track these click …","""{'file': 'webmasters.stackexch…"


In [24]:
# Preview only the first few answers; dumping the whole column floods the cell output.
pl_batch["answer"].head(5).to_list()

["Yes and no.\nIt is content that ranks and image tags offer some to the total performance of the page, however, it is limited.\nWhere your friend is right is engagement. Google looks for elements that offer engagement. An image is the most basic engagement there is. At one point, Google was not shy about saying that an image should exist at the top of the content. SEO's agreed for several reasons and so for almost every page you will see on the web, there is an image at the top of the content.\nSo besides the value of the alt text and any potential that the path and file name give, Google credits the image as engagement. However, this too is limited.\nFrom a user experience (UX) perspective, it does cause a pause in the users visit and may entice the user to read your content. It is just one element, however, it is the UX effect that is the greatest. It can extend the time spent on page and reduce bounce rates somewhat which are other metrics Google can use to rank a page.",
 'The sho

In [None]:
import torch
from sentence_transformers import SentenceTransformer


In [None]:
# NOTE(review): unfinished scratch cell — the model name is an empty string and
# encode() is called with no input; fill in both before running this cell.
SentenceTransformer("").encode()