##  Setup up

In [1]:
import os
import openai
from pathlib import Path
from pprint import pprint
import ray
from tqdm import tqdm

In [2]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()

True

In [3]:
EFS_DIR = Path("/efs/shared_storage/simon")
ROOT_DIR = Path(os.getcwd()).parent
print (ROOT_DIR)

/home/ray/default/llm-applications


In [4]:
# Credentials
ray.init(runtime_env={"env_vars": {
    "OPENAI_API_BASE": os.environ["OPENAI_API_BASE"],
    "OPENAI_API_KEY": os.environ["OPENAI_API_KEY"], 
    "ANYSCALE_API_BASE": os.environ["ANYSCALE_API_BASE"],
    "ANYSCALE_API_KEY": os.environ["ANYSCALE_API_KEY"],
    "DB_CONNECTION_STRING": os.environ["DB_CONNECTION_STRING"],
}})

2023-08-29 18:51:20,386	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.30.102:6379...
2023-08-29 18:51:20,395	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com [39m[22m
2023-08-29 18:51:20,399	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_0724499099e9464ae2421a5de11aa12b.zip' (0.89MiB) to Ray cluster...
2023-08-29 18:51:20,401	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_0724499099e9464ae2421a5de11aa12b.zip'.


0,1
Python version:,3.9.15
Ray version:,2.6.2
Dashboard:,http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com


## Load data

Our data is already ready at `/efs/shared_storage/goku/docs.ray.io/en/master/` (on Staging, `us-east-1`) but if you wanted to load it yourself, run this bash command (change `/desired/output/directory`, but make sure it's on the shared storage,
so that it's accessible to the workers):
```bash
export DOCS_PATH=/efs/shared_storage/simon/docs.ray.io/en/master/
wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $DOCS_PATH https://docs.ray.io/en/master/
```

In [5]:
# Ray dataset
docs_path = Path(EFS_DIR, "docs.ray.io/en/master/")
ds = ray.data.from_items([{"path": path} for path in docs_path.rglob("*.html") if not path.is_dir()])
print(f"{ds.count()} documents")

3266 documents


## Process data

In [6]:
from bs4 import BeautifulSoup, NavigableString, Tag
import matplotlib.pyplot as plt
import pandas as pd

In [7]:
def load_html_file(path):
    with open(path) as f:
        soup = BeautifulSoup(f.read())
    html_tags = [
        ("div", {"role": "main"}),
        ("main", {"id": "main-content"}),
    ]
    text = None
    for tag, attrs in html_tags:
        text = soup.find(tag, attrs)
        # if found, break
        if text is not None:
            break

    return text

In [8]:
class TaggedStr:
    def __init__(self, value, tag):
        self.value = value
        self.tag = tag

    def __repr__(self):
        return repr(self.value) + f" [{self.tag}]" if self.tag else ""

In [9]:
def convert_to_tagged_text(path, element, section=None):
    "Recursively convert a BeautifulSoup element to text, keeping track of sections."
    results = []
    for child in element.children:
        if isinstance(child, NavigableString):
            results.append(TaggedStr(str(child), section))
        elif isinstance(child, Tag):
            if child.name == "section" and "id" in child.attrs:
                results.extend(convert_to_tagged_text(path, child, section=child.attrs["id"]))
            elif not child.find_all("section"):
                results.append(TaggedStr(child.get_text(), section))
            else:
                results.extend(convert_to_tagged_text(path, child, section))
    return results

In [10]:
def group_tagged_text(chunks):
    result = []
    for item in chunks:
        if result and item.value.strip() == "":
            result[-1].value += item.value
        elif result and item.tag == result[-1].tag:
            result[-1].value += item.value
        else:
            result.append(item)
    return result

In [11]:
def path_to_uri(path, scheme="https://", domain="docs.ray.io"):
    return scheme + domain + path.split(domain)[-1]

In [12]:
def parse_file(record):
    html_content = load_html_file(record["path"])
    if not html_content:
        return []
    parsed_data = [
        {
            "source": path_to_uri(str(record["path"])) + ("#" + chunk.tag if chunk.tag else ""),
            "text": chunk.value,
        }
        for chunk in group_tagged_text(convert_to_tagged_text(record["path"], html_content))
    ]
    return parsed_data

In [13]:
# Extract sections
sections_ds = ds.flat_map(parse_file)
sections = sections_ds.take_all()
print (len(sections))

2023-08-29 18:51:24,069	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(parse_file)]
2023-08-29 18:51:24,070	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-08-29 18:51:24,071	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[1m[36m(autoscaler +9s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[2m[1m[36m(autoscaler +9s)[0m Adding 1 node(s) of type worker-node-type-0.
8944


### Subsample data

In [25]:
import random 

In [26]:
SAMPLING_RATIO = 0.05
sampling_percentage = SAMPLING_RATIO * 100 
n_samples = int(SAMPLING_RATIO * len(sections))

In [27]:
val_corpus = random.sample(sections, n_samples)

In [28]:
print(f'Sampled {sampling_percentage}% of full corpus '
      f'with {len(sections)} sections, got {len(val_corpus)} sections')

Sampled 5.0% of full corpus with 8944 sections, got 447 sections


### Generate synthetic evaluation data

In [31]:
from llama_index.schema import Document
from llama_index.finetuning import generate_qa_embedding_pairs

In [33]:
val_nodes = [
    Document(text=section['text'], metadata={'source': section['source']})
    for section in val_corpus
]

In [38]:
os.environ["OPENAI_API_KEY"]

'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'

In [37]:
datataset = generate_qa_embedding_pairs(val_nodes)

  0%|          | 0/447 [00:00<?, ?it/s]


AuthenticationError: No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details.