In [3]:
import os
from dotenv import load_dotenv


# Load variables from a .env file (if one is found) into the process
# environment so the os.getenv lookups below can read them.
load_dotenv()


# Connection settings for the OpenAI endpoint and the vector store, read from
# the process environment. Each constant is None when its variable is unset.
(
    OPENAI_API_TYPE,
    OPENAI_API_KEY,
    OPENAI_API_BASE,
    VECTOR_STORE_ADDRESS,
    VECTOR_STORE_KEY,
    INDEX_NAME,
) = (
    os.environ.get(variable)
    for variable in (
        "OPENAI_API_TYPE",
        "OPENAI_API_KEY",
        "OPENAI_API_BASE",
        "VECTOR_STORE_ADDRESS",
        "VECTOR_STORE_KEY",
        "INDEX_NAME",
    )
)



### Load Websites

In [None]:
import json
import glob

# NOTE(review): both locations are hard-coded to one machine's filesystem. The
# manifest path is absolute under a user's home directory, and the HTML glob is
# anchored at the filesystem root with a different directory spelling
# ("fairwork_site" vs "fair_work_site"). Confirm the intended locations and
# consider deriving both from one configurable data directory.
manifest_path = "/Users/reespawson/Documents/Playground/llm_consulting/advancer/POC/fair_work_site/manifest_dump.json"

# Sorted so pages are ingested in the same order on every run; glob.glob
# returns matches in arbitrary, filesystem-dependent order.
html_files = sorted(glob.glob("/fairwork_site/html/*.html"))

# The path and the parsed content live under different names, so `manifest`
# is only ever the parsed JSON. Later cells iterate over it and read the
# 'filename' and 'url' keys of each entry.
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open(manifest_path, 'r', encoding='utf-8') as f:
    manifest = json.load(f)



In [None]:
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Chunking policy for the scraped pages: a chunk size of 1000 and an overlap
# of 100 between neighbouring chunks, both measured with len (characters).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len)


def search_manifest_by_filename(filename, entries=None):
    """Return the URL recorded in the manifest for a saved HTML file.

    Args:
        filename: File name to look up; compared for exact equality with each
            entry's ``'filename'`` value.
        entries: Optional iterable of manifest entries (mappings with
            ``'filename'`` and ``'url'`` keys). When omitted, the module-level
            ``manifest`` is searched, so existing one-argument calls behave
            as before.

    Returns:
        The ``'url'`` value of the first entry whose ``'filename'`` matches.

    Raises:
        ValueError: If no entry matches ``filename``.
    """
    if entries is None:
        # Fall back to the manifest loaded earlier in the notebook.
        entries = manifest

    for entry in entries:
        if entry['filename'] == filename:
            return entry['url']

    raise ValueError('No entry found for filename: {}'.format(filename))


# Load every saved page, tag it with the URL found for it in the manifest,
# then cut the collected documents into chunks.
all_documents = []
for filename in html_files:
    # Parse the HTML file into LangChain Document objects.
    page_documents = UnstructuredHTMLLoader(filename).load()

    # The manifest is searched with an 'html/<basename>' key, so rebuild that
    # key from the local path before looking up the URL.
    site_url = search_manifest_by_filename('html/' + os.path.basename(filename))

    # Tag every document the loader returned, not only the first, so no chunk
    # loses its source URL if the loader yields more than one document.
    for document in page_documents:
        document.metadata['site_url'] = site_url

    all_documents.extend(page_documents)

# Splitting copies each document's metadata onto its chunks, so the tagging
# above has to happen before this call.
split_websites = text_splitter.split_documents(all_documents)


In [None]:
# Sanity check: how many chunks the website pages were split into.
print(len(split_websites))

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch



# Embedding client. `model` is passed as the deployment name.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single text to satisfy the endpoint's batch limit; confirm.
model: str = "text-embedding-ada-002"
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(chunk_size=1, deployment=model)

# Client for the search index named in the environment configuration. It is
# handed the single-query embedding method as its embedding function.
vector_store: AzureSearch = AzureSearch(
    index_name=INDEX_NAME,
    azure_search_key=VECTOR_STORE_KEY,
    azure_search_endpoint=VECTOR_STORE_ADDRESS,
    embedding_function=embeddings.embed_query,
)

# Push the website chunks into the index.
vector_store.add_documents(split_websites)

### Load CSVs

In [5]:
import glob
# CSV files to ingest, matched relative to the current working directory.
# Sorted so rows are ingested in the same order on every run; glob.glob
# returns matches in arbitrary, filesystem-dependent order.
csv_files = sorted(glob.glob('docx/*.csv'))

In [None]:
from langchain.schema.document import Document
import csv

# Splitter for the CSV text: a chunk size of 1000 and an overlap of 100
# between neighbouring chunks, both measured with len (characters).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len)


# Turn every CSV row into a Document: the 'text' column becomes the content,
# and the 'legislation', 'part' and 'division' columns become metadata.
all_csv_documents = []
for csv_file in csv_files:
    # newline='' lets the csv module do its own line-ending handling, which it
    # needs in order to parse quoted fields containing line breaks correctly.
    # utf-8-sig decodes UTF-8 and drops a leading byte-order mark, which would
    # otherwise become part of the first column name and break the key lookups.
    with open(csv_file, 'r', newline='', encoding='utf-8-sig') as infile:
        # Stream rows straight from the reader; each row is a dict keyed by
        # the header names.
        for row in csv.DictReader(infile):
            all_csv_documents.append(
                Document(
                    page_content=row['text'],
                    metadata={
                        "legislation": row['legislation'],
                        "part": row['part'],
                        "division": row['division'],
                    },
                )
            )


In [None]:
# Cut the row-level documents into chunks using the splitter configured above.
split_docs = text_splitter.split_documents(all_csv_documents)

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch


# Embedding client and index client are rebuilt here, so this section does not
# rely on the objects created in the website section.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single text to satisfy the endpoint's batch limit; confirm.
model: str = "text-embedding-ada-002"
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(chunk_size=1, deployment=model)

# Same index as configured in the environment. The store is handed the
# single-query embedding method as its embedding function.
vector_store: AzureSearch = AzureSearch(
    index_name=INDEX_NAME,
    azure_search_key=VECTOR_STORE_KEY,
    azure_search_endpoint=VECTOR_STORE_ADDRESS,
    embedding_function=embeddings.embed_query,
)

# Push the CSV chunks into the index.
vector_store.add_documents(split_docs)