In [1]:
import os
from dotenv import load_dotenv, find_dotenv
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredHTMLLoader


  from tqdm.autonotebook import tqdm


In [2]:
import os
from pinecone import Pinecone

# Load variables from a local .env file (if one can be found) before reading
# the key. `load_dotenv` / `find_dotenv` are imported in the first cell but
# were never called, so keys kept in .env were invisible on a fresh kernel.
# NOTE(review): with python-dotenv's defaults, variables that are already set
# in the environment are left untouched — confirm for the installed version.
load_dotenv(find_dotenv())

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    # Fail here with an actionable message instead of deeper inside the client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to your .env file "
        "before running this cell."
    )

# configure client
pc = Pinecone(api_key=api_key)


In [3]:
from pinecone import ServerlessSpec

# Serverless deployment settings; passed as `spec=` when the index is created.
spec = ServerlessSpec(region="us-east-1", cloud="aws")

In [4]:
import time

index_name = 'pilot2'

# Names of the indexes the client can already see.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is missing (expected on the very first run).
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        spec=spec,
        metric='cosine',
        dimension=1536,  # dimensionality of ada 002
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then show its stats as the
# cell's output.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [5]:
import os
from bs4 import BeautifulSoup
import re

def extract_text_and_url(file_path):
    """Return the visible text of an HTML file and its ``source:`` URL.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser'.
    ``<script>`` and ``<style>`` elements are removed, the remaining text is
    split on line breaks and double spaces, and the non-empty pieces are
    re-joined with newlines.

    Returns:
        A ``(text, url)`` tuple. ``url`` is the http(s) URL following the
        first ``source:`` marker in the text, or ``""`` when there is none.
        Every ``source: <url>`` occurrence is removed from ``text``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    # Script and style bodies are not readable content.
    for tag in document(["script", "style"]):
        tag.decompose()

    # Normalise whitespace: one trimmed, non-empty fragment per output line.
    fragments = []
    for raw_line in document.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    cleaned = '\n'.join(fragments)

    source_pattern = r"source:\s*(https?://[^\s]+)"
    found = re.search(source_pattern, cleaned)
    if found:
        url = found.group(1)
    else:
        url = ""

    # Strip the source marker(s) so they do not end up in the stored text.
    body = re.sub(source_pattern, "", cleaned).strip()
    return body, url

def read_html_files_and_store(directory):
    """Build one record per ``.html`` file found directly in ``directory``.

    Each record is a dict with three keys: ``'id'`` (the file's base name),
    ``'source'`` (the URL returned by ``extract_text_and_url``, or ``""``)
    and ``'text'`` (the cleaned page text). Records follow the order in which
    ``os.listdir`` reports the files.
    """
    records = []
    for entry in os.listdir(directory):
        # Only names ending in '.html' are processed.
        if not entry.endswith('.html'):
            continue
        path = os.path.join(directory, entry)
        page_text, page_url = extract_text_and_url(path)
        records.append({
            'id': os.path.basename(path),
            'source': page_url or "",
            'text': page_text
        })
    return records



In [6]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `tokenizer` produces for ``text``.

    Used as the length function of the text splitter, so chunk sizes are
    measured in tokens.
    """
    # NOTE(review): disallowed_special=() presumably keeps text that contains
    # special-token strings from being rejected — confirm against tiktoken docs.
    return len(tokenizer.encode(text, disallowed_special=()))
# Splitter that measures length with tiktoken_len (tokens, not characters).
# Separators are listed from blank-line breaks down to the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_overlap=0,
    # chunk_size=5000,  (disabled, so the splitter's own default size applies)
)

In [7]:
from langchain_openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

# Read the key once and fail with a clear message when it is missing.
# The previous `os.environ[...] = os.getenv(...)` round-trip did nothing when
# the key was set and raised an opaque TypeError when it was not, because
# os.environ only accepts str values.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file "
        "before running this cell."
    )

# Embedding client used for the document chunks.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

In [8]:
from tqdm.auto import tqdm
from uuid import NAMESPACE_URL, uuid4, uuid5
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Maximum number of chunks sent in a single embed + upsert request.
batch_limit = 100

# Directory holding the HTML files to ingest.
html_dir = r"C:\Users\Fiberta\Documents\OMC training data\All Combined"


def _flush(ids, texts, metadatas):
    """Embed the pending chunks and upsert them, at most batch_limit per request.

    Slicing keeps every request bounded even when one long document pushes
    the pending lists far past batch_limit in a single loop iteration.
    """
    for start in range(0, len(texts), batch_limit):
        stop = start + batch_limit
        embeds = embed.embed_documents(texts[start:stop])
        index.upsert(
            vectors=list(zip(ids[start:stop], embeds, metadatas[start:stop]))
        )


ids = []
texts = []
metadatas = []
data = read_html_files_and_store(html_dir)
for record in tqdm(data):
    # metadata fields shared by every chunk of this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['source'],
    }
    # split the record text into chunks
    record_texts = text_splitter.split_text(record['text'])
    for j, text in enumerate(record_texts):
        # Deterministic ID derived from (file name, chunk number): re-running
        # this cell overwrites the same vectors instead of adding duplicates,
        # which the previous random uuid4 IDs did on every run.
        ids.append(str(uuid5(NAMESPACE_URL, f"{record['id']}#{j}")))
        texts.append(text)
        metadatas.append({"chunk": j, "text": text, **metadata})
    # once enough chunks are pending, embed and upsert them
    if len(texts) >= batch_limit:
        _flush(ids, texts, metadatas)
        ids, texts, metadatas = [], [], []

# send whatever is left after the last record
if texts:
    _flush(ids, texts, metadatas)

100%|██████████| 2262/2262 [01:17<00:00, 29.20it/s]


# conversation with chatbot


In [9]:
# from langchain.vectorstores import Pinecone

# text_field = "text"  # the metadata field that contains our text

# # initialize the vector store object
# vectorstore = Pinecone(
#     index, embed.embed_query, text_field
# )

In [10]:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA

# # completion llm
# llm = ChatOpenAI(
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
#     model_name='gpt-3.5-turbo',
#     temperature=0.0
# )


In [11]:
# from langchain.chains import RetrievalQAWithSourcesChain

# qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )

In [12]:
# query = "Who is Bill Gates?"

# qa_with_sources.invoke(query)


In [13]:
# pc.delete_index(index_name)