## Importing Libraries

In [134]:
import os
import glob
from dotenv import load_dotenv
import numpy as np
import tiktoken
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [135]:
MODEL = "gpt-4.1-nano"
db_name = 'vector_db'

In [136]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    print(openai_api_key[:8])
else:
    print('OpenAI API key not found.')

sk-proj-


## Examine the Documents

In [137]:
knowledge_base_path = "../knowledge-base/**/*.md"

entire_knowledge_base = ""

In [138]:
filenames = glob.glob(knowledge_base_path)
print(f'The total no of files are {len(filenames)}')

for filename in filenames:
    with open(filename, 'r',encoding = 'utf-8') as f:
        entire_knowledge_base+= f.read()
        entire_knowledge_base += '\n\n'

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

The total no of files are 76
Total characters in knowledge base: 304,434


In [140]:
filenames = glob.glob(knowledge_base_path)
print(f'The total no of files are {len(filenames)}')
extra = {}
for filename in filenames:
    name = Path(filename).stem
    with open(filename, 'r',encoding = 'utf-8') as f:
        extra[name.lower()] = f.read()
all_text_combined = '\n\n'.join(extra.values()) + '\n\n'

print(f"Total characters: {len(all_text_combined):,}")

The total no of files are 76
Total characters: 304,434


## Count the number of Tokens

In [142]:
encoding = tiktoken.encoding_for_model(MODEL)
tokens = encoding.encode(entire_knowledge_base)
print(f"The total number of tokens used are: {len(tokens):,}")

The total number of tokens used are: 63,555


## LangChain Document Object

In [191]:
folders = glob.glob('../knowledge-base/*')

documents = []

for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob = '**/*.md', loader_cls = TextLoader, loader_kwargs = {'encoding':'utf-8'})
    folder_docs = loader.load()

    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)
        
print(f"Loaded {len(documents)} documents")

Loaded 76 documents


## Split each Documents into Chunks

In [195]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap= 200)
chunks = text_splitter.split_documents(documents)

print(chunks[1])

page_content='### Seamless Integrations
Rellm's architecture is designed for effortless integration with existing systems. Whether it's policy management, claims processing, or financial reporting, Rellm connects seamlessly with diverse data sources to create a unified ecosystem.

### Risk Assessment Module
The comprehensive risk assessment module within Rellm allows insurers to evaluate risk profiles accurately. By leveraging historical data and advanced modeling techniques, Rellm provides a clear picture of potential liabilities and expected outcomes.

### Customizable Dashboard
Rellm features a customizable dashboard that presents key metrics and performance indicators in an intuitive interface. Users can tailor their view to focus on what matters most to their business, enhancing user experience and productivity.' metadata={'source': '../knowledge-base/products/Rellm.md', 'doc_type': 'products'}
