In [10]:
from DBHandler import DBHandler

import google.generativeai as genai

# semantic chunking imports
from semantic_router.splitters import RollingWindowSplitter
from semantic_router.encoders import CohereEncoder, OpenAIEncoder
from semantic_router.utils.logger import logger

logger.setLevel("WARNING")  # reduce logs from splitter
import warnings
warnings.filterwarnings("ignore")

from typing import Union

import os
from dotenv import load_dotenv
load_dotenv()


True

In [2]:
# Configure the google.generativeai client with the key read from the
# GOOGLE_API_KEY environment variable (os.getenv returns None if it is unset).
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
# Project-local DB handler; 'maccabi' is presumably the database/collection name
# — TODO confirm against DBHandler.
handler = DBHandler('maccabi')


In [50]:
def semantic_chunking(encoder: Union[CohereEncoder, OpenAIEncoder], directory_path: str, score_threshold: float = 0.5, context_chars: int = 200) -> list:
	"""
	Use the semantic chunking to split the documents into semantic chunks.

	Every regular file directly inside ``directory_path`` is read and passed to a
	RollingWindowSplitter. The ``docs`` of each resulting split are joined with
	spaces, and up to ``context_chars`` characters of the previous and next split
	of the same file are added around it for extra context.

	Args:
		encoder: an embedding model to use for the semantic chunking; its
			``score_threshold`` attribute is overwritten with ``score_threshold``
		directory_path (str): path to the directory containing the documents
		score_threshold (float): the score threshold for the encoder below which the split is made, between 0 and 1
		context_chars (int): number of characters borrowed from each neighbouring
			split; 0 or a negative value disables the extra context
	Returns:
		splits (list): list of the semantic chunks (strings) of all the files
	"""
	encoder.score_threshold = score_threshold
	splitter = RollingWindowSplitter(
		# TODO: adjust the parameters according to the dataset
		encoder=encoder,
		dynamic_threshold=False,
		min_split_tokens=100,
		max_split_tokens=400,
		window_size=5,
		plot_splits=True,
		enable_statistics=True
	)

	splits = []
	for file_name in os.listdir(directory_path):
		file_path = os.path.join(directory_path, file_name)
		# os.listdir also returns sub-directories, which cannot be opened as text
		if not os.path.isfile(file_path):
			continue
		print(file_name)

		# the context manager closes the file even if reading fails
		with open(file_path, "r") as file:
			document = file.read()

		# join each split's docs first, so the context slices below count
		# characters rather than list items
		texts = [' '.join(split.docs) for split in splitter([document])]

		for i, text in enumerate(texts):
			parts = []
			# the first split has no previous neighbour
			if context_chars > 0 and i > 0:
				parts.append(texts[i - 1][-context_chars:])
			parts.append(text)
			# the last split (or the only one) has no next neighbour
			if context_chars > 0 and i + 1 < len(texts):
				parts.append(texts[i + 1][:context_chars])
			splits.append(' '.join(parts))
	return splits


def google_embedding(text: str, model: str = 'models/text-embedding-004', task_type: str = 'retrieval_document') -> list:
	"""
	Use the Google Embedding API to embed the text
	Args:
		text (str): the text to embed
		model (str): the name of the model to use for the embedding, either 'models/text-embedding-004' or 'models/embedding-001'
		task_type (str): the task type forwarded to ``genai.embed_content``; defaults to
			'retrieval_document' (pass e.g. 'retrieval_query' when embedding a search query)
	Returns:
		embedding (list): the embedding vector of the text
	Raises:
		Exception: if there is an error in embedding the text; the original error
			is attached as ``__cause__``
	"""
	try:
		response = genai.embed_content(model=model, content=text, task_type=task_type)
	except Exception as e:
		# chain explicitly so the underlying API error stays available to the caller
		raise Exception(f'Error in embedding the text: {e}') from e

	return response['embedding']


def create_chunks(encoder: Union[CohereEncoder, OpenAIEncoder], directory_path: str, score_threshold: float = 0.5, model: str = 'models/text-embedding-004') -> list:
	"""
	Create the chunks of the documents and embed them
	Args:
		encoder: an embedding model to use for the semantic chunking
		directory_path (str): path to the directory containing the documents
		score_threshold (float): the score threshold for the encoder below which the split is made, between 0 and 1
		model (str): the name of the model to use for the embedding, either 'models/text-embedding-004' or 'models/embedding-001'
	Returns:
		chunks (list): list of dicts, one per semantic chunk, each with the keys
			'text' (the chunk) and 'embedding' (its embedding vector)
	Raises:
		Exception: propagated from ``google_embedding`` if embedding a chunk fails
	"""
	splits = semantic_chunking(encoder, directory_path, score_threshold)

	# one embedding call per split, in the order the splits were produced
	return [
		{
			'text': split,
			'embedding': google_embedding(split, model)
		}
		for split in splits
	]

# chunks = create_chunks(CohereEncoder(), 'testing_files', 0.5, 'models/text-embedding-004')
# handler.update('embeddings', chunks)