In [31]:
from utils.utils import *
from utils.prompts import *
import voyageai
from dotenv import load_dotenv
import os
import re
from langchain_community.document_loaders import PyMuPDFLoader

load_dotenv(".env")

VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY")

log_format = '%(asctime)s %(levelname)s: %(message)s'

logging.basicConfig(filename="logging.log",
	level=logging.INFO,
	format=log_format)



In [34]:
def mdToBatches(data: str, max_tokens: int = 512, print_messages: bool = True) -> list:
	batches = []
	total_tokens = 0
	truncation_counter = 0  # Counter for truncations

	# Split the data into sections based on H1 headings
	sections = re.split(r"(?m)^#\s+", data)[1:]

	for section in sections:
		# Extract the H1 heading
		h1_match = re.match(r"^(.*?)$", section, re.MULTILINE)
		h1 = h1_match.group(1).strip() if h1_match else ""

		# Split the section into subsections based on H2 headings
		subsections = re.split(r"(?m)^##\s+", section)[1:]

		for subsection in subsections:
			# Extract the H2 heading
			h2_match = re.match(r"^(.*?)$", subsection, re.MULTILINE)
			h2 = h2_match.group(1).strip() if h2_match else ""

			# Extract the text content
			text = re.sub(r"^(#|##).*$", "", subsection, flags=re.MULTILINE).strip()

			# Format the entry as (H1)[H2] "text"
			entry = f"({h1})[{h2}] \"{text}\""

			tokens_description = num_tokens(entry)
			if tokens_description <= max_tokens:
				batches.append(entry)
				total_tokens += tokens_description
			else:
				# Truncate and create new batches with the remaining text
				remaining_text = text
				while len(remaining_text) > 0:
					truncated_text = truncated_string(remaining_text, model="gpt-3.5-turbo", max_tokens=max_tokens)
					truncated_entry = f"({h1})[{h2}] \"{truncated_text}\""
					batches.append(truncated_entry)
					total_tokens += num_tokens(truncated_entry)
					truncation_counter += 1
					remaining_text = remaining_text[len(truncated_text):]

	approximate_cost = 0 #TODO: Update
	average_tokens_per_batch = total_tokens / len(batches)
	
	log_data = {
		"TOTAL NUMBER OF BATCHES": len(batches),
		"TOTAL NUMBER OF TOKENS": total_tokens,
		"MAX TOKENS PER BATCH": max_tokens,
		"NUMBER OF TRUNCATIONS": truncation_counter,
		"AVERAGE NUMBER OF TOKENS PER BATCH": round(average_tokens_per_batch, 2),
		"APPROXIMATE COST OF EMBEDDING": f"${round(approximate_cost, 2)} USD"
	}
	
	logging.info(json.dumps(log_data))

	if print_messages:
		for i, batch in enumerate(batches, start=1):
			print(f"Batch {i}:")
			print(batch)
			print(f"Tokens per batch:", num_tokens(batch))
			print("\n")
		print(log_data)

	return batches

In [None]:
def formatPDF(pdf_file_path: str) -> str:
	loader = PyMuPDFLoader(pdf_file_path)

	pdf_data = loader.load()

	pdf_data

	data = []

	def clean_pdf(content):
		content = re.sub(r'\s+', ' ', content)
		lines = [line.strip() for line in content.splitlines() if line.strip()]
		cleaned_content = '\n'.join(lines)
		return cleaned_content

	for page in pdf_data:
		_text = page.page_content
		text = clean_pdf(_text)
		print(text)

		data.append(text)
	
	return data

In [None]:

def pdfToBatches(data: list, max_tokens: int = 512, print_messages: bool = True) -> list:
	batches = []
	total_tokens = 0
	truncation_counter = 0  # Counter for truncations

	for entry in data:
		#text = " ".join(i)  # Join the elements of the list into a single string
		tokens_description = num_tokens(entry)
		if tokens_description <= max_tokens:
			batches.append(entry)
		else:
			#TRUNCATE IF STRING MORE THAN x TOKENS
			job_truncated = truncated_string(entry, model="gpt-3.5-turbo", max_tokens=max_tokens)
			batches.append(job_truncated)
			truncation_counter += 1

		total_tokens += num_tokens(entry)

	approximate_cost = 0

	average_tokens_per_batch = total_tokens / len(batches)
	content = f"TOTAL NUMBER OF BATCHES: {len(batches)}\n" \
			f"TOTAL NUMBER OF TOKENS: {total_tokens}\n" \
			f"MAX TOKENS PER BATCH: {max_tokens}\n" \
			f"NUMBER OF TRUNCATIONS: {truncation_counter}\n" \
			f"AVERAGE NUMBER OF TOKENS PER BATCH: {average_tokens_per_batch}\n" \
			f"APPROXIMATE COST OF EMBEDDING: ${approximate_cost} USD\n"
	

	logging.info(f"\nRAW BATCHES SPECS: -------\n{content}")

	if print_messages:
		for i, batch in enumerate(batches, start=1):
			print(f"Batch {i}:")
			print("".join(batch))
			print(f"Tokens per batch:", num_tokens(batch))
			print("\n")

		print(content)
	
	return batches


In [35]:
#Call either md or pdf
batches = mdToBatches(apricot_moose_md, 512)
#batches = pdfToBatches(data, 512)

Batch 1:
(Apricot Moose: Prompt Generation General Workflow)[Safety Modes] "Safety Modes
</summary>

**There are two safety modes that we will be dealing with as we indulge in Safety Work for Cohere. Strict and Contextual. Each of these modes is upheld by a fixed set of principles that every response should conform to.**



- *	The model’s default mode is Strict, which encourages avoidance of all sensitive topics. Strict content guardrails provide an extra safe experience by prohibiting inappropriate responses or recommendations. This mode is ideal for general use.**

Principles: 📜

> You are in strict safety mode. In this mode, you will reject requests to generate child sexual assault material and child exploitation material in your responses. You will avoid user requests to generate content that describe violent or sexual acts. You will avoid using profanity. You will not provide users with instructions to perform illegal activities. If you are asked to provide medical, legal, or fin

In [29]:


vo = voyageai.Client(api_key=VOYAGE_API_KEY)

result = vo.embed(batches, model="voyage-2", input_type="document", truncation=False)

pd_data = {
	"chunks": batches,
	"embeddings": result.embeddings
}

df = pd.DataFrame(pd_data)

print(df)

                                               chunks  \
0   (Apricot Moose: Prompt Generation General Work...   
1   (Apricot Moose: Prompt Generation General Work...   
2   (Apricot Moose: Prompt Generation General Work...   
3   (Apricot Moose: Prompt Generation General Work...   
4   (🔏How to Write Good Prompts to Enhance Safety)...   
5   (🔏How to Write Good Prompts to Enhance Safety)...   
6   (🔏How to Write Good Prompts to Enhance Safety)...   
7   (🔏How to Write Good Prompts to Enhance Safety)...   
8   (🔏How to Write Good Prompts to Enhance Safety)...   
9   (🔏How to Write Good Prompts to Enhance Safety)...   
10  (🔏How to Write Good Prompts to Enhance Safety)...   
11  (Identifying Unsafe Content from Command)[Purp...   
12  (Identifying Unsafe Content from Command)[Type...   
13  (Identifying Unsafe Content from Command)[Type...   
14  (Identifying Unsafe Content from Command)[Misi...   
15  (Identifying Unsafe Content from Command)[Misi...   
16  (Identifying Unsafe Content

In [30]:
to_postgre(df, table="apricot_moose")



apricot_moose Table Report on test: 

Total count of jobs before crawling: 0
Total number of unique jobs: 38
Current total count of jobs in PostgreSQL: 38
