# Process steps

1. Send the PDF as base64
1. Convert the base64 PDF to PIL images
1. Run OCR
1. Sort the lines
1. Translate the text to English
1. Build the vector index
1. Start the chat


### 1. Send the PDF as base64


In [2]:
from pdf2image import convert_from_path
import os

In [3]:
# PDF to index. Swap in one of the commented alternatives to process another manual.
file_path = "../upload_folder/product_a/electrolux_washing_machine.pdf"
# file_path = "../data_folder/product_b/lg_washing_machine.pdf"
# file_path = "../data_folder/product_a_small/electrolux_washing_machine.pdf"

# Folder that holds the PDF; its name is used as the product id.
folder_path = os.path.split(file_path)[0]
product_id = os.path.basename(folder_path)

# Extracted page text goes to the same relative location, with "upload_folder"
# swapped for "preprocessed_data".
preprocessed_data_folder = folder_path.replace("upload_folder", "preprocessed_data")

# Each product gets its own vector-store directory; shown as the cell output.
persist_directory = f"../db_folder/{product_id}"
persist_directory
# create folder if not exist

'../db_folder/product_a'

In [4]:
import pdfplumber


# Names of the per-page text files written by this cell, in page order.
image_name_list = []

# Make sure the output folder exists before any page text is written.
os.makedirs(preprocessed_data_folder, exist_ok=True)


def pdf_save_text_each_page(file_path):
    """Write the extracted text of every page of a PDF to its own text file.

    Page ``ind`` (0-based) is written to ``image_{ind}.txt`` inside the
    module-level ``preprocessed_data_folder``, and the name of each written
    file is appended to the module-level ``image_name_list``.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        None. Any exception is caught and reported with ``print``; pages
        written before the failure stay on disk.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            for ind, page in enumerate(pdf.pages):
                # Depending on the pdfplumber version, extract_text() can give
                # None for a page with no extractable text; write an empty
                # file instead of failing and abandoning the remaining pages.
                text = page.extract_text() or ""
                image_name = f"image_{ind}.txt"
                # Use a separate name so the file_path parameter is not clobbered.
                text_path = os.path.join(preprocessed_data_folder, image_name)
                with open(text_path, "w") as f:
                    f.write(text)
                # Record the name only once the file really exists.
                image_name_list.append(image_name)

    except Exception as e:
        print("Error reading PDF:", e)


# Extract the text of every page of the configured PDF into the preprocessed folder.
pdf_save_text_each_page(file_path)

In [4]:
import os

# Collect the page-text files from disk; only entries ending in ".txt" are kept.
image_name_list = [
    entry for entry in os.listdir(preprocessed_data_folder) if entry.endswith(".txt")
]
import re


def extract_numeric_part(filename):
    """Return the first run of digits in ``filename`` as an int.

    Returns None when ``filename`` contains no digit at all.
    """
    found = re.search(r"\d+", filename)
    return None if found is None else int(found.group())


# Sort the file names based on the numeric part
def _page_sort_key(name):
    """Sort key: numbered files by their number, files without a number last."""
    number = extract_numeric_part(name)
    # extract_numeric_part returns None for names without digits; comparing
    # None with int would make sorted() raise TypeError, so map such names to
    # a tuple that orders after every numbered file.
    return (number is None, 0 if number is None else number)


# Sort numerically so that image_10.txt comes after image_9.txt, not after image_1.txt.
image_name_list = sorted(image_name_list, key=_page_sort_key)

print(image_name_list[:3])
print(image_name_list[-3:])

['image_0.txt', 'image_1.txt', 'image_2.txt']
['image_37.txt', 'image_38.txt', 'image_39.txt']


In [5]:
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Peek at the first three page-text file names.
image_name_list[:3]

['image_0.txt', 'image_1.txt', 'image_2.txt']

In [7]:
# Build two parallel lists: the text of each page and the metadata attached to it.
meta_data_list = []
text_list = []

# Name of the source PDF; identical for every page, so compute it once.
data_name = os.path.basename(file_path)

for ind, image_name in enumerate(image_name_list):
    # image_name already ends in ".txt"; use it as is. Rebuilding it from
    # split(".")[0] would point at the wrong file for names with extra dots.
    text_path = os.path.join(preprocessed_data_folder, image_name)

    with open(text_path, "r") as f:
        text = f.read()

    meta_data = {
        # 1-based position of the file in image_name_list.
        "page_index": ind + 1,
        "data_name": data_name,
    }
    text_list.append(text)
    meta_data_list.append(meta_data)
# Step 3: token counter, passed to the text splitter as its length function
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return the number of token ids ``tokenizer`` encodes ``text`` into."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [16]:
# Step 4: Split text into chunks
# Lengths are measured with count_tokens. The chunk size is kept really small,
# just to show the splitting; 2000 is the larger alternative.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=count_tokens,
    chunk_size=300,
    chunk_overlap=100,
)
# One metadata dict per input text.
chunks = text_splitter.create_documents(texts=text_list, metadatas=meta_data_list)

In [17]:
# Number of chunks produced by the splitter.
len(chunks)

135

In [18]:
from langchain.embeddings import OpenAIEmbeddings
import os
import openai
from dotenv import load_dotenv
import pandas as pd

# Load secrets from the local dotenv file instead of hard-coding them here.
credential_path = "../.credential"
load_dotenv(dotenv_path=credential_path)

# Indexing os.environ raises KeyError if the key is still missing at this point.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [19]:
import chromadb
from langchain.vectorstores import Chroma
import time

embeddings = OpenAIEmbeddings()

# Reuse the persisted index when its directory already exists; otherwise embed
# all chunks and create it. This is an explicit branch rather than
# assert + bare except, so a failure while loading an existing index surfaces
# instead of silently re-embedding every chunk into the same directory.
if os.path.exists(persist_directory):
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Load chroma index success")
else:
    # An earlier variant of this cell created the index from the first chunk
    # and then added the remaining chunks one at a time with
    # db.add_documents([chunk]) and a 0.3 s pause between calls.
    db = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("create new chroma index success")

In [20]:
# Ask Chroma to persist the index; db was configured with persist_directory.
db.persist()

In [13]:
# Fetch the stored records from the vector store.
data = db.get()
# NOTE(review): no output was recorded for this cell. Presumably get() leaves
# "embeddings" empty unless they are explicitly requested; confirm.
data["embeddings"]

In [14]:
# Print the id and metadata of the first stored record only, as a sanity check.
# The loop variable is doc_id, not id, so the builtin id() is not shadowed
# for the rest of the kernel session.
for doc_id, doc, meta in zip(data["ids"], data["documents"], data["metadatas"]):
    print(doc_id, meta["page_index"], meta["data_name"])
    break

45ee3f46-3cc4-11ee-b2d6-acde48001122 1 electrolux_washing_machine.pdf


In [1]:
# Sample result dictionary: question, chat history, answer, source documents
# and generated question. Repeated strings are named once and reused.
_capacity_statement = "The maximum capacity of a washing machine is the largest amount of laundry it can hold."
_capacity_answer = "The maximum capacity of the washing machine depends on the model. For the EWF8024P5WB and EWF8024P5SB models, the maximum capacity is 4 kg. For the EWF9024P5WB model, the maximum capacity is 5.5 kg."
_dimension_turn = (
    "The dimension of the EWF9024P5WB model is not specified.",
    "The dimensions of the EWF9024P5WB model are 848 x 596.5 x 659 mm (Height / Width / Depth).",
)

result = dict(
    question=_capacity_statement,
    # The same dimension exchange appears three times, then the capacity exchange.
    chat_history=[_dimension_turn] * 3 + [(_capacity_statement, _capacity_answer)],
    answer=_capacity_answer,
    source_documents="source",
    generated_question="What is the maximum capacity of a washing machine?",
)
# result =