# Generate QnA synthetic dataset from a PDF - Image-heavy PDF 

## 1. Read & Preprocess PDF file
---
### Split PDF into individual pages

In [None]:
# import fitz
# raw_data_dir = "../raw_data"

# file_path = f"{raw_data_dir}/pdf/img-advertising-generated-by-ai.pdf"

# # Open the source PDF document
# doc1 = fitz.open(file_path)
# #split_pages = [(4, 122), (4, 194)]
# split_pages = [(1, 5)]
# for idx, s in enumerate(split_pages):
#     # Create a new empty PDF document
#     doc2 = fitz.open()

#     # Insert pages s[0] through s[1] of doc1 into doc2
#     doc2.insert_pdf(doc1, from_page=s[0], to_page=s[1])

#     # Save the modified document
#     doc2.save(f"{raw_data_dir}/part{idx}.pdf")


In [None]:
import time
import os, shutil, random
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader
from util.preprocess import remove_short_sentences, remove_small_images
from util.common_utils import get_language_code

# Load environment settings via python-dotenv before anything reads them.
load_dotenv()

raw_data_dir = "../raw_data"
image_dir = "./image"

# Start from an empty image directory so files left by an earlier run are not
# picked up when the rendered pages are listed later.
if os.path.isdir(image_dir):
    shutil.rmtree(image_dir)
os.makedirs(image_dir, exist_ok=True)

DOMAIN = "Advertising"
# You can change your language here. e.g., "Korean", "Japanese", "Chinese"
LANGUAGE = "English"
LANGUAGE_CODE = get_language_code(LANGUAGE)
print(f"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}")

### Preprocess PDF file (image part)

In [None]:
import fitz
from glob import glob

file_path = f"{raw_data_dir}/pdf/img-advertising-generated-by-ai.pdf"

# Margin trimmed from every edge of a page before it is rendered.
clip_x, clip_y = 30, 30

# Open the PDF in a context manager so the document handle is closed once all
# pages have been rendered.
with fitz.open(file_path) as doc:
    for i, page in enumerate(doc):
        # page.rect unpacks to corner coordinates (x0, y0, x1, y1), not
        # (x, y, width, height).
        x0, y0, x1, y1 = page.rect
        clip = fitz.Rect(x0 + clip_x, y0 + clip_y, x1 - clip_x, y1 - clip_y)
        page.set_cropbox(clip)
        pix = page.get_pixmap()
        pix.save(f"{image_dir}/page_{i:03d}.jpg")

# Zero-padded page indices keep the sorted file list in page order.
images = sorted(glob(os.path.join(image_dir, "*.jpg")))
max_tokens = 1024  # passed as max_tokens to the chat model that summarizes the images

In [None]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

from langchain_openai import AzureChatOpenAI
# Chat model used to summarize each rendered page image.
llm = AzureChatOpenAI(
    temperature=0,
    max_tokens=max_tokens,
    openai_api_version="2024-05-01-preview",
    azure_deployment="gpt-4o",
)

human_prompt_main = f"Given image, give a concise summary in {LANGUAGE}. Don't insert any XML tag such as <text> and </text> when answering."

# Keep the system prompt aligned with DOMAIN so the summarizer is primed for
# the subject of the document actually being processed.
system_prompt = f"You are an assistant tasked with describing table or image, specialized in {DOMAIN}."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            # The page images are written as .jpg files, so the data URL
            # declares JPEG. Assumes encode_image_base64 returns the file's
            # bytes unchanged -- TODO confirm.
            "url": "data:image/jpeg;base64,{image_base64}",
        },
    },
    {
        "type": "text",
        "text": human_prompt_main,
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template,
    ]
)

# prompt -> model -> plain string output
summarize_chain = prompt | llm | StrOutputParser()

In [None]:
%%time
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_dir, "*.jpg"))
# Base64-encode every rendered page image, in the same order as `images`.
base64_images = list(map(encode_image_base64, images))
# Summarize all pages through the chain, with at most 8 requests in flight.
image_summaries = summarize_chain.batch(base64_images, config={"max_concurrency": 8})
# Post-filter the summaries with the project helper (presumably drops very
# short ones -- confirm in util.preprocess).
image_summaries = remove_short_sentences(image_summaries)

In [None]:
# Preview the first three image summaries.
image_summaries[:3]

### Preprocess PDF file (text part)

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the same PDF again for its text content. Each entry of `docs` exposes
# `.page_content`, which the splitting step reads (presumably one entry per
# page -- confirm against the loader's documentation).
loader = PyMuPDFLoader(file_path)
docs = loader.load()

In [None]:
import re
import tiktoken
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Tokenizer shared by the length function below; chunk sizes are measured with it.
tokenizer = tiktoken.get_encoding('o200k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level tokenizer.

    `disallowed_special=()` turns off the special-token check, so text that
    happens to contain special-token strings is encoded as ordinary text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter for the PDF text. Lengths are measured by tiktoken_len, so
# chunk_size and chunk_overlap are counted in tokens, not characters.
text_splitter = RecursiveCharacterTextSplitter(
    # Up to 1024 tokens per chunk, with 100 tokens shared between neighbouring chunks.
    chunk_size=1024,
    chunk_overlap=100,
    length_function=tiktoken_len,
    # Candidate split points, listed from paragraph breaks down to the empty
    # string; includes CJK punctuation for non-English documents.
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],    
)

# split_docs = text_splitter.split_documents(docs)
# print(f'Number of splitted docs: {len(split_docs)}')

# Collapse runs of spaces in each loaded document, then join the documents with
# blank lines so the splitter works on one continuous text.
space_runs = re.compile(' +')
joined_docs = '\n\n'.join(space_runs.sub(' ', doc.page_content) for doc in docs)

# Cut the joined text into token-bounded chunks.
split_docs = text_splitter.split_text(joined_docs)
print(f'Number of splitted docs: {len(split_docs)}')

## 2. Construct QnA Pairs
---

In [None]:
from util.qa import CustomQADataGenerator
# Prompt templates are looked up per language code.
templates_dir = f"./prompt_template/{LANGUAGE_CODE}"

# Model settings handed to the QA generator; the deployment name is read from
# the environment.
model_config = dict(
    deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME"),
    model="gpt-4o",
    max_tokens=1024,
)

qa_generator = CustomQADataGenerator(model_config=model_config, templates_dir=templates_dir)

In [None]:
import asyncio
from collections import Counter
from typing import Dict
import os
from azure.ai.generative.synthetic.qa import QAType
concurrency = 6  # number of concurrent calls
# Semaphore created with `concurrency` permits; generate_async holds it around
# each generator call.
sem = asyncio.Semaphore(concurrency)

# Kind of QA data requested from the generator. Swap which of the two lines is
# commented out to switch types.
#qa_type = QAType.CONVERSATION
qa_type = QAType.LONG_ANSWER

async def generate_async(text: str, num_questions: int = 3) -> Dict:
    """Generate question-answer pairs for one piece of text.

    The request is made while holding the module-level semaphore `sem`, so the
    number of requests in flight is capped by that semaphore.

    Args:
        text: Source text to generate questions from.
        num_questions: Number of questions to generate per text. Defaults to 3.

    Returns:
        The result of `qa_generator.generate_async`; the calling cell reads its
        "question_answers" and "token_usage" entries.
    """
    async with sem:
        return await qa_generator.generate_async(
            text=text,
            qa_type=qa_type,
            num_questions=num_questions,
        )

In [None]:
t0 = time.time()
# Process only some samples for debug purposes
input_batch = image_summaries + split_docs
results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)

question_answer_list = []
token_usage = Counter()
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])
    token_usage += result["token_usage"]

t1 = time.time()
print(f"Successfully generated QA. Generating took {t1 - t0:.4f} seconds.")    

## 3. Save to jsonl for fine-tuning
---

In [None]:
# Preview the first two generated entries.
question_answer_list[:2]

In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Destination for the fine-tuning dataset.
output_dir = './dataset'
save_filename = "advertising"
os.makedirs(output_dir, exist_ok=True)

# System message attached to the converted records.
system_prompt_msg = (
    f"You are the SME (Subject Matter Expert) in {DOMAIN}. "
    "Please answer the questions accurately. "
    f"If the question is in {LANGUAGE}, write your answer in {LANGUAGE}."
)

# Convert the generated QA pairs with the project helper, then write them as JSONL.
oai_qa_pair = convert_to_oai_format(question_answer_list, system_prompt_msg=system_prompt_msg)
oai_output_path = f"{output_dir}/{save_filename}-oai.jsonl"

#save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, oai_output_path)

In [None]:
# Shell cleanup: force-delete the listed working directories, including
# `image` (the relative directory the rendered page images were written to).
!rm -rf pdf_image_tmp outputs_tmp image