# Generate QnA synthetic dataset from CSV, containing Image URL

This is another common case. If the source data contains image URLs, each URL is replaced with a generated summary of the image it points to.


## 1. Read & Preprocess CSV file
---

Read the CSV file into a dataframe.

In [None]:
import time
import glob
import pandas as pd
import os, shutil, random
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader
from util.preprocess import convert_html_to_md
from util.common_utils import get_language_code

# Load environment variables from a local .env file (python-dotenv) before anything reads them.
load_dotenv()

# Domain and language settings; later cells interpolate them into the LLM prompts.
DOMAIN = "SLM Fine-tuning"
LANGUAGE = "English" # You can change your language here. e.g., "Korean", "Japanese", "Chinese"
# NOTE(review): presumably maps the language name to a short code (it is used later to
# pick the prompt template directory) — confirm in util.common_utils.
LANGUAGE_CODE = get_language_code(LANGUAGE)
print(f"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}")

# Input CSV; later cells read its `image_info` and `context` columns.
raw_data_dir = "../raw_data"
csv_path = f"{raw_data_dir}/csv/en-tech-blog-with-url.csv"
df = pd.read_csv(csv_path) 

Convert to a base64 image input format that can be recognized by multimodal models such as GPT-4o.

- Download the image (http://xyz.com/a.jpg)
- Convert the image to a base64-encoded string

In [None]:
import json
from util.preprocess import encode_url_image_base64

def encode_images(img_dict):
    """Return the encoded form of every value stored in ``img_dict``.

    The dictionary keys are ignored; each value (expected to be an image URL)
    is passed to ``encode_url_image_base64`` and the results are returned as a
    list in the dictionary's iteration order.
    """
    return [encode_url_image_base64(image_url) for image_url in img_dict.values()]

# Each `image_info` cell is read from the CSV as text and is expected to hold a
# Python dict literal, so it has to be parsed back into an object.
# SECURITY: the original code used `eval()` here, which would execute any
# expression embedded in the CSV file. `ast.literal_eval` only accepts literals
# (dicts, lists, strings, numbers, ...) and raises ValueError on anything else.
import ast

df['image_info'] = df['image_info'].apply(ast.literal_eval)
img_dict = df['image_info'][0]  # first row's parsed image info, kept for quick inspection
# Encode the images of every row; each cell becomes the list returned by encode_images().
df['image_base64'] = df['image_info'].apply(encode_images)

In [None]:
# Preview the first two rows to check the parsed `image_info` and the new `image_base64` columns.
df.head(2)

#### Image Summarization using GPT-4o

In [None]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat model used to produce the image summaries.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    temperature=0,
    max_tokens=700,
)

system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."
human_prompt_main = f"Given image, give a concise summary in {LANGUAGE}. Don't insert any XML tag such as <text> and </text> when answering."

# The human turn carries two content parts: the image as a base64 data URL
# (filled in through the `image_base64` template variable) and the instruction text.
image_part = {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64," + "{image_base64}"},
}
text_part = {"type": "text", "text": human_prompt_main}
human_prompt = [image_part, text_part]

system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)
prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

# prompt -> model -> plain string output
summarize_chain = prompt | llm | StrOutputParser()

In [None]:
%%time
def _summarize_images(encoded_images):
    """Summarize all images of one row in a single batched chain call (max_concurrency=5)."""
    return summarize_chain.batch(encoded_images, {"max_concurrency": 5})

df['image_summary'] = df['image_base64'].apply(_summarize_images)

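When you temporarily save the dataframe to CSV and reload it, the image-related columns are read back as plain strings, so you must apply `eval()` to them to restore the original Python objects.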

In [None]:
# tmp_csv_path = f"{raw_data_dir}/csv/ko-tech-blog-with-url-tmp.csv"
# df.to_csv(tmp_csv_path, index=False)
# df = pd.read_csv(tmp_csv_path)
# df['image_info'] = df['image_info'].apply(lambda x: eval(x))
# df['image_base64'] = df['image_base64'].apply(lambda x: eval(x))  
# df['image_summary'] = df['image_summary'].apply(lambda x: eval(x))  

Replace each custom tag (e.g. `[image]`) in the **context** column with the corresponding entry from the **image_summary** column.

In [None]:
def replace_image_tags(row, image_tag="[image]"):
    """Replace each image tag in a row's context with its image summary.

    The n-th occurrence of ``image_tag`` in ``row['context']`` is replaced by
    ``row['image_summary'][n]``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string under
            ``'context'`` and a sequence of summary strings under
            ``'image_summary'``.
        image_tag: Placeholder text that marks an image position in the context.

    Returns:
        The context with its tags replaced. Tags that have no matching summary
        are left as they are; summaries beyond the number of tags are ignored.
    """
    parts = row['context'].split(image_tag)
    summaries = row['image_summary']
    pieces = []
    for idx, text in enumerate(parts[:-1]):
        pieces.append(text)
        # Keep the tag when there are fewer summaries than tags, rather than
        # raising IndexError and aborting the whole DataFrame.apply().
        pieces.append(summaries[idx] if idx < len(summaries) else image_tag)
    pieces.append(parts[-1])
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

your_image_tag = '[image]'

# Build the final text of every row by substituting its image summaries for the tags.
df['final_context'] = df.apply(
    lambda row: replace_image_tags(row, image_tag=your_image_tag),
    axis=1,
)
preprocessed_docs = df['final_context'].tolist()

## 2. Construct QnA Pairs
---

In [None]:
from util.qa import CustomQADataGenerator
# Model settings handed to the QA generator; the deployment name is read from
# the AZURE_OPENAI_DEPLOYMENT_NAME environment variable.
model_config = dict(
    deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    model="gpt-4o",
    max_tokens=1024,
)

# The template directory is selected by language code.
templates_dir = f"./prompt_template/{LANGUAGE_CODE}"
qa_generator = CustomQADataGenerator(model_config=model_config, templates_dir=templates_dir)

In [None]:
import asyncio
from collections import Counter
from typing import Dict
import os
from azure.ai.generative.synthetic.qa import QAType
concurrency = 6  # number of concurrent calls
# Shared semaphore acquired by generate_async(), so at most `concurrency` calls run at once.
sem = asyncio.Semaphore(concurrency)

# QA style requested from the generator; swap the two lines below to use CONVERSATION instead.
#qa_type = QAType.CONVERSATION
qa_type = QAType.LONG_ANSWER

async def generate_async(text: str) -> Dict:
    """Generate question/answer pairs for one document.

    Waits on the shared semaphore ``sem`` before calling the generator, so the
    number of simultaneous calls is bounded by the semaphore's size.

    Args:
        text: Source text to generate questions from.

    Returns:
        Whatever ``qa_generator.generate_async`` returns for this text.
    """
    async with sem:
        request = {
            "text": text,
            "qa_type": qa_type,
            "num_questions": 3,  # Number of questions to generate per text
        }
        qa_result = await qa_generator.generate_async(**request)
    return qa_result

In [None]:
t0 = time.time()
# Process only some samples for debug purposes
input_batch = preprocessed_docs[:10]
results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)

question_answer_list = []
token_usage = Counter()
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])
    token_usage += result["token_usage"]

t1 = time.time()
print(f"Successfully generated QA. Generating took {t1 - t0:.4f} seconds.")    

## 3. Save to jsonl for fine-tuning
---

In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Make sure the output directory exists before writing.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

save_filename = "tech-blog"
output_path = f"{output_dir}/{save_filename}-oai.jsonl"

# System message passed to convert_to_oai_format() together with the QA pairs.
system_prompt_msg = f"""You are the SME (Subject Matter Expert) in {DOMAIN}. Please answer the questions accurately. If the question is in {LANGUAGE}, write your answer in {LANGUAGE}."""
oai_qa_pair = convert_to_oai_format(question_answer_list, system_prompt_msg=system_prompt_msg)

#save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, output_path)