In [2]:
from openai import AzureOpenAI
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
import PyPDF2
from math import ceil


load_dotenv()

client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
    api_version="2024-02-01",
    api_key=os.getenv("AZURE_OPENAI_API_KEY")
)

In [3]:
# chunk the input document 
data_path = "data/vampires/Vampire - Wikipedia.pdf"
CHUNK_SIZE = 512
embedding_deployment = "embed"

text=""
with open(data_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    num_pages = len(reader.pages)
    for page_num in range(num_pages):
        page = reader.pages[page_num]
        text += page.extract_text()

num_chunks = ceil(len(text) / CHUNK_SIZE)

In [4]:
def build_langchain_embeddings():

    embedding_client = AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
        api_version="2024-02-01",
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        deployment="embed"
    )

    return embedding_client

embed_client = build_langchain_embeddings()


In [5]:
text_splitter = SemanticChunker(embed_client, number_of_chunks=CHUNK_SIZE)
chunks = text_splitter.create_documents([text])
chunks = [chunk.page_content for chunk in chunks]

In [6]:
from typing import Literal, Any

def strip_str(s: str) -> str:
    """
    Helper function for helping format strings returned by GPT-4.
    """
    l, r = 0, len(s)-1
    beg_found = False
    for i in range(len(s)):
        if s[i].isalpha():
            if not beg_found:
                l = i
                beg_found = True
            else:
                r = i 
    r += 2
    return s[l:min(r, len(s))]

def generate_instructions_gen(client: AzureOpenAI, chunk: Any, x: int = 5, model: str = None) -> list[str]:
    """
    Generates `x` questions / use cases for `chunk`. Used when the input document is of general types 
    `pdf`, `json`, or `txt`.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), generate %s example questions a user could ask and would be answered using information from the chunk. For example, if the given context was a Wikipedia paragraph about the United States, an example question could be 'How many states are in the United States?'" % (x)},
            {"role": "system", "content": "The questions should be able to be answered in a few words or less. Include only the questions in your response."},
            {"role": "user", "content": str(chunk)}
        ]
    )

    queries = response.choices[0].message.content.split('\n')
    queries = [strip_str(q) for q in queries]
    queries = [q for q in queries if any(c.isalpha() for c in q)]

    return queries 

In [7]:
chunk = chunks[0]

queries = generate_instructions_gen(client, chunk, x=5, model="gpt-4o-global")

In [8]:
qs = queries
qs

['Who is the author of "The Vampire" from ',
 'What does a vampire generally feed on?',
 'In which type of folklore are vampires undead humanoid creatures?',
 'What kind of activities did vampires engage in according to European folklore?',
 'When was "The Vampire" by Philip Burne-Jones published?']

In [9]:
import datasets
from datasets import Dataset, load_dataset
import random

def encode_question_gen(question: str, chunk: Any) -> list[str]:
    """
    Encode multiple prompt instructions into a single string for the general case (`pdf`, `json`, or `txt`).
    """
    
    prompts = []
        
    prompt = """
        Question: {question}\nContext: {context}\n
        Answer this question using the information given in the context above. Here is things to pay attention to: 
        - First provide step-by-step reasoning on how to answer the question. 
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. 
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
        You MUST begin your final answer with the tag "<ANSWER>:".
    """.format(question=question, context=str(chunk))
    prompts.append({"role": "system", "content": "You are a helpful question answerer who can provide an answer given a question and relevant context."})
    prompts.append({"role": "user", "content": prompt})
    return prompts

def generate_label(client: AzureOpenAI, question: str, context: Any, doctype: Any = "pdf", model: str = None) -> str | None:
    """
    Generates the label / answer to `question` using `context` and GPT-4.
    """
    question = encode_question_gen(question, context)
    response = client.chat.completions.create(
        model=model,
        messages=question,
        n=1,
        temperature=0
    )
    response = response.choices[0].message.content
    return response

def add_chunk_to_dataset(
    client: AzureOpenAI,
    chunks: list[str], 
    chunk: str, 
    x: int = 5, 
    num_distract: int = 3, 
    p: float = 0.8,
    model: str = None
) -> None:
    """
    Given a chunk, create {Q, A, D} triplets and add them to the dataset.
    """
    global ds
    i = chunks.index(chunk)
    try:
        qs = generate_instructions_gen(client, chunk, x, model)
    except:
        return None
    for q in qs:
        datapt = {
            "id": None,
            "type": None,
            "question": None,
            "context": None,
            "oracle_context": None,
            "cot_answer": None
        }

        datapt["id"] = f"seed_task_{0 if not ds else ds.num_rows}"
        datapt["type"] = "general"
        datapt["question"] = q

        # add num_distract distractor docs
        docs = [chunk]
        indices = list(range(0, len(chunks)))
        indices.remove(i)
        for j in random.sample(indices, num_distract):
            docs.append(chunks[j])
        
        # decides whether to add oracle document
        oracle = random.uniform(0, 1) < p
        if not oracle:
            docs[0] = chunks[random.sample(indices, 1)[0]]
        random.shuffle(docs)

        d = {
            "title": [],
            "sentences": []
        }

        d["title"].append(["placeholder_title"]*(num_distract+1))
        d["sentences"].append(docs)
        datapt["context"] = d
        datapt["oracle_context"] = chunk

        # add answer to q
        try:
            datapt["cot_answer"] = generate_label(client, q, chunk, doctype="pdf", model=model)
        except:
            continue

        # construct model instruction 
        context = ""
        for doc in docs:
            context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
        context += q
        datapt["instruction"] = context

        # add to dataset
        if not ds:
            # init ds
            datapt["id"] = [datapt["id"]]
            datapt["type"] = [datapt["type"]]
            datapt["question"] = [datapt["question"]]
            datapt["context"] = [datapt["context"]]
            datapt["oracle_context"] = [datapt["oracle_context"]]
            datapt["cot_answer"] = [datapt["cot_answer"]]
            datapt["instruction"] = [datapt["instruction"]]
            ds = Dataset.from_dict(datapt)
        else:
            ds = ds.add_item(datapt)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
ds = None
for i in range(len(chunks)):
    chunk = chunks[i]
    print(f"Processing chunk {i+1}/{len(chunks)}")
    add_chunk_to_dataset(client, chunks, chunk, "pdf", 5, 3, model="gpt-4o-global")

Processing chunk 1/186
Processing chunk 2/186
Processing chunk 3/186
Processing chunk 4/186
Processing chunk 5/186
Processing chunk 6/186
Processing chunk 7/186
Processing chunk 8/186
Processing chunk 9/186
Processing chunk 10/186
Processing chunk 11/186
Processing chunk 12/186
Processing chunk 13/186
Processing chunk 14/186
Processing chunk 15/186
Processing chunk 16/186
Processing chunk 17/186
Processing chunk 18/186
Processing chunk 19/186
Processing chunk 20/186
Processing chunk 21/186
Processing chunk 22/186
Processing chunk 23/186
Processing chunk 24/186
Processing chunk 25/186
Processing chunk 26/186
Processing chunk 27/186
Processing chunk 28/186
Processing chunk 29/186
Processing chunk 30/186
Processing chunk 31/186
Processing chunk 32/186
Processing chunk 33/186
Processing chunk 34/186
Processing chunk 35/186
Processing chunk 36/186
Processing chunk 37/186
Processing chunk 38/186
Processing chunk 39/186
Processing chunk 40/186
Processing chunk 41/186
Processing chunk 42/186
P

In [22]:
training_df = ds.to_pandas()

In [23]:
training_df["messages"] = training_df.apply(lambda x: [
                                                     {"role":"user", "content":x['instruction']},
                                                     {"role":"assistant", "content":x['cot_answer']}
                                                     ], axis=1)

In [24]:
training_df.messages.values[0]

[{'role': 'user',
  'content': '<DOCUMENT>In somecases, especially in small localities, beliefs are still rampant and sightings or claims of vampire attacksoccur frequently.</DOCUMENT>\n<DOCUMENT>She appeared as an attractivewoman with long black hair that covered a hole in the back of her neck, with which she sucked theblood of children.</DOCUMENT>\n<DOCUMENT>Vampire were released during the jiangshi cinematic boom of the 1980s and1990s.[116][117]In modern fiction, the vampire tends to be depicted as a suave, charismatic villain.[22] Vampirehunting societies still exist, but they are largely formed for social reasons.[20] Allegations of vampireattacks swept through Malawi during late 2002 and early 2003, with mobs stoning one person todeath and attacking at least four others, including Governor Eric Chiwaya, based on the belief that thegovernment was colluding with vampires.[118] Fears and violence recurred in late 2017, with 6 peopleaccused of being vampires killed.[119]In early 1970

In [30]:
training_df.dropna(subset=['cot_answer'], inplace=True)

In [31]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(training_df, test_size=0.2, random_state=42)

In [32]:
if not os.path.exists("data/training_data"):
    os.makedirs("data/training_data")
train_df[['messages']].to_json("data/training_data/vampires_train.jsonl", orient="records", lines=True)
test_df[['messages']].to_json("data/training_data/vampires_test.jsonl", orient="records", lines=True)

In [33]:
test_df.to_json("data/training_data/vampires_test_with_metadata.json", orient="records", lines=True)