In [1]:
import os
import logging
import random
from typing import Any

In [2]:
from dotenv import dotenv_values
from datasets import Dataset, load_dataset
from llama_index.packs.raft_dataset import RAFTDatasetPack
import openai

In [3]:
# Location of the backend's .env file, relative to the current working directory.
ENV_PATH = "./backend/.env"

# dotenv_values() quietly yields an empty mapping when the file is absent, which
# would only surface later as a bare KeyError on the first config[...] lookup.
# Fail here instead, with the resolved path in the message.
if not os.path.isfile(ENV_PATH):
    raise FileNotFoundError(
        f"Environment file not found: {os.path.abspath(ENV_PATH)} "
        "(check the notebook's working directory or adjust ENV_PATH)"
    )

# Parsed key/value pairs from the .env file; later cells read the OpenAI
# credentials from this mapping.
config = dotenv_values(ENV_PATH)

In [4]:
# Set up root logging at INFO level for the rest of the notebook.
logging.basicConfig(level="INFO")

In [5]:
# OpenAI SDK client; each constructor argument is read from the matching
# entry of `config` (a missing entry raises KeyError).
openai_client = openai.OpenAI(
    **{
        kwarg: config[env_key]
        for kwarg, env_key in (
            ("api_key", "OPENAI_API_KEY"),
            ("organization", "ORGANIZATION_ID"),
            ("project", "PROJECT_ID"),
        )
    }
)

In [6]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# llama-index LLM handle for the `gpt-4o-2024-05-13` model; passed as `llm=`
# to the RAFTDatasetPack instances below.
LLM = OpenAI(
    api_key=config["OPENAI_API_KEY"],
    model="gpt-4o-2024-05-13",
)

# llama-index embedding handle for `text-embedding-3-small`, using the same
# API key; passed as `embed_model=` to the RAFTDatasetPack instances below.
EMBED_MODEL = OpenAIEmbedding(
    api_key=config["OPENAI_API_KEY"],
    model="text-embedding-3-small",
)

### Test with a sample PDF

In [6]:
# RAFT dataset builder for the small sample PDF, used as a trial run.
sample_raft_dataset = RAFTDatasetPack(
    # input document
    file_path="./data/sample.pdf",
    # generation settings
    chunk_size=400,
    num_questions_per_chunk=5,
    num_distract_docs=3,
    # shared model handles
    llm=LLM,
    embed_model=EMBED_MODEL,
)

In [None]:
# Execute the RAFT pack for the sample PDF and keep what it returns; the next
# cell writes this object out via to_json().
# NOTE(review): presumably this calls the LLM and embedding model for every
# chunk, so it may be slow and incur API cost — confirm before re-running.
sample_dataset = sample_raft_dataset.run()

In [None]:
# Path prefix of the exported sample dataset; ".jsonl" is appended when writing.
output_path = "./data/sample_pdf_dataset/sample"

# Make sure the destination folder exists before writing, so the export does not
# depend on the directory having been created by hand (no-op if it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the generated sample dataset to "<output_path>.jsonl".
sample_dataset.to_json(output_path + ".jsonl")

### Power BI DAX PDF

In [7]:
# RAFT dataset builder for the Power BI DAX PDF; same question and distractor
# counts as the sample run, with a larger chunk_size (1024 vs. 400).
dax_raft_dataset = RAFTDatasetPack(
    # input document
    file_path="./data/DAX/14_power-bi-dax.pdf",
    # generation settings
    chunk_size=1024,
    num_questions_per_chunk=5,
    num_distract_docs=3,
    # shared model handles
    llm=LLM,
    embed_model=EMBED_MODEL,
)

In [None]:
# Execute the RAFT pack for the DAX PDF and keep what it returns; the next
# cell writes this object out via to_json().
# NOTE(review): presumably this calls the LLM and embedding model for every
# chunk, so it may be slow and incur API cost — confirm before re-running.
dax_dataset = dax_raft_dataset.run()

In [None]:
# Path prefix of the exported DAX dataset; ".jsonl" is appended when writing.
output_path = "./data/dax_pdf_dataset/dax"

# Make sure the destination folder exists before writing, so the export does not
# depend on the directory having been created by hand (no-op if it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the generated DAX dataset to "<output_path>.jsonl".
dax_dataset.to_json(output_path + ".jsonl")

## Create 