In [1]:
import os
import logging
import random
from typing import Any

In [2]:
from dotenv import dotenv_values
from datasets import Dataset, load_dataset
from llama_index.packs.raft_dataset import RAFTDatasetPack
import openai

In [3]:
# Load settings from the backend's .env file. Later cells index `config` for
# API keys and for model / project identifiers. The path is relative, so the
# notebook must be started from the directory that contains `backend/`.
config = dotenv_values(dotenv_path="./backend/.env")

In [4]:
# Surface INFO-level records so the client/HTTP activity of the SDKs used in
# the cells below shows up in the cell output.
logging.basicConfig(level="INFO")

In [6]:
# Direct OpenAI SDK client. Each constructor argument is read from the
# .env-backed `config`; a missing key raises KeyError here rather than later.
openai_client = openai.OpenAI(
    **{
        kwarg: config[env_key]
        for kwarg, env_key in (
            ("api_key", "OPENAI_API_KEY"),
            ("organization", "ORGANIZATION_ID"),
            ("project", "PROJECT_ID"),
        )
    }
)

In [6]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# LlamaIndex OpenAI wrappers shared by the RAFT dataset packs below:
# `LLM` is passed as `llm`, `EMBED_MODEL` as `embed_model`.
LLM = OpenAI(
    model="gpt-4o-2024-05-13",
    api_key=config["OPENAI_API_KEY"],
)

EMBED_MODEL = OpenAIEmbedding(
    model="text-embedding-3-small",
    api_key=config["OPENAI_API_KEY"],
)

### Test with a sample PDF

In [6]:
# RAFT dataset pack for the small sample PDF, wired to the shared OpenAI LLM
# and embedding model. Uses chunk_size=400 (the DAX run further down uses 1024).
sample_raft_dataset = RAFTDatasetPack(
    file_path="./data/sample.pdf",
    llm=LLM,
    embed_model=EMBED_MODEL,
    num_questions_per_chunk=5,
    num_distract_docs=3,
    chunk_size=400,
)

In [None]:
# Run the pack configured above and keep the result; the next cell calls
# `.to_json(...)` on it. The pack was given the OpenAI-backed LLM and embedding
# model, so this presumably issues paid API calls — TODO confirm before re-running.
sample_dataset = sample_raft_dataset.run()

In [None]:
# Path stem for the sample dataset; the ".jsonl" suffix is appended on write.
output_path = "./data/sample_pdf_dataset/sample"

# The target directory may not exist on a fresh checkout. Create it first so
# the save cannot fail after the dataset has already been generated.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

sample_dataset.to_json(output_path + ".jsonl")

In [18]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watsonx_ai.foundation_models import ModelInference

In [19]:
# watsonx.ai SDK inference handle. Model, credentials and project are all read
# from the .env-backed `config`.
model_inference = ModelInference(
    model_id=config["WATSONX_MODEL_ID"],
    credentials=dict(
        apikey=config["IBM_CLOUD_API_KEY"],
        url=config["IBM_CLOUD_URL"],
    ),
    project_id=config["WATSONX_PROJECT_ID"],
)

INFO:ibm_watsonx_ai.client:Client successfully initialized
INFO:ibm_watsonx_ai.wml_resource:Successfully finished Get available foundation models for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/foundation_model_specs?version=2024-09-09&project_id=1ebac0e0-ee31-4a71-a95d-78bf576b69eb&filters=function_text_generation%2C%21lifecycle_withdrawn%3Aand&limit=200'


In [20]:
# Smoke test: send a single prompt through the watsonx.ai SDK handle and show
# the generated text as the cell result.
model_inference.generate_text("Hello, how are you?")

INFO:httpx:HTTP Request: POST https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2024-09-09 "HTTP/1.1 200 OK"
INFO:ibm_watsonx_ai.wml_resource:Successfully finished generate for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2024-09-09'


" I'm doing well, thank you for asking. I've been working on a project lately, and"

In [22]:
# Extra generation parameters, handed to WatsonxLLM through its
# `additional_params` argument in a later cell.
additional_params = dict(
    decoding_method="sample",
    min_new_tokens=1,
    top_k=50,
    top_p=1,
)

In [24]:
from llama_index.llms.ibm import WatsonxLLM

# LlamaIndex wrapper around the same watsonx.ai model used by `model_inference`.
watsonx_llm = WatsonxLLM(
    # Connection / identity: all read from the .env-backed `config`.
    model_id=config["WATSONX_MODEL_ID"],
    url=config["IBM_CLOUD_URL"],
    apikey=config["IBM_CLOUD_API_KEY"],
    project_id=config["WATSONX_PROJECT_ID"],
    # Generation settings.
    temperature=0.2, max_new_tokens=300, additional_params=additional_params,
)

INFO:ibm_watsonx_ai.client:Client successfully initialized
INFO:ibm_watsonx_ai.wml_resource:Successfully finished Get available foundation models for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/foundation_model_specs?version=2024-09-09&project_id=1ebac0e0-ee31-4a71-a95d-78bf576b69eb&filters=function_text_generation%2C%21lifecycle_withdrawn%3Aand&limit=200'


In [26]:
# Same smoke-test prompt as the direct SDK call, this time through the
# LlamaIndex wrapper; the result is kept so its raw payload can be inspected.
# NOTE(review): keep the prompt positional — LlamaIndex's completion callback
# wrapper may read it from the positional args; confirm before changing.
response = watsonx_llm.complete("Hello, how are you?")

INFO:httpx:HTTP Request: POST https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2024-09-09 "HTTP/1.1 200 OK"
INFO:ibm_watsonx_ai.wml_resource:Successfully finished generate for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2024-09-09'


In [27]:
# Dump the raw payload attached to the completion (model id, version,
# timestamp and the generated text) for inspection.
print(response.raw)

{'model_id': 'ibm/granite-13b-chat-v2', 'model_version': '2.1.0', 'created_at': '2024-09-19T20:01:59.069Z', 'results': [{'generated_text': "\n\nA: I'm doing well, thank you. And you?\n\nB: I'm good, thank you. It's a beautiful day today.\n\nA: I agree, it's a great day for a walk in the park.\n\nB: That sounds like a wonderful idea. I'd love to go. Do you want to join me?\n\nA: I'd love to! It's been a while since we last walked together.\n\nB: Me too. I've been meaning to catch up with you.\n\nA: I've been busy with work lately, but I'm making an effort to take some time for myself and do things I enjoy.\n\nB: That's great to hear. I'm sure it's good for your mental health.\n\nA: Absolutely. I've been feeling a bit overwhelmed lately, and I think a walk in the park will help me relax and rejuvenate.\n\nB: I'm sure it will. I'm looking forward to it.\n\nA: Great. Let's meet at the park entrance at 2 pm.\n\nB: Sure thing. See you then!\n\nA: Thank you.\n\nIn this dialogue, the speaker a

### Power BI DAX PDF

In [7]:
# You can use any llama-hub loader to get documents!
# RAFT dataset pack for the Power BI DAX PDF, wired to the shared OpenAI LLM
# and embedding model, with chunk_size=1024.
dax_raft_dataset = RAFTDatasetPack(
    file_path="./data/DAX/14_power-bi-dax.pdf",
    llm=LLM,
    embed_model=EMBED_MODEL,
    num_questions_per_chunk=5,
    num_distract_docs=3,
    chunk_size=1024,
)

In [None]:
# Run the DAX pack configured above and keep the result; the next cell calls
# `.to_json(...)` on it. Presumably issues paid OpenAI API calls via the LLM and
# embedding model passed at construction — TODO confirm before re-running.
dax_dataset = dax_raft_dataset.run()

In [None]:
# Path stem for the DAX dataset; the ".jsonl" suffix is appended on write.
output_path = "./data/dax_pdf_dataset/dax"

# The target directory may not exist on a fresh checkout. Create it first so
# the save cannot fail after the dataset has already been generated.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

dax_dataset.to_json(output_path + ".jsonl")