In [None]:
import dlt
import os
from dotenv import load_dotenv
import duckdb
from preprocess_raw_html import preprocess_raw_html # created function
from openai import AzureOpenAI
import time
import pandas as pd
import jsonlines
import json

# Load settings from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

### Get the Philosophy Questions Stored in DuckDB

In [None]:
# Attach to the dlt pipeline (DuckDB destination) configured for the
# philosophy questions dataset.
pipeline = dlt.pipeline(
    dataset_name="philosophy_questions",
    destination="duckdb",
    pipeline_name="philosophy_questions_incremental",
)

# The database file is named after the pipeline: "<pipeline_name>.duckdb".
conn = duckdb.connect(database=f"{pipeline.pipeline_name}.duckdb")

# Make the pipeline's dataset name the search path so that later queries
# can use unqualified table names.
conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

print('Loaded tables: ')
display(conn.sql("show tables"))

In [None]:
# Read every row of the items table into a pandas DataFrame and show it.
questions = (
    conn
    .sql("SELECT * FROM philosophy_questions__items")
    .df()
)
display(questions)

In [None]:
# Summarise the columns, dtypes and non-null counts of the full question set.
questions.info()

##### Answered Questions

In [17]:
# Keep only the rows whose "is_answered" flag equals True.
answered_questions = questions.loc[questions["is_answered"].eq(True)]

In [None]:
# Summarise the columns, dtypes and non-null counts of the answered subset.
answered_questions.info()

In [19]:
# Narrow the frame to the question text and its link, then reverse the row
# order so that positional index 0 refers to the last row of the original frame.
answered_questions_body = answered_questions.loc[:, ["body", "link"]].iloc[::-1]

In [None]:
# Display the reversed body/link frame as the cell output.
answered_questions_body

In [None]:
# Preview the cleaned text of the first question (position 0 of the reversed
# frame) to check what will be sent to the model as a prompt.
print(preprocess_raw_html(answered_questions_body["body"].iat[0]))

### Generate Answers with OpenAI gpt-35-turbo (model version 0125)

In [22]:
# Azure OpenAI client; the endpoint, API version and key are all read from
# environment variables.
client = AzureOpenAI(
    azure_endpoint=os.getenv("OPENAI_API_ENDPOINT"),
    api_version=os.getenv("OPENAI_API_VERSION"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Name of the model deployment used for the chat completions.
deployment_name = 'gpt-35-turbo'

In [23]:
def get_gpt_answers(client, answered_questions_body, start=0, finish=500,
                    filename="gpt35_0125_philosophy_answers.jsonl",
                    model="gpt-35-turbo",
                    log_filename="openai-gpt35-0125-log.txt",
                    retry_delay=60):
    """Ask the chat model each question and append the responses to a JSONL file.

    For every position ``idx`` in ``[start, finish)`` the question body is
    cleaned with ``preprocess_raw_html`` and sent as the user message. Each
    successful response is appended to ``filename``, and one line per
    question is appended to ``log_filename`` as ``"<idx>-success"`` or
    ``"<idx>-fail"`` so that a later run can resume after the last logged index.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        answered_questions_body: DataFrame with a ``"body"`` column, indexed
            positionally via ``.iloc``.
        start: First positional index to process (inclusive).
        finish: Positional index to stop at (exclusive). Values beyond the
            number of rows are clamped to the number of rows.
        filename: JSONL file the responses are appended to.
        model: Model / deployment name passed to the completion call.
        log_filename: Text file the per-question status lines are appended to.
        retry_delay: Seconds to pause after a failed request before moving on.

    Returns:
        None. Results are written to ``filename`` and ``log_filename``.
    """
    bodies = answered_questions_body["body"]

    # Clamp the upper bound: indexing past the last row would raise IndexError,
    # and range() is simply empty when start >= stop, so the loop always ends.
    stop = min(finish, len(bodies))

    for idx in range(start, stop):
        print(f"Working on question {idx} ...")
        prompt = preprocess_raw_html(bodies.iloc[idx])
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Assistant is a large language model trained by OpenAI."},
                    {"role": "user", "content": prompt},
                ],
            )
        except Exception as e:
            # Best effort: a failed request is logged and the question is
            # skipped (not retried); the pause gives the service time to recover.
            print(e)
            print(f"Skipping question {idx}; pausing {retry_delay} seconds before continuing...")
            with open(log_filename, 'a') as f:
                f.write(f"{idx}-fail\n")
            time.sleep(retry_delay)
            continue

        # NOTE(review): response.json() presumably returns a JSON *string*, in
        # which case each line holds a JSON-encoded string rather than an
        # object. Kept as is so the file format stays consistent with lines
        # already written - confirm what downstream readers expect.
        with jsonlines.open(filename, 'a') as writer:
            writer.write(response.json())
            print(f"JSON data written to {filename}")

        with open(log_filename, 'a') as f:
            f.write(f"{idx}-success\n")

In [None]:
# Resume after the last index recorded in the log file. Each log line has the
# form "<idx>-<status>", so the last non-blank line gives the most recent index.
log_path = 'openai-gpt35-0125-log.txt'

# -1 makes start=checkpoint+1 begin at question 0 when nothing has been logged
# yet (log file missing or empty).
checkpoint = -1
if os.path.exists(log_path):
    with open(log_path, 'r') as f:
        for line in f:
            entry = line.strip()
            if entry:  # tolerate blank lines instead of failing in int('')
                checkpoint = int(entry.split("-")[0])

# Never ask for more rows than the frame holds; indexing past the end would
# raise IndexError inside get_gpt_answers.
get_gpt_answers(
    client,
    answered_questions_body,
    start=checkpoint+1,
    finish=min(2000, len(answered_questions_body)),
)

# code to get the response
# print(response.model_dump_json(indent=2))
# print(response.choices[0].message.content)

In [12]:
# Disabled alternative: instead of calling the API per question, build one
# batch-job request entry per remaining question (starting after the last
# logged index) and append it to "openai-gpt35-0125-batch-job.jsonl".
# NOTE(review): jsonlines writers presumably serialize the object themselves,
# so writing json.dumps(d)+"\n" would store a JSON-encoded string per line
# rather than an object - confirm before re-enabling.
# with open('openai-gpt35-0125-log.txt', 'r') as f:
#     for line in f:
#         checkpoint = int(line.strip().split("-")[0])

# for i in range(checkpoint+1,2000):
#     prompt = preprocess_raw_html(answered_questions_body["body"].iloc[i])
#     # main dictionary for storing objects for batch processing
#     d = {}
#     d["custom_id"] = f"task-{i}"
#     d["method"] = "POST"
#     d["url"] = "/chat/completions"
#     # dictionary for storing the body objects
#     body = {}
#     body["model"] = "gpt-35-turbo"
#     messages = [
#         {"role": "system", "content": "Assistant is a large language model trained by OpenAI."},
#         {"role": "user", "content": prompt}
#     ]
#     body["messages"] = messages

#     d["body"] = body

#     with jsonlines.open("openai-gpt35-0125-batch-job.jsonl", 'a') as writer:
#         writer.write(json.dumps(d)+"\n")
    

In [13]:
# Disabled alternative: a dlt resource version of get_gpt_answers that keeps
# its position in the dlt resource state ("checkpoint") and yields each
# response instead of writing a JSONL file. On an exception it sleeps and
# `continue`s without advancing the checkpoint, so the same index is tried again.
# @dlt.resource(table_name="gpt35_0125_philosophy_answers", write_disposition="append")
# def get_gpt_answers(client,answered_questions_body,start=0,finish=500):

#     checkpoint = dlt.current.resource_state().setdefault("checkpoint", start)
#     while True:
#         idx = int(dlt.current.resource_state()["checkpoint"])
#         if idx==finish:
#             break
#         else:
#             print(f"Working on question {idx} ...")
#             prompt = preprocess_raw_html(answered_questions_body["body"].iloc[idx])
#             try:
#                 response = client.chat.completions.create(
#                             model="gpt-35-turbo", 
#                             messages=[
#                                 {"role": "system", "content": "Assistant is a large language model trained by OpenAI."},
#                                 {"role": "user", "content": prompt}
#                             ])
#             except Exception as e:
#                 print(e)
#                 print("Retrying in 60 seconds...")
#                 time.sleep(60)    
#                 continue
            
#         yield response.json()
#         dlt.current.resource_state()["checkpoint"]+=1

In [None]:
# Disabled alternative: run the dlt resource variant of get_gpt_answers through
# a DuckDB-backed pipeline and print the normalize info and the load info.
# NOTE(review): re-enabling this would rebind `pipeline`, which an earlier cell
# uses for the questions dataset.
# pipeline = dlt.pipeline(
#     pipeline_name="gpt35_0125_philosophy_answers_incremental",
#     destination="duckdb",
#     dataset_name="gpt35_0125_philosophy_answers",
# )

# load_info = pipeline.run(get_gpt_answers(client,answered_questions_body))
# row_counts = pipeline.last_trace.last_normalize_info

# print(row_counts)
# print("------")
# print(load_info)