In [3]:
%%bash
mkdir -p /kaggle/tmp
pip install -q openai

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.
jupyterlab 4.2.3 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.4.1 which is incompatible.
ydata-profiling 4.6.4 requires numpy<1.26,>=1.16.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [4]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
import os
import sys
import json
from openai import OpenAI

### Load csv file
Keywords will always be sorted in ascending order.

In [5]:
questions_df = pd.read_csv("/kaggle/input/questions-bank-llm20/questions_bank.csv", index_col=0)
questions_df.sort_index(inplace=True)
questions_df.index.name = 'keyword'

### Keywords List
Keywords are from csv file.

In [6]:
keywords_list = questions_df.index.tolist()

### Questions List
Append new questions to the list below
- Do not edit the existing questions
- Case sensitive

In [8]:
questions_list = [
    "Is it an object?",
    "Is it a place to go?",
    "Is it a facility?",
    "Is it tangible?",
    "Is it made of metal",
]

### Questions Queue
Select new questions from the questions_list

In [9]:
questions_queue = []

for q in questions_list:
    if q in questions_df.columns:
        continue
    questions_queue.append(q)

### OpenAI Batch Functions
Be sure to add OPENAI_API_KEY to 'secrets'

In [10]:
import json
import pandas as pd

system_prompt = """You are a helpful assistant.
    Your goal is to classify the keyword to the question.
    Answer only with 'yes' or 'no'.
    If you are unsure, classify it as 'no'.
    If you think the keyword is not suitable for the question, classify it as 'no'.
    """

def create_batch_data(keywords, questions):
    data = []
    for question in questions:
        for keyword in keywords:
            data.append(
                {
                    "custom_id": f"{question}-{keyword}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini",
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": f"Keyword: {keyword} \nQuestion: {question}"},
                        ],
                        "max_tokens": 10,
                    },
                }
            )
    return data

def create_jsonl_batch(data, file_path):
    with open(file_path, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')  # Add a newline to separate entries
            

def read_jsonl(jsonl_output_path: str) -> pd.DataFrame:
    data = []

    with open(jsonl_output_path, 'r') as file:
        for line in file:
            request = json.loads(line)
            question, keyword = request["custom_id"].split("-")
            response = request["response"]["body"]["choices"][0]["message"]["content"]
            response = 1 if response.lower() == "yes" else 0

            # Creating a dictionary for each line
            data.append({
                'Keyword': keyword,
                'Question': question,
                'Response': response
            })

    # Creating DataFrame from list of dictionaries
    df = pd.DataFrame(data)

    # Pivoting the DataFrame to get 'Keyword' as the index
    df_pivot = df.pivot(index='Keyword', columns='Question', values='Response')

    # Reindexing columns to ensure all questions are represented
    df_pivot.sort_index(inplace=True)

    return df_pivot

### Batch Input
Create a batch input in jsonl format

In [11]:
# Specify the file path
input_file_path = '/kaggle/tmp/batch_input.jsonl'
data = create_batch_data(keywords_list, questions_queue)

# Write each dictionary to a separate line in the file
with open(input_file_path, 'w') as file:
    for entry in data:
        json.dump(entry, file)
        file.write('\n')  # Add a newline to separate entries

### Create Batch Request

In [12]:
from kaggle_secrets import UserSecretsClient


client = OpenAI(api_key = UserSecretsClient().get_secret("OPENAI_API_KEY"))

batch_input_file = client.files.create(
  file=open(input_file_path, "rb"),
  purpose="batch"
)

In [13]:
batch_job = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "keyword classification"
    }
)


### Wait for Completion
This will take up to 24 hours.


Refreshes every 60 seconds.

In [15]:
while batch_job.output_file_id is None and batch_job.errors is None:
    batch_job = client.batches.retrieve(batch_job.id)
    print(f'## output_file_id: {batch_job.output_file_id}, errors: {batch_job.errors}')

    if batch_job.output_file_id is None and batch_job.errors is None:
        print("---- Current Time:", datetime.now())
        # Sleep for 1 minute (60 seconds)
        time.sleep(60)

## output_file_id: None, errors: None
---- Current Time: 2024-08-07 06:56:58.722893
## output_file_id: None, errors: None
---- Current Time: 2024-08-07 06:57:59.000821
## output_file_id: None, errors: None
---- Current Time: 2024-08-07 06:58:59.243121
## output_file_id: None, errors: None
---- Current Time: 2024-08-07 06:59:59.526125
## output_file_id: file-DZ0BBvtmKTPp6dlsHxgLicql, errors: None


### Retrieve Batch Results

In [16]:
output_file_path = "/kaggle/tmp/batch_output.jsonl"

batch_output_file = client.files.content(batch_job.output_file_id).content
with open(output_file_path, 'wb') as file:
    file.write(batch_output_file)

In [17]:
new_questions_df = read_jsonl(output_file_path)

### Final Output


In [20]:
# combine the new data with the existing data
print("Before:", questions_df.shape)
questions_df = pd.concat([questions_df, new_questions_df], axis=1)
questions_df.index.name = 'keyword'

print("After:", questions_df.shape)
questions_df.to_csv("questions_bank.csv")

Before: (579, 151)
After: (579, 152)


In [21]:
questions_df.head()

Unnamed: 0_level_0,Is it man-made?,Is it used outdoors?,Is it something you can find in a bathroom?,Is it something people wear?,Is it something that can be eaten?,Is it something that has wheels?,Is it something that has a screen?,Is it something that has a smell?,Is it an animal?,Is it something that requires electricity or batteries?,...,Is it something that can be found in multiple colors?,Is it something that is known for its texture?,Is it something that is usually served on a plate?,Is it something that is usually served in a cup?,Is it a facility?,Is it a place to go?,Is it an object?,Is it tangible?,Is it made of metal,Is it made of metal
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Advertisement,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-2.0,-2.0,-2.0,-2.0,0,0,0,0,0,0
Agave,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
Air Conditioner,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-2.0,-2.0,-2.0,-2.0,1,0,1,1,0,0
Air compressor,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-2.0,-2.0,-2.0,-2.0,0,0,1,1,1,1
Air filter,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-2.0,-2.0,-2.0,-2.0,0,0,1,1,0,0
