In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import time
import pandas

load_dotenv()

True

In [3]:
# Look up the OpenAI key in the process environment (load_dotenv() ran in the
# imports cell, so values from a local .env file are visible here too).
api_key = os.environ.get("OPENAI_API_KEY")

# Pass the key explicitly rather than relying on the client's implicit
# environment lookup; the client used for every batch call below.
client = OpenAI(api_key=api_key)

In [4]:
# Load the course catalogue. Per the rendered output, each row carries
# college, department, number, course and description columns (plus two
# leftover "Unnamed" index columns from earlier CSV round-trips).
course_df = pandas.read_csv(filepath_or_buffer="../data/courses.csv")

# Bare trailing expression so the notebook renders the frame.
course_df

Unnamed: 0.1,Unnamed: 0,college,department,number,course,description
0,0,CAS,Anthropology,Emphasis Programs in Anthropology,,Anthropology majors have the option of complet...
1,1,CAS,Anthropology,1,Introduction to Biological Anthropology,"Using an evolutionary framework, we examine ho..."
2,2,CAS,Anthropology,2,Introduction to Archaeology,How do archaeologists understand the past? Thi...
3,3,CAS,Anthropology,3,Introduction Cultural Anthropology,This course provides an introduction to the su...
4,4,CAS,Anthropology,4,Vanished Peoples and Lost Civilizations,“Popular archaeology” is addressed by examinin...
...,...,...,...,...,...,...
2681,2681,SOE,Mechanical Engineering,194,Advanced Design I: Tools,Design tools basic to all aspects of mechanica...
2682,2682,SOE,Mechanical Engineering,195,Advanced Design II: Implementation,Implementation of design strategy. Detail desi...
2683,2683,SOE,Mechanical Engineering,196,Advanced Design III: Completion and Evaluation,"Design projects completed, assembled, tested, ..."
2684,2684,SOE,Mechanical Engineering,198,Independent Study,By arrangement with faculty. (1–5 units)


In [6]:
# Name the batch input file after today's date (YYYYMMDD). The file holds one
# JSON request per line (JSONL), even though the extension is ".json".
BATCH_FILE = f"batch_{time.strftime('%Y%m%d')}.json"

# Instruction block placed in front of every course description.
PROMPT = """
Extract course relationships from this description. Return a JSON object with this structure:
{
    "prerequisites": {
        "type": "AND",           // How prerequisites combine: "AND" or "OR"
        "courses": [             // Can contain course codes or nested groups
            "MATH 101",          // Simple prerequisite
            {                    // Nested group for complex logic
                "type": "OR",
                "courses": ["MATH 102", "AMTH 108"],
                "min_grade": "C-"
            }
        ],
        "min_grade": "C-"       // Default grade requirement for this group
    },
    "corequisites": ["MATH 101L"],  // List of concurrent courses
    "cross_listed": ["CSCI 147"],   // Cross-listed courses
    "notes": "Permission of instructor required"  // Special conditions
}

For complex prerequisites, use nested groups with their own type (AND/OR).
Each group can have its own grade requirement.
If any field is not applicable, use null or empty array.

Example of complex prerequisites:
For "MATH 122 or AMTH 108, and MATH/CSCI 146 with C- or better":
{
    "prerequisites": {
        "type": "AND",
        "courses": [
            {
                "type": "OR",
                "courses": ["MATH 122", "AMTH 108"],
                "min_grade": "C-"
            },
            {
                "type": "OR",
                "courses": ["MATH 146", "CSCI 146"],
                "min_grade": "C-"
            }
        ]
    }
}
"""

# Write one chat-completion request per course row to the batch file.
#
# custom_id is how batch results are matched back to their request, so it
# must be unique within the file. "<department>-<number>" alone is not
# guaranteed unique (some rows carry free text in the number column), so a
# repeated id gets a "#<n>" suffix; ids that were already unique are unchanged.
used_custom_ids = set()

with open(BATCH_FILE, "w", encoding="utf-8") as f:
    for _, row in course_df.iterrows():
        base_id = f"{row['department']}-{row['number']}"
        custom_id = base_id
        occurrence = 1
        while custom_id in used_custom_ids:
            occurrence += 1
            custom_id = f"{base_id}#{occurrence}"
        used_custom_ids.add(custom_id)

        # NOTE(review): a missing description is interpolated as the text
        # "nan" here - confirm whether such rows should be skipped instead.
        # Named `request` (not `input`) so the builtin input() is not
        # shadowed for the rest of the kernel session.
        request = {
            "custom_id": custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {
                        "role": "user",
                        "content": f"{PROMPT}\nCourse: {row['department']} {row['number']}\nDescription: {row['description']}\n\nProvide only valid JSON as response, no other text."
                    }
                ]
            }
        }
        f.write(json.dumps(request) + "\n")

In [None]:
# Choose the year whose batch-details file should be loaded; the path is
# built from BATCH_DIR, which must already be defined in the kernel.
year = '2023'
batch_details_file = f'{BATCH_DIR}/{year}_batch_details.json'

# The details file holds one JSON document per line; decode each line into
# its own list entry.
batch_details = []
with open(batch_details_file, 'r') as f:
    batch_details.extend(json.loads(raw_line) for raw_line in f)

# Bare trailing expression so the notebook shows the parsed entries.
batch_details

In [10]:
# Queue a single batch job for one (year, gap) combination.
year = '2023'
gap = '10'

# Entry 4 of batch_details is expected to hold the "<year>_<gap>_batch_id"
# key for this gap; its value is handed to the API as the input file id.
# NOTE(review): presumably the id of an already-uploaded batch input file - confirm.
batch_file_id = batch_details[4]['{}_{}_batch_id'.format(year, gap)]

batch_job = client.batches.create(
    completion_window="24h",
    endpoint='/v1/chat/completions',
    input_file_id=batch_file_id,
)

### Automatically queuing batches

In [4]:
def wait_for_batch_completion(client, batch_job_id, poll_interval=60):
    """
    Polls the status of the batch job until it reaches a terminal state.

    Polling stops on 'completed', 'failed', 'expired' or 'cancelled'. Any
    other status (for example 'validating', 'in_progress', 'finalizing' or
    'cancelling') is treated as still running and polled again.

    :param client: API client to interact with batch jobs; must expose
        ``batches.retrieve(batch_job_id)`` returning an object with a
        ``status`` attribute
    :param batch_job_id: ID of the batch job to monitor
    :param poll_interval: Time (in seconds) between each status check
    :return: The terminal status of the batch job ("completed", "failed",
        "expired" or "cancelled")
    """
    # Message printed for each status that ends the wait. Without the
    # 'expired' and 'cancelled' entries a batch that ends in one of those
    # states would keep this loop polling forever.
    terminal_messages = {
        'completed': f"Batch {batch_job_id} completed successfully.",
        'failed': f"Batch {batch_job_id} failed.",
        'expired': f"Batch {batch_job_id} expired before completing.",
        'cancelled': f"Batch {batch_job_id} was cancelled.",
    }

    while True:
        batch_status = client.batches.retrieve(batch_job_id).status

        if batch_status in terminal_messages:
            print(terminal_messages[batch_status])
            return batch_status

        if batch_status == 'finalizing':
            print(f"Batch {batch_job_id} is finalizing, waiting for completion...")

        time.sleep(poll_interval)


In [3]:
# This cell queues all the batches at once (every year and every year gap).
# To limit how many batches are in flight at a time, enable the
# wait_for_batch_completion calls at the bottom of the loop.
# NOTE: the OpenAI API limits how many tokens can be queued through batch
# processing, so adjust the lists or the waiting as needed.

years = ['2019', '2020', '2021', '2022', '2023']
gap = ['2', '4', '6', '8', '10']

# Every submitted job, keyed by (year, gap), so handles from earlier years
# stay reachable after the loop moves on to the next year.
queued_jobs = {}

for year in years:
    batch_details_file = f'{BATCH_DIR}/{year}_batch_details.json'

    # One JSON document per line.
    batch_details = []
    with open(batch_details_file, 'r') as f:
        for line in f:
            batch_details.append(json.loads(line))

    # Entry i of batch_details is expected to carry the key for gap[i].
    for position, year_gap in enumerate(gap):
        batch_file_id = batch_details[position][f'{year}_{year_gap}_batch_id']

        queued_jobs[(year, year_gap)] = client.batches.create(
            input_file_id=batch_file_id,
            endpoint='/v1/chat/completions',
            completion_window="24h"
        )

    # Keep the numbered handle names this cell originally defined for the
    # current year, in case other cells still refer to them.
    batch_job1, batch_job2, batch_job3, batch_job4, batch_job5 = (
        queued_jobs[(year, year_gap)] for year_gap in gap
    )

    # Uncomment to wait for this year's jobs before queuing the next year:
    # for year_gap in gap:
    #     wait_for_batch_completion(client, queued_jobs[(year, year_gap)].id)
