In [114]:
import json
import os
import shutil
import glob
import time

import pandas as pd

from io import StringIO
from openai import OpenAI

In [115]:
credential_loc = "../../credentials.json"
batch_loc = "../../../../datasets/blogger/batch_lexical_difference/"
batch_sent_loc = "../../../../datasets/blogger/batch_lex_sent/"
batch_complete_loc = "../../../../datasets/blogger/batch_lex_complete/"

In [116]:
with open(credential_loc, 'r') as f:
    data = json.load(f)
    
os.environ["OPENAI_API_KEY"] = data['OPENAI_API_KEY']

client = OpenAI()

In [117]:
# List all .jsonl files in the batch_loc directory
files_to_be_processed = [
    f for f in os.listdir(batch_loc)
    if os.path.isfile(os.path.join(batch_loc, f)) and f.endswith('.jsonl')
]

In [118]:
len(files_to_be_processed)

0

In [119]:
# Function to process a single file with a custom description
def process_file(file_path, description):
    # Open the file and create a batch input file
    with open(file_path, "rb") as f:
        batch_input_file = client.files.create(file=f, purpose="batch")
    
    batch_input_file_id = batch_input_file.id

    # Create a batch job with the batch input file and custom description
    client.batches.create(
        input_file_id=batch_input_file_id,



























        
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": description
        }
    )
    print(f"Processed file: {file_path} with description: {description}")

In [120]:
def process_all_files(batch_loc, batch_sent_loc, files, batch_size=5, wait_time=600):

    total_files = len(files)
    
    for i in range(0, total_files, batch_size):
        batch_files = files[i:i+batch_size]
        
        for file_name in batch_files:
            file_path = os.path.join(batch_loc, file_name)
            description = os.path.splitext(file_name)[0]
            process_file(file_path, description)
            
            # Move the processed file to batch_sent_loc directory
            shutil.move(file_path, os.path.join(batch_sent_loc, file_name))
            print(f"Moved file: {file_name} to {batch_sent_loc}")
        
        print(f"Processed {len(batch_files)} files. Waiting for {wait_time} seconds...")
        time.sleep(wait_time)  # Wait for the specified time before processing the next batch

    print("All files processed and moved successfully.")

In [121]:
def save_as_jsonl(data, output_file_path):
    with open(output_file_path, 'w') as file:
        for _, row in data.iterrows():
            json.dump(row.to_dict(), file)
            file.write('\n')

In [122]:
process_all_files(batch_loc, batch_sent_loc, files_to_be_processed)

All files processed and moved successfully.


In [123]:
def fetch_batches(client, after=None):
    """
    Fetch batches from the client, optionally using an 'after' parameter to paginate.
    
    :param client: API client instance.
    :param after: The ID to paginate after.
    :return: List of batch data.
    """
    response = client.batches.list(limit=100, after=after)
    return response.data

def process_batches(client):
    """
    Process batches by fetching data, sorting, and paginating until no new data is retrieved.
    
    :param client: API client instance.
    :return: DataFrame containing all batch data.
    """
    batch_data = []
    after = None

    while True:
        # Fetch the current batch of data
        current_batches = fetch_batches(client, after)
        if not current_batches:
            print("No more batches to fetch.")
            break
        
        # Extract attributes and convert to DataFrame
        for batch in current_batches:
            batch_dict = {
                "id": batch.id,
                "completion_window": batch.completion_window,
                "created_at": batch.created_at,
                "endpoint": batch.endpoint,
                "input_file_id": batch.input_file_id,
                "object": batch.object,
                "status": batch.status,
                "cancelled_at": batch.cancelled_at,
                "cancelling_at": batch.cancelling_at,
                "completed_at": batch.completed_at,
                "error_file_id": batch.error_file_id,
                "errors": batch.errors,
                "expired_at": batch.expired_at,
                "expires_at": batch.expires_at,
                "failed_at": batch.failed_at,
                "finalizing_at": batch.finalizing_at,
                "in_progress_at": batch.in_progress_at,
                "metadata_description": batch.metadata.get('description', ''),
                "output_file_id": batch.output_file_id,
                "request_counts_completed": batch.request_counts.completed,
                "request_counts_failed": batch.request_counts.failed,
                "request_counts_total": batch.request_counts.total
            }
            batch_data.append(batch_dict)
        
        # Create a DataFrame from the current batch data
        batch_df = pd.DataFrame(batch_data)

        # Remove duplicate rows based on 'id'
        batch_df.drop_duplicates(subset='id', keep='last', inplace=True)
        
        # Sort DataFrame by 'created_at' column
        batch_df.sort_values(by='created_at', ascending=True, inplace=True)
        
        # Print the current DataFrame state for debugging
        print(f"Current DataFrame shape: {batch_df.shape}")
        
        # Update the 'after' parameter with the last batch ID for pagination
        last_batch_id = batch_df['id'].iloc[-1] if not batch_df.empty else None
        if last_batch_id == after:
            print("No new batches found.")
            break
        after = last_batch_id
    
    return batch_df

In [124]:
# fetch_batches(client)

In [125]:
batch_df = process_batches(client)

Current DataFrame shape: (100, 22)
Current DataFrame shape: (101, 22)
No new batches found.


In [126]:
batch_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
199,batch_XfBZk05jzQFLDj2GBHe6pq3u,24h,1723672875,/v1/chat/completions,file-8Q3SIMf8C06JMRLNOkJCHlnk,batch,completed,,,1723678756,...,,1723759275,,1723678704,1723672876,batch_171105,file-EVpH09R0q5PPlCo217pR5mXL,360,0,360
198,batch_6733c46065f081909dc464de74e183c5,24h,1731445856,/v1/chat/completions,file-dcM9InwUFyIUV6EBXOksfse2,batch,completed,,,1731447039,...,,1731532256,,1731446985,1731445857,batch_617540,file-lm66OG6AYZaFy2UgT0jEmwmF,470,0,470
197,batch_6733c463799c8190bbd2c5f484a85681,24h,1731445859,/v1/chat/completions,file-uGSKMDmktjB4mN9xnMvCp9Oo,batch,completed,,,1731446092,...,,1731532259,,1731446069,1731445920,batch_579188,file-8hGTyQEGGukOVEBLy8cXIffr,300,0,300
196,batch_6733c46504308190b1faa4560e9b5d74,24h,1731445861,/v1/chat/completions,file-LUWtcPApvS8dlSJGVo8fcin9,batch,completed,,,1731446148,...,,1731532261,,1731446121,1731445862,batch_448015,file-fqTIKpOpmzhiJ53tL2TB5qUD,320,0,320
195,batch_6733c466b9dc81908ac79deffec76720,24h,1731445862,/v1/chat/completions,file-tp4ydSkJK3YSQCmbNwt3Kbz4,batch,completed,,,1731446212,...,,1731532262,,1731446181,1731445863,batch_671397,file-8ZZldEBZaQwqQKthzZ72FQsW,390,0,390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,batch_6733f1d3ec0881908695a6f35d35c27d,24h,1731457492,/v1/chat/completions,file-HRTgFE98aCgMFZjMJoJRhKPN,batch,completed,,,1731459094,...,,1731543892,,1731459067,1731457493,batch_585356,file-6MNev6dNLO4CrtEvIHSLFOoU,300,0,300
102,batch_6733f1d692748190b2cca97e4f75b9fe,24h,1731457494,/v1/chat/completions,file-2Om6aYPbCzoCxqSRgPzr9BFK,batch,completed,,,1731459170,...,,1731543894,,1731459067,1731457495,batch_551652,file-FHAewWpqm9S31lzjLNPkXGqk,250,0,250
101,batch_6733f1d993f88190ae6b8db632005d5b,24h,1731457497,/v1/chat/completions,file-8uyUWpryBAcRmBnf0xiRxsh8,batch,completed,,,1731459114,...,,1731543897,,1731459068,1731457499,batch_491687,file-F29qNepzCsQ8tvZXKjniCZN5,480,0,480
100,batch_6733f1dad34481908c2e108ab48bf10b,24h,1731457498,/v1/chat/completions,file-H7Agsf8Uejjq7VfMNVy5oT3D,batch,completed,,,1731458875,...,,1731543898,,1731458747,1731457499,batch_463289,file-kfyTWy1Ql5v1ewkhaALc2oHQ,150,0,150


In [127]:
failed_df = batch_df[batch_df['status']== 'failed']

In [128]:
completed_df = batch_df[batch_df['status']== 'completed']

In [129]:
in_progress_df = batch_df[batch_df['status']== 'in_progress']

In [130]:
print(f"Failed: {len(failed_df)} - Completed: {len(completed_df)} - In Progress {len(in_progress_df)}")

Failed: 0 - Completed: 101 - In Progress 0


In [131]:
in_progress_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total


In [132]:
completed_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
199,batch_XfBZk05jzQFLDj2GBHe6pq3u,24h,1723672875,/v1/chat/completions,file-8Q3SIMf8C06JMRLNOkJCHlnk,batch,completed,,,1723678756,...,,1723759275,,1723678704,1723672876,batch_171105,file-EVpH09R0q5PPlCo217pR5mXL,360,0,360
198,batch_6733c46065f081909dc464de74e183c5,24h,1731445856,/v1/chat/completions,file-dcM9InwUFyIUV6EBXOksfse2,batch,completed,,,1731447039,...,,1731532256,,1731446985,1731445857,batch_617540,file-lm66OG6AYZaFy2UgT0jEmwmF,470,0,470
197,batch_6733c463799c8190bbd2c5f484a85681,24h,1731445859,/v1/chat/completions,file-uGSKMDmktjB4mN9xnMvCp9Oo,batch,completed,,,1731446092,...,,1731532259,,1731446069,1731445920,batch_579188,file-8hGTyQEGGukOVEBLy8cXIffr,300,0,300
196,batch_6733c46504308190b1faa4560e9b5d74,24h,1731445861,/v1/chat/completions,file-LUWtcPApvS8dlSJGVo8fcin9,batch,completed,,,1731446148,...,,1731532261,,1731446121,1731445862,batch_448015,file-fqTIKpOpmzhiJ53tL2TB5qUD,320,0,320
195,batch_6733c466b9dc81908ac79deffec76720,24h,1731445862,/v1/chat/completions,file-tp4ydSkJK3YSQCmbNwt3Kbz4,batch,completed,,,1731446212,...,,1731532262,,1731446181,1731445863,batch_671397,file-8ZZldEBZaQwqQKthzZ72FQsW,390,0,390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,batch_6733f1d3ec0881908695a6f35d35c27d,24h,1731457492,/v1/chat/completions,file-HRTgFE98aCgMFZjMJoJRhKPN,batch,completed,,,1731459094,...,,1731543892,,1731459067,1731457493,batch_585356,file-6MNev6dNLO4CrtEvIHSLFOoU,300,0,300
102,batch_6733f1d692748190b2cca97e4f75b9fe,24h,1731457494,/v1/chat/completions,file-2Om6aYPbCzoCxqSRgPzr9BFK,batch,completed,,,1731459170,...,,1731543894,,1731459067,1731457495,batch_551652,file-FHAewWpqm9S31lzjLNPkXGqk,250,0,250
101,batch_6733f1d993f88190ae6b8db632005d5b,24h,1731457497,/v1/chat/completions,file-8uyUWpryBAcRmBnf0xiRxsh8,batch,completed,,,1731459114,...,,1731543897,,1731459068,1731457499,batch_491687,file-F29qNepzCsQ8tvZXKjniCZN5,480,0,480
100,batch_6733f1dad34481908c2e108ab48bf10b,24h,1731457498,/v1/chat/completions,file-H7Agsf8Uejjq7VfMNVy5oT3D,batch,completed,,,1731458875,...,,1731543898,,1731458747,1731457499,batch_463289,file-kfyTWy1Ql5v1ewkhaALc2oHQ,150,0,150


In [133]:
completed_df[completed_df['output_file_id'] == 'file-EVpH09R0q5PPlCo217pR5mXL'].columns

Index(['id', 'completion_window', 'created_at', 'endpoint', 'input_file_id',
       'object', 'status', 'cancelled_at', 'cancelling_at', 'completed_at',
       'error_file_id', 'errors', 'expired_at', 'expires_at', 'failed_at',
       'finalizing_at', 'in_progress_at', 'metadata_description',
       'output_file_id', 'request_counts_completed', 'request_counts_failed',
       'request_counts_total'],
      dtype='object')

In [134]:
# Define the function to process files
def process_batch_files(batch_complete_loc, batch_sent_loc, completed_df):
    # Define the function to extract the content from the response
    def extract_content(response):
        try:
            content = response['body']['choices'][0]['message']['content']
            return content
        except (KeyError, TypeError):
            return None

    # Get the list of files already in the batch_complete_loc
    existing_files = [f.replace('.jsonl', '') for f in os.listdir(batch_complete_loc) if f.endswith('.jsonl')]

    # Filter out rows in completed_df where metadata_description matches existing files
    df_to_process = completed_df[~completed_df['metadata_description'].isin(existing_files)]

    # Loop through the rows in the filtered DataFrame
    for index, row in df_to_process.iterrows():
        metadata_description = row['metadata_description']
        output_file_id = row['output_file_id']
        print(output_file_id)

        # Call the API to get the file content
        file_response = client.files.content(output_file_id)
        jsonl_io = StringIO(file_response.text)
        df = pd.read_json(jsonl_io, lines=True)

        # Apply the function to extract the 'content' from the 'response' column
        df['response'] = df['response'].apply(extract_content)

        # Select only the required columns
        df = df[['id', 'custom_id', 'response']]

        # Save the DataFrame as a jsonl file in batch_complete_loc
        output_filepath = os.path.join(batch_complete_loc, f"{metadata_description}.jsonl")
        df.to_json(output_filepath, orient='records', lines=True)

        # Move the file from batch_sent_loc to batch_complete_loc
        sent_filepath = os.path.join(batch_sent_loc, f"{metadata_description}.jsonl")
        if os.path.exists(sent_filepath):
            os.remove(sent_filepath)
            print(f"File {sent_filepath} moved to {batch_complete_loc}")

In [135]:
completed_filter = completed_df[completed_df['id'] != 'batch_XfBZk05jzQFLDj2GBHe6pq3u']

In [136]:
completed_filter

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
198,batch_6733c46065f081909dc464de74e183c5,24h,1731445856,/v1/chat/completions,file-dcM9InwUFyIUV6EBXOksfse2,batch,completed,,,1731447039,...,,1731532256,,1731446985,1731445857,batch_617540,file-lm66OG6AYZaFy2UgT0jEmwmF,470,0,470
197,batch_6733c463799c8190bbd2c5f484a85681,24h,1731445859,/v1/chat/completions,file-uGSKMDmktjB4mN9xnMvCp9Oo,batch,completed,,,1731446092,...,,1731532259,,1731446069,1731445920,batch_579188,file-8hGTyQEGGukOVEBLy8cXIffr,300,0,300
196,batch_6733c46504308190b1faa4560e9b5d74,24h,1731445861,/v1/chat/completions,file-LUWtcPApvS8dlSJGVo8fcin9,batch,completed,,,1731446148,...,,1731532261,,1731446121,1731445862,batch_448015,file-fqTIKpOpmzhiJ53tL2TB5qUD,320,0,320
195,batch_6733c466b9dc81908ac79deffec76720,24h,1731445862,/v1/chat/completions,file-tp4ydSkJK3YSQCmbNwt3Kbz4,batch,completed,,,1731446212,...,,1731532262,,1731446181,1731445863,batch_671397,file-8ZZldEBZaQwqQKthzZ72FQsW,390,0,390
194,batch_6733c4683ccc81908355a6a3af1dc2ff,24h,1731445864,/v1/chat/completions,file-p9mDv1QBAgUbOiEf0EPVWPsL,batch,completed,,,1731446141,...,,1731532264,,1731446121,1731445865,batch_543206,file-JKj9HmzTGIIxy3ERfrWuQ4MR,250,0,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,batch_6733f1d3ec0881908695a6f35d35c27d,24h,1731457492,/v1/chat/completions,file-HRTgFE98aCgMFZjMJoJRhKPN,batch,completed,,,1731459094,...,,1731543892,,1731459067,1731457493,batch_585356,file-6MNev6dNLO4CrtEvIHSLFOoU,300,0,300
102,batch_6733f1d692748190b2cca97e4f75b9fe,24h,1731457494,/v1/chat/completions,file-2Om6aYPbCzoCxqSRgPzr9BFK,batch,completed,,,1731459170,...,,1731543894,,1731459067,1731457495,batch_551652,file-FHAewWpqm9S31lzjLNPkXGqk,250,0,250
101,batch_6733f1d993f88190ae6b8db632005d5b,24h,1731457497,/v1/chat/completions,file-8uyUWpryBAcRmBnf0xiRxsh8,batch,completed,,,1731459114,...,,1731543897,,1731459068,1731457499,batch_491687,file-F29qNepzCsQ8tvZXKjniCZN5,480,0,480
100,batch_6733f1dad34481908c2e108ab48bf10b,24h,1731457498,/v1/chat/completions,file-H7Agsf8Uejjq7VfMNVy5oT3D,batch,completed,,,1731458875,...,,1731543898,,1731458747,1731457499,batch_463289,file-kfyTWy1Ql5v1ewkhaALc2oHQ,150,0,150


In [137]:
process_batch_files(batch_complete_loc, batch_sent_loc, completed_filter)

file-lm66OG6AYZaFy2UgT0jEmwmF
file-8hGTyQEGGukOVEBLy8cXIffr
file-fqTIKpOpmzhiJ53tL2TB5qUD
file-8ZZldEBZaQwqQKthzZ72FQsW
file-JKj9HmzTGIIxy3ERfrWuQ4MR
file-deibr8g1MrLVmvYSKFITa43m
file-XOh3VVmuiB5GYSyZ6p0vn82S
file-8Z8GPJf9p4ZKXDiaqeRh16tL
file-DLT2ENmV4CN50cxWQhgY2Tgz
file-su9oXxOYPyhOJDrDXNFYI7eq
file-ViI1FAqWsxfdE9qmOU4P0yV2
file-o5prdB5TOjuZzSxOJOc0Vdtf
file-0OXUBPZDw9Be3jV5WSs5vua9
file-JyhOwvnhWh8XDb7xQss5MPQK
file-OW5h3GDIT5Hu8aCojnzDLh8T
file-akeonkq5bBhjC8vRkFy5cArO
file-HQdWEC0XIQvdDseiH9ajiINH
file-SSXFFfEweQoKcOlQJ5O1H86g
file-ayTBhbMfjDVwSLAa4aYRXO1D
file-Nsxvyf7hEnd9WouGZWwcU1u2
file-YbpVMiwHZ48uQDnLs161JTXq
file-qFxsvK64fc51qWHB67KUgZll
file-8SujtvUB2glS287VIFtY8Rim
file-fn6z1yhwHAl7jkw3gyC9Amja
file-PAHyCtoMr4iPFMLAfN9XgVYb
file-TMZShWecLPOHJaKNSDAYKwOQ
file-RJmVYet5rQCUSMmFLyMrAqXk
file-4yKDe2IkXvhM5opmofXzbfkA
file-HnQ6EDfJidMf9gwYbAPDILI2
file-Nq3Qu78kbGKQUR0SwI893PFm
file-VxNil9EVFNW60nbO4UvtUjqT
file-7RU1ItwxaTNmXVFiklmKfqWP
file-t4jvXlef4kmuYxWA5I6WpstI
file-JFVkU

In [138]:
failed_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total


In [112]:
# Define the function to handle failed files
def handle_failed_files(batch_loc, batch_sent, failed_df):
    # Loop through the rows in the failed_df DataFrame
    for index, row in failed_df.iterrows():
        metadata_description = row['metadata_description']
        filename = f"{metadata_description}.jsonl"
        
        # Define the source and destination file paths
        sent_filepath = os.path.join(batch_sent, filename)
        loc_filepath = os.path.join(batch_loc, filename)
        
        # Check if the file exists in batch_sent
        if os.path.exists(sent_filepath):
            # Move the file to batch_loc
            shutil.move(sent_filepath, loc_filepath)
            print(f"File {filename} moved from {batch_sent} to {batch_loc}")

In [113]:
handle_failed_files(batch_loc, batch_sent_loc, failed_df)