In [87]:
import json
import os
import shutil
import glob
import time

import pandas as pd

from io import StringIO
from openai import OpenAI

In [88]:
# JSON file that stores the OpenAI API key.
credential_loc = "../../credentials.json"

# Dataset split and corpus this run operates on.
data_type = "training"
corpus = "The Telegraph"

# Root folder for the chosen split/corpus; every other path hangs off it.
data_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/"
raw_data_loc = f"{data_loc}known_raw.jsonl"

# Batch request files move through three folders:
#   preprocessed (waiting) -> sent (submitted) -> complete (results saved)
batch_loc = f"{data_loc}batch_sentence_preprocessed/"
batch_sent_loc = f"{data_loc}batch_sentence_sent/"
batch_complete_loc = f"{data_loc}batch_sentence_complete/"

# Only the "sent" and "complete" folders are created here.
for _stage_dir in (batch_sent_loc, batch_complete_loc):
    os.makedirs(_stage_dir, exist_ok=True)

In [89]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being passed to the JSON parser. A line that holds a
    single-element list is unwrapped to that element.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame with one row per non-blank line of the file.

    Raises:
    - json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue

            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                parsed_line = parsed_line[0]
            records.append(parsed_line)

    return pd.DataFrame(records)

In [90]:
# Load the raw known-author documents and report how many there are.
raw_df = read_jsonl(raw_data_loc)
n_documents = len(raw_df['doc_id'])
print(f"Number of documents to process: {n_documents}")

Number of documents to process: 220


In [91]:
# Read the credentials JSON from credential_loc; the API key is stored under
# the "OPENAI_API_KEY" key of that file.
with open(credential_loc, 'r') as f:
    data = json.load(f)
    
# Export the key before building the client. OpenAI() is called with no
# arguments, so presumably it picks the key up from this environment
# variable -- keep this assignment ahead of the constructor call.
os.environ["OPENAI_API_KEY"] = data['OPENAI_API_KEY']

# Shared client used by the helper functions defined further down.
client = OpenAI()

In [92]:
def _jsonl_names(directory):
    """Return the names of the regular `.jsonl` files directly inside `directory`."""
    return [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name)) and name.endswith('.jsonl')
    ]


# Request files still waiting to be submitted, and result files already saved.
files_to_be_processed = _jsonl_names(batch_loc)
files_processed = _jsonl_names(batch_complete_loc)

n_pending = len(files_to_be_processed)
n_complete = len(files_processed)

print(f"Files to be processed: {n_pending}")
print(f"Files complete: {n_complete}")
print(f"Total Files: {n_complete + n_pending}")

Files to be processed: 115
Files complete: 105
Total Files: 220


In [66]:
def process_file(file_path, description):
    """
    Upload one JSONL request file and start a batch job for it.

    Relies on the notebook-level `client`.

    :param file_path: Path of the JSONL request file to upload.
    :param description: Text stored under the batch's "description" metadata key.
    :return: None. A confirmation line is printed once the batch is created.
    """
    # Upload the request file with purpose "batch" so a batch job can reference it.
    with open(file_path, "rb") as request_file:
        uploaded = client.files.create(file=request_file, purpose="batch")

    # Start the batch job against the chat-completions endpoint.
    job_metadata = {"description": description}
    client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata,
    )
    print(f"Processed file: {file_path} with description: {description}")

In [67]:
def process_all_files(batch_loc, batch_sent_loc, files, batch_size=5, wait_time=600):
    """
    Submit request files in groups, pausing between groups.

    Each file is submitted with `process_file` (its description is the file
    name without extension) and then moved from `batch_loc` to
    `batch_sent_loc`. The pause only happens when another group follows, so
    the call returns as soon as the last file has been submitted.

    NOTE: a file is moved as soon as it has been submitted; whether the batch
    later fails is not checked here.

    :param batch_loc: Directory holding the files waiting to be submitted.
    :param batch_sent_loc: Directory that submitted files are moved into.
    :param files: File names (relative to `batch_loc`) to submit.
    :param batch_size: Number of files submitted per group; must be >= 1.
    :param wait_time: Seconds to sleep between consecutive groups.
    :raises ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step, and a negative step would silently
        # submit nothing; fail loudly for both.
        raise ValueError("batch_size must be a positive integer")

    total_files = len(files)

    for start in range(0, total_files, batch_size):
        batch_files = files[start:start + batch_size]

        for file_name in batch_files:
            file_path = os.path.join(batch_loc, file_name)
            description = os.path.splitext(file_name)[0]
            process_file(file_path, description)

            # Move the processed file to batch_sent_loc directory
            shutil.move(file_path, os.path.join(batch_sent_loc, file_name))
            print(f"Moved file: {file_name} to {batch_sent_loc}")

        if start + batch_size >= total_files:
            # Last group: nothing is left to throttle, so do not sleep.
            print(f"Processed {len(batch_files)} files.")
        else:
            print(f"Processed {len(batch_files)} files. Waiting for {wait_time} seconds...")
            time.sleep(wait_time)  # Wait before submitting the next group

    print("All files processed and moved successfully.")

In [68]:
def save_as_jsonl(data, output_file_path):
    """
    Write a DataFrame to `output_file_path` as JSON Lines, one row per line.

    :param data: DataFrame whose rows are serialised.
    :param output_file_path: Destination path; an existing file is overwritten.
    """
    with open(output_file_path, 'w') as sink:
        for _, record in data.iterrows():
            sink.write(json.dumps(record.to_dict()))
            sink.write('\n')

In [69]:
# Submit every pending request file in groups of 3, with a 600-second wait
# (wait_time) separating consecutive groups. Submitted files are moved from
# batch_loc to batch_sent_loc.
process_all_files(batch_loc, batch_sent_loc, files_to_be_processed, batch_size=3, wait_time=600)

Processed file: /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/batch_helenbrown_text_1.jsonl with description: batch_helenbrown_text_1
Moved file: batch_helenbrown_text_1.jsonl to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/
Processed file: /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/batch_helenyemm_text_1.jsonl with description: batch_helenyemm_text_1
Moved file: batch_helenyemm_text_1.jsonl to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/
Processed file: /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/batch_helenyemm_text_3.jsonl with description: batch_helenyemm_text_3
Moved file: batch_helenyemm_text_3.jsonl to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/
Processed 3 files. Waiting for 600 seconds...
Proces

In [70]:
def fetch_batches(client, after=None):
    """
    Retrieve a single page (up to 100 entries) of batches.

    :param client: API client instance.
    :param after: Batch ID to continue after; None requests the first page.
    :return: The page's `data` list of batch objects.
    """
    page = client.batches.list(after=after, limit=100)
    return page.data

def process_batches(client):
    """
    Collect every batch visible to `client` into one DataFrame.

    Pages are requested with `fetch_batches` until an empty page comes back.
    The pagination cursor is the ID of the last item of the page as the API
    returned it. It must not come from a frame re-sorted by `created_at`,
    because that yields the newest batch and paging then stalls after the
    first page.

    :param client: API client instance.
    :return: DataFrame with one row per batch ID, sorted by `created_at`
             ascending. When there are no batches it is empty but still has
             all columns.
    """
    columns = [
        "id", "completion_window", "created_at", "endpoint", "input_file_id",
        "object", "status", "cancelled_at", "cancelling_at", "completed_at",
        "error_file_id", "errors", "expired_at", "expires_at", "failed_at",
        "finalizing_at", "in_progress_at", "metadata_description",
        "output_file_id", "request_counts_completed", "request_counts_failed",
        "request_counts_total",
    ]
    batch_data = []
    after = None

    while True:
        # Fetch the current page of batches
        current_batches = fetch_batches(client, after)
        if not current_batches:
            print("No more batches to fetch.")
            break

        for batch in current_batches:
            # metadata / request_counts may be absent on a batch; treat them
            # as empty instead of raising AttributeError.
            metadata = batch.metadata or {}
            counts = batch.request_counts
            derived = {
                "metadata_description": metadata.get('description', ''),
                "request_counts_completed": getattr(counts, "completed", None),
                "request_counts_failed": getattr(counts, "failed", None),
                "request_counts_total": getattr(counts, "total", None),
            }
            # Every other column is copied straight from the attribute of
            # the same name.
            batch_data.append({
                name: derived[name] if name in derived else getattr(batch, name)
                for name in columns
            })

        # Continue after the last item of this page, in API order.
        last_batch_id = current_batches[-1].id
        if last_batch_id == after:
            # Defensive stop in case the cursor does not advance.
            print("No new batches found.")
            break
        after = last_batch_id

    # Build the frame once; explicit columns keep the schema when empty.
    batch_df = pd.DataFrame(batch_data, columns=columns)
    batch_df = batch_df.drop_duplicates(subset='id', keep='last')
    return batch_df.sort_values(by='created_at', ascending=True)

In [71]:
def fetch_batches(client, after=None):
    """
    Request one page of batches from the API, 100 entries at most.

    :param client: API client instance.
    :param after: ID of the batch to paginate after (None for the first page).
    :return: List of batch objects contained in that page.
    """
    return client.batches.list(limit=100, after=after).data

def process_batches(client):
    """
    Process batches by fetching pages until no new data is retrieved.

    Batches without metadata or request counts get an empty description and
    missing counts instead of raising. When no batches exist, an empty frame
    with the full column set is returned, so downstream filters such as
    `batch_df['status'] == ...` keep working.

    :param client: API client instance.
    :return: DataFrame containing all batch data, de-duplicated on `id` and
             sorted by `created_at` ascending.
    """
    def to_record(batch):
        """Flatten one batch object into a plain dict (one DataFrame row)."""
        metadata = batch.metadata or {}   # metadata can be None
        counts = batch.request_counts     # may be None as well
        return {
            "id": batch.id,
            "completion_window": batch.completion_window,
            "created_at": batch.created_at,
            "endpoint": batch.endpoint,
            "input_file_id": batch.input_file_id,
            "object": batch.object,
            "status": batch.status,
            "cancelled_at": batch.cancelled_at,
            "cancelling_at": batch.cancelling_at,
            "completed_at": batch.completed_at,
            "error_file_id": batch.error_file_id,
            "errors": batch.errors,
            "expired_at": batch.expired_at,
            "expires_at": batch.expires_at,
            "failed_at": batch.failed_at,
            "finalizing_at": batch.finalizing_at,
            "in_progress_at": batch.in_progress_at,
            "metadata_description": metadata.get('description', ''),
            "output_file_id": batch.output_file_id,
            "request_counts_completed": getattr(counts, "completed", None),
            "request_counts_failed": getattr(counts, "failed", None),
            "request_counts_total": getattr(counts, "total", None),
        }

    # Column order of the result; also the schema used when nothing is fetched.
    columns = [
        "id", "completion_window", "created_at", "endpoint", "input_file_id",
        "object", "status", "cancelled_at", "cancelling_at", "completed_at",
        "error_file_id", "errors", "expired_at", "expires_at", "failed_at",
        "finalizing_at", "in_progress_at", "metadata_description",
        "output_file_id", "request_counts_completed", "request_counts_failed",
        "request_counts_total",
    ]

    all_batches = []  # This will collect all batch dictionaries
    after = None

    while True:
        # Fetch the current page of batches
        current_batches = fetch_batches(client, after)
        if not current_batches:
            print("No more batches to fetch.")
            break

        all_batches.extend(to_record(batch) for batch in current_batches)

        # Update the 'after' parameter with the last batch ID for pagination
        last_batch_id = current_batches[-1].id
        if last_batch_id == after:
            # Defensive stop in case the cursor does not advance.
            print("No new batches found.")
            break
        after = last_batch_id

    # Create a single DataFrame after collecting all batches, then drop
    # duplicate ids and order by creation time.
    return (
        pd.DataFrame(all_batches, columns=columns)
        .drop_duplicates(subset='id', keep='last')
        .sort_values(by='created_at', ascending=True)
    )


In [72]:
# Ad hoc check: fetch only the first page of batches (fetch_batches asks for
# at most 100) and show the raw objects.
fetch_batches(client)

[Batch(id='batch_6778bdaceca881908231dd60ac546e6f', completion_window='24h', created_at=1735966125, endpoint='/v1/chat/completions', input_file_id='file-PqTfyspu9P3hg7UYqqJtrv', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-Sb2daX4hNaYB1SUdvsFFH7EZ. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1736052525, failed_at=1735966125, finalizing_at=None, in_progress_at=None, metadata={'description': 'batch_clivejames_text_2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)),
 Batch(id='batch_6778bb51155c819090eb546005419653', completion_window='24h', created_at=1735965521, endpoint='/v1/chat/completions', input_file_id='file-SmeCQNkw

In [73]:
# Page through all batches and gather them into a single DataFrame
# (one row per batch id, ordered by created_at).
batch_df = process_batches(client)

No more batches to fetch.


In [74]:
# Display the full batch table for a visual check.
batch_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
570,batch_676825247ad48190a6f08d4e76e89017,24h,1734878500,/v1/chat/completions,file-6fi5kfDUeao21Js4JR8b6e,batch,failed,,,,...,,1734964900,1.734878e+09,,,batch_The Telegraph_AdrianBridge,,0,0,0
569,batch_676825270fcc8190a8ac2038f6faebcd,24h,1734878503,/v1/chat/completions,file-5iFGuwEexjn54QSuodQjeq,batch,failed,,,,...,,1734964903,1.734879e+09,,,batch_The Telegraph_AislinnLaing,,0,0,0
568,batch_6768252a23488190af4daffcc4955131,24h,1734878506,/v1/chat/completions,file-PnkDD9DL3zB2MGjG9RoTDq,batch,failed,,,,...,,1734964906,1.734879e+09,,,batch_The Telegraph_AlanHansen,,0,0,0
567,batch_6768252dc88481908aa81f3aee15f22c,24h,1734878509,/v1/chat/completions,file-Apr6U4sjd4U2cBftVJz7DN,batch,failed,,,,...,,1734964909,1.734879e+09,,,batch_The Telegraph_AlanSmith,,0,0,0
566,batch_676825305b348190a3153dbb01be772c,24h,1734878512,/v1/chat/completions,file-SQX8GXzoqpKy2wWswojwxk,batch,failed,,,,...,,1734964912,1.734879e+09,,,batch_The Telegraph_AlanTitchmarsh,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,batch_6778b8f1963c8190a1a9486e2b09cb43,24h,1735964913,/v1/chat/completions,file-Wdngqpr2DRRQRCbeaVXj75,batch,failed,,,,...,,1736051313,1.735965e+09,,,batch_claireduffin_text_2,,0,0,0
3,batch_6778bb4c3ca0819081007036dbdee1d5,24h,1735965516,/v1/chat/completions,file-UQGgQzHHYGs5iJZiR3bKHX,batch,failed,,,,...,,1736051916,1.735966e+09,,,batch_claireduffin_text_3,,0,0,0
2,batch_6778bb4f1464819086be5d9bdd7a0fe0,24h,1735965519,/v1/chat/completions,file-CzWaqbEcNuQbf9k4rre5B4,batch,failed,,,,...,,1736051919,1.735966e+09,,,batch_cliveaslet_text_1,,0,0,0
1,batch_6778bb51155c819090eb546005419653,24h,1735965521,/v1/chat/completions,file-SmeCQNkwfwVK2ZUYNzx8s9,batch,completed,,,1.735966e+09,...,,1736051921,,1.735966e+09,1.735966e+09,batch_cliveaslet_text_3,file-K3YE81JzK7MTgXD6iK6CXb,220,0,220


In [75]:
# Batches whose status is "failed".
failed_df = batch_df.loc[batch_df['status'] == 'failed']

In [76]:
# Batches whose status is "completed".
completed_df = batch_df.loc[batch_df['status'] == 'completed']

In [77]:
# Batches whose status is exactly "in_progress".
# NOTE(review): batches in any other non-terminal status are not captured by
# this filter -- confirm whether they should count as pending too.
in_progress_df = batch_df.loc[batch_df['status'] == 'in_progress']

In [78]:
# One-line summary of how many batches fall into each of the three status subsets.
print(f"Failed: {len(failed_df)} - Completed: {len(completed_df)} - In Progress {len(in_progress_df)}")

Failed: 466 - Completed: 105 - In Progress 0


In [79]:
# Display the batches that are still in progress.
in_progress_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total


In [80]:
# Display the completed batches.
completed_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
460,batch_676875ee0be48190bffc2e78934bffee,24h,1734899182,/v1/chat/completions,file-8Qep8LF4HmtkJRkskshbL1,batch,completed,,,1.734909e+09,...,,1734985582,,1.734909e+09,1.734899e+09,batch_adrianbridge_text_1,file-19qKyHbN4tuMgXfHQi5s9w,380,0,380
459,batch_676875f1296c81909ddeb532ce4de20e,24h,1734899185,/v1/chat/completions,file-XKGGSKR7MuabVoLoTDAnJh,batch,completed,,,1.734909e+09,...,,1734985585,,1.734909e+09,1.734899e+09,batch_adrianbridge_text_2,file-9hdr8gKLqkwHPN4SmDAsf9,230,0,230
458,batch_676875f3ba5c8190bea3ac9d42fd3cdb,24h,1734899187,/v1/chat/completions,file-AJVucrCogsLi9q2c8Wu5Dn,batch,completed,,,1.734902e+09,...,,1734985587,,1.734902e+09,1.734899e+09,batch_aislinnlaing_text_1,file-MCGCkkYk46DAbgucQezq2e,150,0,150
457,batch_676875f524e88190a721fb66be28f4b1,24h,1734899189,/v1/chat/completions,file-RHEBxSMqnYS8aMPb2af8rH,batch,completed,,,1.734909e+09,...,,1734985589,,1.734909e+09,1.734899e+09,batch_aislinnlaing_text_2,file-1GEm5w318zftkda4uk5HZo,190,0,190
456,batch_676875f824308190ad32fdfb2068147e,24h,1734899192,/v1/chat/completions,file-EAxdwcvGhMEMeDHSu4w2nC,batch,completed,,,1.734909e+09,...,,1734985592,,1.734909e+09,1.734899e+09,batch_alanhansen_text_1,file-4SYJYZySQxbDGrjs5y3ywj,310,0,310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,batch_67789c5daf70819099ab56d59f346e4f,24h,1735957597,/v1/chat/completions,file-X9RbfYKHAq3DQi54Z3rxcQ,batch,completed,,,1.735959e+09,...,,1736043997,,1.735959e+09,1.735958e+09,batch_cameronmacphail_text_1,file-1175iJrjAtiW4bPTk9FdKe,1050,0,1050
10,batch_6778b42e57d88190ba0e3dbeeec62b98,24h,1735963694,/v1/chat/completions,file-GSBRAw4Bzg5SpGUsjai7Ng,batch,completed,,,1.735964e+09,...,,1736050094,,1.735964e+09,1.735964e+09,batch_christopherwilliams_text_1,file-UX4YaHJX9YMNtZFKVhGHHU,130,0,130
9,batch_6778b6898cf08190919b1f3caba56d78,24h,1735964297,/v1/chat/completions,file-J9EAjZcmhR8CASK4Z4aZi7,batch,completed,,,1.735965e+09,...,,1736050697,,1.735965e+09,1.735964e+09,batch_christopherwilliams_text_3,file-M4tUAAcYE7F5a2zSFctCV2,190,0,190
5,batch_6778b8eef6248190b4a5e7fb48f06102,24h,1735964911,/v1/chat/completions,file-U5ZrckUgRqUcE4Qn2YpU7E,batch,completed,,,1.735966e+09,...,,1736051311,,1.735966e+09,1.735965e+09,batch_clairecohen_text_3,file-KCgrvrNNhN4Vdxvkk4WBNJ,290,0,290


In [81]:
# List the columns available on the completed batches. Filtering rows first
# (the original filtered on a hard-coded output file id) does not change
# `.columns`, so the frame is inspected directly.
completed_df.columns

Index(['id', 'completion_window', 'created_at', 'endpoint', 'input_file_id',
       'object', 'status', 'cancelled_at', 'cancelling_at', 'completed_at',
       'error_file_id', 'errors', 'expired_at', 'expires_at', 'failed_at',
       'finalizing_at', 'in_progress_at', 'metadata_description',
       'output_file_id', 'request_counts_completed', 'request_counts_failed',
       'request_counts_total'],
      dtype='object')

In [82]:
def process_batch_files(batch_complete_loc, batch_sent_loc, completed_df):
    """
    Download and save the results of completed batches.

    For every row of `completed_df` whose description has no result file in
    `batch_complete_loc` yet, the batch output is downloaded through the
    notebook-level `client`, reduced to the columns `id`, `custom_id` and
    `response` (the message text), and written to
    `<batch_complete_loc>/<description>.jsonl`. The request file of the same
    name in `batch_sent_loc` is then deleted, if present.

    Rows without an output file id are skipped with a message instead of
    failing the whole run.

    :param batch_complete_loc: Directory the result files are written to.
    :param batch_sent_loc: Directory holding the submitted request files.
    :param completed_df: DataFrame with at least the columns
                         `metadata_description` and `output_file_id`.
    """
    def extract_content(response):
        """Return the first choice's message text, or None if it is absent."""
        try:
            return response['body']['choices'][0]['message']['content']
        except (KeyError, TypeError, IndexError):
            # IndexError covers a response whose `choices` list is empty.
            return None

    # Descriptions that already have a result file. splitext mirrors how the
    # description was derived from the file name when the batch was created.
    existing_files = {
        os.path.splitext(f)[0]
        for f in os.listdir(batch_complete_loc)
        if f.endswith('.jsonl')
    }

    # Filter out rows in completed_df where metadata_description matches existing files
    df_to_process = completed_df[~completed_df['metadata_description'].isin(existing_files)]

    for _, row in df_to_process.iterrows():
        metadata_description = row['metadata_description']
        output_file_id = row['output_file_id']

        if pd.isna(output_file_id) or not output_file_id:
            # Presumably a batch whose requests produced no output; there is
            # nothing to download for it.
            print(f"Skipping {metadata_description}: no output file id")
            continue
        print(output_file_id)

        # Call the API to get the file content
        file_response = client.files.content(output_file_id)
        df = pd.read_json(StringIO(file_response.text), lines=True)

        # Replace the nested response object with just its message text
        df['response'] = df['response'].apply(extract_content)

        # Select only the required columns
        df = df[['id', 'custom_id', 'response']]

        # Save the DataFrame as a jsonl file in batch_complete_loc
        output_filepath = os.path.join(batch_complete_loc, f"{metadata_description}.jsonl")
        df.to_json(output_filepath, orient='records', lines=True)

        # The request file is no longer needed once results are saved: it is
        # deleted (not moved), and the message now says so.
        sent_filepath = os.path.join(batch_sent_loc, f"{metadata_description}.jsonl")
        if os.path.exists(sent_filepath):
            os.remove(sent_filepath)
            print(f"Removed {sent_filepath}; results saved to {output_filepath}")

In [83]:
# Download and save the results of every completed batch that has no result
# file in batch_complete_loc yet.
process_batch_files(batch_complete_loc, batch_sent_loc, completed_df)

file-3QZLvYNSTsAGM7EtGWA1Uh
File /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/batch_helenbrown_text_1.jsonl moved to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_complete/
file-LT2zpgoXBnxAWhkF1Dsj2q
File /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/batch_helenyemm_text_1.jsonl moved to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_complete/
file-Vij52HASdnWPmd6p8eUnVu
File /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/batch_helenyemm_text_3.jsonl moved to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_complete/
file-32DZphWVK5hqJgxLAQyUEF
File /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/batch_henrysamuel_text_2.jsonl moved to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence

In [84]:
# Display the failed batches.
failed_df

Unnamed: 0,id,completion_window,created_at,endpoint,input_file_id,object,status,cancelled_at,cancelling_at,completed_at,...,expired_at,expires_at,failed_at,finalizing_at,in_progress_at,metadata_description,output_file_id,request_counts_completed,request_counts_failed,request_counts_total
570,batch_676825247ad48190a6f08d4e76e89017,24h,1734878500,/v1/chat/completions,file-6fi5kfDUeao21Js4JR8b6e,batch,failed,,,,...,,1734964900,1.734878e+09,,,batch_The Telegraph_AdrianBridge,,0,0,0
569,batch_676825270fcc8190a8ac2038f6faebcd,24h,1734878503,/v1/chat/completions,file-5iFGuwEexjn54QSuodQjeq,batch,failed,,,,...,,1734964903,1.734879e+09,,,batch_The Telegraph_AislinnLaing,,0,0,0
568,batch_6768252a23488190af4daffcc4955131,24h,1734878506,/v1/chat/completions,file-PnkDD9DL3zB2MGjG9RoTDq,batch,failed,,,,...,,1734964906,1.734879e+09,,,batch_The Telegraph_AlanHansen,,0,0,0
567,batch_6768252dc88481908aa81f3aee15f22c,24h,1734878509,/v1/chat/completions,file-Apr6U4sjd4U2cBftVJz7DN,batch,failed,,,,...,,1734964909,1.734879e+09,,,batch_The Telegraph_AlanSmith,,0,0,0
566,batch_676825305b348190a3153dbb01be772c,24h,1734878512,/v1/chat/completions,file-SQX8GXzoqpKy2wWswojwxk,batch,failed,,,,...,,1734964912,1.734879e+09,,,batch_The Telegraph_AlanTitchmarsh,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,batch_6778b8eb67b08190bfda1a4c323e293f,24h,1735964907,/v1/chat/completions,file-6ZcnBw7LfYzRRj4PgGViJV,batch,failed,,,,...,,1736051307,1.735965e+09,,,batch_clairecohen_text_1,,0,0,0
4,batch_6778b8f1963c8190a1a9486e2b09cb43,24h,1735964913,/v1/chat/completions,file-Wdngqpr2DRRQRCbeaVXj75,batch,failed,,,,...,,1736051313,1.735965e+09,,,batch_claireduffin_text_2,,0,0,0
3,batch_6778bb4c3ca0819081007036dbdee1d5,24h,1735965516,/v1/chat/completions,file-UQGgQzHHYGs5iJZiR3bKHX,batch,failed,,,,...,,1736051916,1.735966e+09,,,batch_claireduffin_text_3,,0,0,0
2,batch_6778bb4f1464819086be5d9bdd7a0fe0,24h,1735965519,/v1/chat/completions,file-CzWaqbEcNuQbf9k4rre5B4,batch,failed,,,,...,,1736051919,1.735966e+09,,,batch_cliveaslet_text_1,,0,0,0


In [85]:
def handle_failed_files(batch_loc, batch_sent, failed_df):
    """
    Put the request files of failed batches back into the pending directory.

    For each row of `failed_df`, the file `<metadata_description>.jsonl` is
    moved from `batch_sent` to `batch_loc` when it exists in `batch_sent`.
    Rows whose file is not there are ignored.

    :param batch_loc: Directory of files waiting to be submitted.
    :param batch_sent: Directory of files that have been submitted.
    :param failed_df: DataFrame with a `metadata_description` column.
    """
    for _, failed_row in failed_df.iterrows():
        filename = f"{failed_row['metadata_description']}.jsonl"
        sent_filepath = os.path.join(batch_sent, filename)

        # Nothing to restore when the file is not in the sent directory.
        if not os.path.exists(sent_filepath):
            continue

        shutil.move(sent_filepath, os.path.join(batch_loc, filename))
        print(f"File {filename} moved from {batch_sent} to {batch_loc}")

In [86]:
# Move the request files of failed batches from batch_sent_loc back to
# batch_loc so they can be submitted again.
handle_failed_files(batch_loc, batch_sent_loc, failed_df)

File batch_alansmith_text_3.jsonl moved from /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/ to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/
File batch_alantitchmarsh_text_2.jsonl moved from /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/ to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/
File batch_alantovey_text_1.jsonl moved from /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/ to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/
File batch_alantovey_text_2.jsonl moved from /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_sent/ to /Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_preprocessed/
File batch_alasdairreid_text_3.jsonl moved from /Volumes/BC