In [94]:
import json
import os
import shutil
import glob
import time
import pywhatkit

import pandas as pd

from datetime import datetime
from io import StringIO
from openai import OpenAI

### Set Locations and Create Directories

In [109]:
# Credentials file holding the OpenAI API key
credential_loc = "../../credentials.json"

data_type = "training"
corpus = "StackExchange"

# Root of the datasets; every other path below is derived from it
base_loc = "/Volumes/BCross/datasets/author_verification/"
data_loc = f"{base_loc}{data_type}/{corpus}/"
# Preprocessed batch files waiting to be sent
batch_loc = f"{data_loc}batch_sentence_preprocessed/"
raw_data_loc = f"{data_loc}known_raw.jsonl"

# Location for data when sent to batch
batch_sent_loc = f"{data_loc}batch_sentence_sent/"
os.makedirs(batch_sent_loc, exist_ok=True)

# Location once batch complete
batch_complete_loc = f"{data_loc}batch_sentence_complete/"
os.makedirs(batch_complete_loc, exist_ok=True)

# Location once batch failed
batch_fail_loc = f"{data_loc}batch_sentence_fail/"
os.makedirs(batch_fail_loc, exist_ok=True)

# Location to save the reasons for failure
batch_fail_reason_loc = f"{data_loc}batch_fail_reasons/"
os.makedirs(batch_fail_reason_loc, exist_ok=True)

# Phone number for WhatsApp notifications
phone_number = "+447756976114"

### Check the File Counts to be Processed

In [110]:
def list_folders_in_location(location):
    """Return the names of the immediate sub-directories of ``location``.

    When ``location`` does not exist a notice is printed and an empty list
    is returned.
    """
    if os.path.exists(location):
        folder_names = []
        # Keep only the entries that are directories
        for entry in os.listdir(location):
            if os.path.isdir(os.path.join(location, entry)):
                folder_names.append(entry)
        return folder_names

    print(f"The location '{location}' does not exist.")
    return []

def count_jsonl_files_in_dir(directory):
    """Return how many ``.jsonl`` files sit under ``directory``, recursively.

    A directory that does not exist counts as zero.
    """
    if not os.path.exists(directory):
        return 0

    # Walk the whole tree and tally every file name with the .jsonl suffix
    return sum(
        1
        for _root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.jsonl')
    )

def list_corpus_file_counts(base_path):
    """Pair each corpus folder under ``base_path`` with its ``.jsonl`` file count.

    The count for a corpus covers its ``batch_sentence_preprocessed``
    sub-folder. Returns a list of ``(corpus, count)`` tuples.
    """
    return [
        (
            corpus_name,
            count_jsonl_files_in_dir(
                os.path.join(base_path, corpus_name, "batch_sentence_preprocessed")
            ),
        )
        for corpus_name in list_folders_in_location(base_path)
    ]

In [111]:
# Count the preprocessed .jsonl batch files for every corpus folder of this data type
list_corpus_file_counts(f"{base_loc}{data_type}")

[('StackExchange', 225),
 ('Amazon', 4000),
 ('The Telegraph', 0),
 ('Yelp', 1760),
 ('Wiki', 300),
 ('All-the-news', 1776),
 ('IMDB', 711),
 ('Reddit', 2800),
 ("Koppel's Blogs", 2400),
 ('Perverted Justice', 296),
 ('TripAdvisor', 0),
 ('ACL', 279),
 ('The Apricity', 564),
 ('Enron', 0)]

### Initialise the OpenAI Client

In [112]:
# Load the credentials JSON from credential_loc
with open(credential_loc, 'r') as f:
    data = json.load(f)
    
# Export the API key to the environment before the client is built;
# OpenAI() is constructed with no arguments, so it presumably reads the key
# from OPENAI_API_KEY — confirm against the installed openai package.
os.environ["OPENAI_API_KEY"] = data['OPENAI_API_KEY']

client = OpenAI()

### Helper Functions

In [113]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    The file is decoded as UTF-8. Blank (whitespace-only) lines are skipped
    so that a trailing newline or an empty separator line does not abort the
    read. A line holding a single-element list is unwrapped to that element.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame with one row per parsed, non-blank line.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads would raise on an empty line, so skip those
            if not line.strip():
                continue
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    return pd.DataFrame(records)

In [114]:
def write_jsonl(data, output_file_path):
    """Write the DataFrame ``data`` to ``output_file_path`` as JSON Lines.

    Each row becomes one JSON object on its own line. The file is opened in
    write mode, so existing content at that path is replaced.
    """
    with open(output_file_path, 'w') as handle:
        row_dicts = (record.to_dict() for _index, record in data.iterrows())
        for row_dict in row_dicts:
            json.dump(row_dict, handle)
            handle.write('\n')

In [115]:
def send_whatsapp_notification(phone_number, message):
    """Send ``message`` to ``phone_number`` via ``pywhatkit.sendwhatmsg_instantly``.

    The call passes a positional ``10`` and ``tab_close=True``.
    NOTE(review): the ``10`` is presumably pywhatkit's wait time in seconds —
    confirm against the installed pywhatkit version.
    """
    pywhatkit.sendwhatmsg_instantly(phone_number, message, 10, tab_close=True)

In [116]:
def send_whatsapp_group_notification(group_name, message):
    """Send ``message`` to a group via ``pywhatkit.sendwhatmsg_to_group_instantly``.

    ``group_name`` is forwarded as the first positional argument.
    NOTE(review): pywhatkit may expect a group id rather than a display name
    here, and the positional ``10`` is presumably a wait time in seconds —
    confirm both against the installed pywhatkit version.
    """
    pywhatkit.sendwhatmsg_to_group_instantly(group_name, message, 10, tab_close=True)

### File Number Checks

In [117]:
def _list_jsonl_files(directory):
    """Return the names of the .jsonl files sitting directly inside ``directory``."""
    return [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name)) and name.endswith('.jsonl')
    ]


raw_df = read_jsonl(raw_data_loc)
print(f"Number of documents to process in raw data: {len(raw_df['doc_id'])}")

# Batch files still waiting to be sent, already completed, and failed
files_to_be_processed = _list_jsonl_files(batch_loc)
files_processed = _list_jsonl_files(batch_complete_loc)
files_failed = _list_jsonl_files(batch_fail_loc)

print(f"Files to be processed in Batch location: {len(files_to_be_processed)}")
print(f"Files Complete in Batch location: {len(files_processed)}")
print(f"Files Failed in Batch location: {len(files_failed)}")
# The total covers waiting + completed files only; failed files are reported separately above
print(f"Total Files in Batch Location: {len(files_processed) + len(files_to_be_processed)}")

Number of documents to process in raw data: 75
Files to be processed in Batch location: 225
Files Complete in Batch location: 0
Files Failed in Batch location: 0
Total Files in Bath Location: 225


In [118]:
# Display the raw documents loaded from raw_data_loc
raw_df

Unnamed: 0,doc_id,text,corpus,author,texttype
0,known [10125604 stats] [ 5.48 kb].txt,If two of the independent variables are strong...,StackExchange,10125604,known
1,known [1017882 stats] [ 19.86 kb].txt,Suppose you and I are coaching track teams. Ou...,StackExchange,1017882,known
2,known [1024124 stats] [ 5.29 kb].txt,"To gain some more context, also read a statist...",StackExchange,1024124,known
3,known [1024210 stats] [ 4.32 kb].txt,"One way to do this, would be to remove the edg...",StackExchange,1024210,known
4,known [10248388 stats] [ 16.61 kb].txt,Saying that there is a conditional multimodal ...,StackExchange,10248388,known
...,...,...,...,...,...
70,known [2583847 stats] [ 19.68 kb].txt,The easiest way is to use the log link functio...,StackExchange,2583847,known
71,known [266065 stats] [ 11.67 kb].txt,I cannot give you an answer for the general ca...,StackExchange,266065,known
72,known [2663485 stats] [ 5.99 kb].txt,"Sinc we are discussing ROC curves, let me assu...",StackExchange,2663485,known
73,known [2710237 stats] [ 6.44 kb].txt,A couple of other packages to check out might ...,StackExchange,2710237,known


In [42]:
def create_batch_request(file_path, description, client, batch_sent_loc, return_file_id=True):
    """Upload a batch input file, start a batch job for it, and move the file.

    Args:
        file_path (str): Path of the batch input file to upload.
        description (str): Stored in the batch metadata under "description";
            also used (plus ".jsonl") as the name of the moved file.
        client (object): API client exposing ``files.create`` and ``batches.create``.
        batch_sent_loc (str): Directory the input file is moved into once the
            batch job has been created.
        return_file_id (bool): When truthy, the uploaded file's id is returned.

    Returns:
        The id of the uploaded input file when ``return_file_id`` is truthy,
        otherwise ``None``.
    """
    # Upload the input file for batch use
    with open(file_path, "rb") as upload_handle:
        uploaded = client.files.create(file=upload_handle, purpose="batch")
    input_file_id = uploaded.id

    # Start the batch job on the uploaded file, tagged with the description
    job_metadata = {"description": description}
    client.batches.create(
        input_file_id=input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata,
    )

    # Move the input file into the sent location, named after the description
    sent_path = os.path.join(batch_sent_loc, f"{description}.jsonl")
    shutil.move(file_path, sent_path)

    return input_file_id if return_file_id else None

In [43]:
def fetch_all_batch_info(client):
    """
    Collect every batch the client can list, one page at a time.

    :param client: API client instance exposing ``batches.list``.
    :return: List of batch objects in the order the API returned them.
    """
    collected = []
    cursor = None

    while True:
        # Request the next page of up to 100 batches after the current cursor
        page = client.batches.list(limit=100, after=cursor).data
        if not page:
            return collected

        collected.extend(page)

        # The last id on this page becomes the next cursor; if the cursor did
        # not advance, stop rather than request the same page again.
        next_cursor = page[-1].id
        if next_cursor == cursor:
            return collected
        cursor = next_cursor

In [44]:
def send_batch_request_and_wait(file_path, description, client, batch_sent_loc, poll_interval=30):
    """
    Sends a batch request, then polls its status until it reaches a final state.

    Args:
        file_path (str): Path to the file to be processed.
        description (str): Description of the batch request.
        client (object): Client instance to interact with the API.
        batch_sent_loc (str): Directory the input file is moved to once sent.
        poll_interval (int | float): Seconds to sleep between status checks.

    Returns:
        dict: ``{"status": ..., "output_file_id": ...}``. ``status`` is the
        final batch status ('completed', 'failed', 'expired' or 'cancelled'),
        or 'not_found' with ``output_file_id`` set to ``None`` when no batch
        matches the uploaded input file.
    """
    # Statuses after which the batch will not change any more. Polling only
    # for 'completed'/'failed' would loop forever on an expired or cancelled batch.
    terminal_statuses = ('completed', 'failed', 'expired', 'cancelled')

    # Send the batch request and store the file ID
    batch_file_id = create_batch_request(
        file_path, description, client, batch_sent_loc, return_file_id=True
    )

    wait_time = 0  # Total seconds slept so far, for the progress message

    while True:
        # Fetch all batch info and pick the first batch created from our input file
        batches = fetch_all_batch_info(client)
        matching_batch = next(
            (batch for batch in batches if batch.input_file_id == batch_file_id),
            None,
        )

        if matching_batch is None:
            print(f"    No matching batch found for file: {description}")
            # Return the same shape as the normal path so callers can always
            # read "status" instead of crashing on a None result
            return {"status": "not_found", "output_file_id": None}

        current_status = matching_batch.status

        print(f"    Current status: {current_status}. Wait time: {wait_time} seconds")

        if current_status in terminal_statuses:
            return {
                "status": current_status,
                "output_file_id": matching_batch.output_file_id
            }

        # Wait before checking again
        time.sleep(poll_interval)
        wait_time += poll_interval

In [45]:
def process_file(file_path, description, client, batch_sent_loc, batch_complete_loc, batch_fail_loc):
    """Process a single batch file from start to finish.

    Sends the file as a batch request, waits for the final status, then:
    - 'failed': moves the sent file into ``batch_fail_loc``;
    - 'completed': downloads the output, keeps the ``id``, ``custom_id`` and
      extracted ``response`` columns, writes them as JSON Lines into
      ``batch_complete_loc`` and deletes the sent file;
    - anything else (including a missing result or output file): prints a
      note and leaves the sent file where it is.

    Args:
        file_path (str): Path of the preprocessed batch file to send.
        description (str): Batch description; also the base name of the
            sent/completed/failed files.
        client (object): API client instance.
        batch_sent_loc (str): Directory holding files that have been sent.
        batch_complete_loc (str): Directory for completed outputs.
        batch_fail_loc (str): Directory for files whose batch failed.

    Returns:
        None
    """

    def extract_content(response):
        """Return the first choice's message content, or None if unavailable."""
        try:
            return response['body']['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # IndexError covers a response whose 'choices' list is empty
            return None

    # Where the file lives after it has been sent
    sent_filepath = os.path.join(batch_sent_loc, f"{description}.jsonl")

    # The output file id and status of the batch request once it has finished
    batch_output = send_batch_request_and_wait(file_path, description, client, batch_sent_loc)

    # The wait helper may come back empty-handed when no batch could be found
    if not batch_output:
        print("Check status further")
        return

    status = batch_output['status']
    output_file_id = batch_output['output_file_id']

    if status == 'failed':
        print(f"    File {description} Failed")

        # Create the filepath to move the file to failed location
        output_filepath = os.path.join(batch_fail_loc, f"{description}.jsonl")

        # Check if the file exists in batch_sent
        if os.path.exists(sent_filepath):
            shutil.move(sent_filepath, output_filepath)
            # Report the actual destination (the failed location)
            print(f"File {description} moved from {batch_sent_loc} to {batch_fail_loc}")

    elif status == 'completed':

        print(f"    File {description} Completed")

        # Without an output file id there is nothing to download
        if output_file_id is None:
            print(f"    File {description} has no output file. Check status further")
            return

        # Get the output file and load it as one record per line
        file_response = client.files.content(output_file_id)
        jsonl_io = StringIO(file_response.text)
        df = pd.read_json(jsonl_io, lines=True)

        # Apply the function to extract the 'content' from the 'response' column
        df['response'] = df['response'].apply(extract_content)

        # Select only the required columns
        df = df[['id', 'custom_id', 'response']]

        # Save the DataFrame as a jsonl file in batch_complete_loc
        output_filepath = os.path.join(batch_complete_loc, f"{description}.jsonl")
        df.to_json(output_filepath, orient='records', lines=True)

        # Remove the old file now that the output has been saved
        if os.path.exists(sent_filepath):
            os.remove(sent_filepath)
            print(f"    File {description} moved to {batch_complete_loc}")

    else:
        print("Check status further")

In [46]:
def process_all_files(client, batch_loc, batch_sent_loc, batch_complete_loc,
                      batch_fail_loc, send_whatsapp=False, phone_number="+447756976114"):
    """Run every ``.jsonl`` file found directly in ``batch_loc`` through ``process_file``.

    Files are handled one after another. When ``send_whatsapp`` is true, a
    WhatsApp message is sent to ``phone_number`` after each file and once
    more when all files are done.
    """
    # Collect the batch files waiting in the batch location
    pending_files = []
    for name in os.listdir(batch_loc):
        if name.endswith('.jsonl') and os.path.isfile(os.path.join(batch_loc, name)):
            pending_files.append(name)

    total_files = len(pending_files)
    run_started = time.time()

    for position, file_name in enumerate(pending_files, start=1):
        file_started = time.time()
        description = os.path.splitext(file_name)[0]
        print(f"Processing File {position} of {total_files}: {description}")

        process_file(
            os.path.join(batch_loc, file_name),
            description,
            client,
            batch_sent_loc,
            batch_complete_loc,
            batch_fail_loc,
        )

        # Report how long this file took
        elapsed_seconds = time.time() - file_started
        message = f"File {position} of {total_files} processed - {description} - Time taken {int(elapsed_seconds)}s"
        if send_whatsapp:
            send_whatsapp_notification(phone_number, message)

    # Report the overall duration
    total_seconds = time.time() - run_started
    final_message = f"All files processed, time taken {int(total_seconds)}s"
    if send_whatsapp:
        send_whatsapp_notification(phone_number, final_message)


In [47]:
def process_all_failed_batches(client, batch_fail_loc, save_location):
    """Record why the batches behind the failed files failed.

    For every ``.jsonl`` file in ``batch_fail_loc``, finds the batches whose
    metadata description equals the file name (without extension) and
    collects their ids, timestamps and first error. The collected records are
    written as JSON Lines to a timestamped file in ``save_location``.
    Nothing is written when there are no failed files or no records.

    Args:
        client (object): API client instance used to list batches.
        batch_fail_loc (str): Directory holding the failed batch files.
        save_location (str): Directory the failure report is written to.

    Returns:
        None
    """
    # Step 1: Get the list of failed files
    failed_files = [
        f for f in os.listdir(batch_fail_loc)
        if os.path.isfile(os.path.join(batch_fail_loc, f)) and f.endswith('.jsonl')
    ]

    if not failed_files:
        print("No failed files found in the specified location.")
        # Nothing to look up, so skip listing every batch through the API
        return

    # Step 2: Fetch all batch file information from the client
    batch_file_info = fetch_all_batch_info(client)

    # Step 3: Process each failed file
    results = []
    for failed_file in failed_files:
        # Strip the extension the same way the description was derived when sending
        file_name_without_ext = os.path.splitext(failed_file)[0]

        for batch in batch_file_info:
            # Batches may carry no metadata, or metadata without a description
            metadata = batch.metadata or {}
            if metadata.get('description') != file_name_without_ext:
                continue

            try:
                results.append({
                    'batch_id': batch.id,
                    'input_file_id': batch.input_file_id,
                    'file_name': file_name_without_ext,
                    'created_at': datetime.fromtimestamp(batch.created_at),
                    'failed_at': datetime.fromtimestamp(batch.failed_at),
                    'error_code': batch.errors.data[0].code,
                    'error_message': batch.errors.data[0].message
                })
            except (KeyError, IndexError, ValueError, TypeError, AttributeError) as e:
                # TypeError/AttributeError cover a matching batch that has no
                # failed_at timestamp or no errors attached
                print(f"Error processing batch for file {file_name_without_ext}: {e}")

    # Step 4: Save the results to the specified save location
    if results:
        # NOTE(review): the content is JSON Lines but the file keeps the
        # original ".json" extension so existing readers still find it.
        save_path = os.path.join(save_location, f"processed_failed_batches_{datetime.now().strftime('%Y%m%d%H%M%S')}.json")

        # Timestamps are turned into ISO-style strings so they serialise as JSON
        df = pd.DataFrame(results)
        df['created_at'] = df['created_at'].dt.strftime('%Y-%m-%dT%H:%M:%S')
        df['failed_at'] = df['failed_at'].dt.strftime('%Y-%m-%dT%H:%M:%S')
        write_jsonl(df, save_path)

In [49]:
# Send every preprocessed batch file in turn and wait for each to finish
# (WhatsApp notifications stay off: send_whatsapp defaults to False)
process_all_files(client, batch_loc, batch_sent_loc, batch_complete_loc, batch_fail_loc)

Processing File 1 of 22: batch_jeff_skilling_mail_5
    Current status: validating. Wait time: 0 seconds
    Current status: in_progress. Wait time: 30 seconds
    Current status: in_progress. Wait time: 60 seconds
    Current status: in_progress. Wait time: 90 seconds
    Current status: in_progress. Wait time: 120 seconds
    Current status: in_progress. Wait time: 150 seconds
    Current status: in_progress. Wait time: 180 seconds
    Current status: in_progress. Wait time: 210 seconds
    Current status: in_progress. Wait time: 240 seconds
    Current status: in_progress. Wait time: 270 seconds
    Current status: in_progress. Wait time: 300 seconds
    Current status: in_progress. Wait time: 330 seconds
    Current status: in_progress. Wait time: 360 seconds
    Current status: in_progress. Wait time: 390 seconds
    Current status: in_progress. Wait time: 420 seconds
    Current status: in_progress. Wait time: 450 seconds
    Current status: in_progress. Wait time: 480 seconds
  

In [50]:
# Look up the failure details for every file in the failed location and save them
process_all_failed_batches(client, batch_fail_loc, batch_fail_reason_loc)