## Batch calls  

In [7]:
import os
import json
from datetime import datetime
from openai import OpenAI
import pandas as pd
import numpy as np
from prompt_template_improved import get_template


def prepare_call_for_single_line(context, quasisentence, template):
    """Assemble the two chat messages used to code one quasi-sentence.

    Args:
        context: Surrounding text, embedded between ``<s>``/``</s>`` markers
            as reference material.
        quasisentence: The sentence the model is asked to code.
        template: Identifier passed to ``get_template`` to obtain the system
            message.

    Returns:
        A ``(system_message, user_message)`` tuple. The user message is the
        fixed instruction line, then the context section, then the coding
        request.
    """
    instruction_line = "Use the provided category scheme to classify the sentence. Provide only the code and nothing else."
    context_section = f"\n## Context (for reference, do not code this):<s>{context}</s>\n"
    coding_request = f"Code the following: {quasisentence}"

    user_message = "".join((instruction_line, context_section, coding_request))
    system_message = get_template(template)

    return system_message, user_message

def create_batch_file(df, instruction, party_name, DAYTIME):
    """Write one Batch API request per dataframe row to a JSONL file.

    Args:
        df: Dataframe providing the 'text_en' and 'context' columns.
        instruction: Template identifier; forwarded to
            ``prepare_call_for_single_line`` and used in the file name.
        party_name: Prefix for every ``custom_id`` and part of the file name.
        DAYTIME: Timestamp string appended to the file name.

    Returns:
        Path of the JSONL file written under ``batches/``.
    """
    def build_request(row_label, row):
        # One request object in the layout expected by the batch endpoint.
        sentence = row['text_en']
        surrounding = row['context']
        system_prompt, user_prompt = prepare_call_for_single_line(
            surrounding, sentence, template=instruction
        )
        chat_body = {
            "model": "gpt-4.1-mini-2025-04-14",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0,
        }
        return {
            # The row label is embedded so results can be mapped back later.
            "custom_id": f"{party_name}_row_{row_label}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": chat_body,
        }

    # Build every request before touching the file system.
    payloads = [build_request(label, row) for label, row in df.iterrows()]

    target_path = f"batches/batch_{party_name}_{instruction}_{DAYTIME}.jsonl"
    os.makedirs("batches", exist_ok=True)

    with open(target_path, 'w') as handle:
        for payload in payloads:
            handle.write(json.dumps(payload) + '\n')

    return target_path

In [24]:
def submit_batch(batch_file_path, instruction, party_name):
    """Upload a JSONL request file and start a batch job for it.

    Args:
        batch_file_path: Path of the JSONL file to upload.
        instruction: Stored in the batch metadata under 'instruction'.
        party_name: Stored in the batch metadata under 'party'.

    Returns:
        The id of the created batch.
    """
    api = OpenAI()

    # The upload needs the file opened in binary mode.
    with open(batch_file_path, "rb") as payload:
        uploaded = api.files.create(file=payload, purpose="batch")

    job_tags = {"instruction": instruction, "party": party_name}
    job = api.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_tags,
    )

    job_id = job.id
    print(f"Batch created: {job_id}")
    return job_id

In [25]:
def load_batch_results(batch_id, df, results_file_path):
    """Check batch status and load results into ``df`` when complete.

    The batch is retrieved first; unless its status is "completed" nothing is
    downloaded. Otherwise every non-blank line of the output file is parsed,
    the row label is recovered from its ``custom_id`` and the model reply,
    split on newlines, is stored in the 'annotations' column. Requests without
    an HTTP 200 response, or whose reply has no text content, are stored as
    the string "Error". The dataframe is then written to ``results_file_path``.

    Args:
        batch_id: Id of the batch to load.
        df: Dataframe whose 'annotations' column is filled in place; its index
            labels must match the ones used when the batch file was created.
        results_file_path: CSV path the updated dataframe is written to.

    Returns:
        True if results were loaded and saved, False if the batch is not
        completed yet.
    """
    client = OpenAI()

    # Check batch status
    batch = client.batches.retrieve(batch_id)
    print(f"Batch status: {batch.status}")

    if batch.status != "completed":
        print(f"Batch not ready. Status: {batch.status}")
        return False

    # Download results
    file_response = client.files.content(batch.output_file_id)

    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028 that may legitimately appear inside a JSON string.
    for line in file_response.text.split('\n'):
        if not line.strip():
            # An empty output file or a trailing newline yields blank lines.
            continue

        result = json.loads(line)

        # custom_id format: "<party>_row_<label>". Split from the right so a
        # party name that itself contains "_row_" cannot shift the label.
        row_index = int(result['custom_id'].rsplit('_row_', 1)[1])

        # 'response' may be null for requests that never produced a response.
        response = result.get('response') or {}
        content = None
        if response.get('status_code') == 200:
            content = response['body']['choices'][0]['message']['content']

        if isinstance(content, str):
            df.at[row_index, 'annotations'] = content.split('\n')
        else:
            # Non-200 status, missing response, or a reply without text.
            df.at[row_index, 'annotations'] = "Error"

    # Save results
    df.to_csv(results_file_path, index=False)
    print(f"Results saved to {results_file_path}")
    return True

In [26]:
# Submission cell: for every instruction template and every party CSV, build a
# JSONL batch file, submit it, and finally write a JSON manifest of the
# submitted batch ids.

# Timestamp shared by all file and directory names created in this run.
DAYTIME = datetime.now().strftime("%Y%m%d_%H%M%S")
# Module-level client; submit_batch() creates its own client, so the loop
# below does not use this one.
client = OpenAI()

instruction_templates = ['llm_optimized_1',
                         # 'cot',
                         #   'cot-1'
                            ]
# One context-aware input CSV per party.
paths = ['data/contextaware_data_20250927_112401/afd_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/bsw_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/cdu_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/fdp_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/gruene_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/linke_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/spd_context_20250927_112401.csv',
'data/contextaware_data_20250927_112401/sswb_context_20250927_112401.csv']

batch_jobs = {}  # Track all batch jobs

for instruction in instruction_templates:
    # Output directory for this instruction/run combination.
    basepath = f"data/annotated_data/run_{instruction}_{DAYTIME}/"
    os.makedirs(basepath, exist_ok=True)
    
    for path in paths:
        # File name without extension, e.g. "afd_context_20250927_112401".
        party_name = os.path.splitext(os.path.basename(path))[0]
        results_file_path = f"{basepath}{party_name}_annotated_{DAYTIME}.csv"
        
        # Load data
        df = pd.read_csv(path)
        # Start with an all-NaN 'annotations' column, then cast it to object
        # dtype (presumably so cells can later hold lists or strings).
        df['annotations'] = len(df) * [np.nan]
        df['annotations'] = df['annotations'].astype('object')
        
        # Create batch file
        batch_file = create_batch_file(df, instruction, party_name, DAYTIME)
        
        # Submit batch
        batch_id = submit_batch(batch_file, instruction, party_name)
        
        # Store for later retrieval
        batch_jobs[batch_id] = {
            'df': df,
            'results_path': results_file_path,
            'party': party_name,
            'instruction': instruction
        }

# Save batch job info for later retrieval
# Only the string fields are written; the 'df' entry is left out of the
# manifest.
with open(f'batch_jobs_{DAYTIME}.json', 'w') as f:
    json.dump({k: {'results_path': v['results_path'], 
                   'party': v['party'], 
                   'instruction': v['instruction']} 
               for k, v in batch_jobs.items()}, f)

print("\nAll batches submitted! Use the retrieval script to check status and download results.")

Batch created: batch_68dbce3ed84881909c9ed03663523e79
Batch created: batch_68dbce4088a08190b35c7ae2884916e1
Batch created: batch_68dbce43fde88190905beabd1686b8ed
Batch created: batch_68dbce469fd881909e67fea139a5f01d
Batch created: batch_68dbce54f8a08190ac88a6c40aba9cf0
Batch created: batch_68dbce599b4081908daf35e778368d67
Batch created: batch_68dbce5ee9bc8190977449e043896a4f
Batch created: batch_68dbce64258c8190940d7e914c01c538

All batches submitted! Use the retrieval script to check status and download results.


In [37]:
# Status check: request a listing of at most 20 batches and print each one's
# id, status and attached metadata.
batches = client.batches.list(limit=20)

for batch in batches.data:
    print(f"{batch.id}: {batch.status} - {batch.metadata}")

batch_68dbce64258c8190940d7e914c01c538: completed - {'instruction': 'llm_optimized_1', 'party': 'sswb_context_20250927_112401'}
batch_68dbce5ee9bc8190977449e043896a4f: completed - {'instruction': 'llm_optimized_1', 'party': 'spd_context_20250927_112401'}
batch_68dbce599b4081908daf35e778368d67: completed - {'instruction': 'llm_optimized_1', 'party': 'linke_context_20250927_112401'}
batch_68dbce54f8a08190ac88a6c40aba9cf0: completed - {'instruction': 'llm_optimized_1', 'party': 'gruene_context_20250927_112401'}
batch_68dbce469fd881909e67fea139a5f01d: completed - {'instruction': 'llm_optimized_1', 'party': 'fdp_context_20250927_112401'}
batch_68dbce43fde88190905beabd1686b8ed: completed - {'instruction': 'llm_optimized_1', 'party': 'cdu_context_20250927_112401'}
batch_68dbce4088a08190b35c7ae2884916e1: completed - {'instruction': 'llm_optimized_1', 'party': 'bsw_context_20250927_112401'}
batch_68dbce3ed84881909c9ed03663523e79: completed - {'instruction': 'llm_optimized_1', 'party': 'afd_cont

In [39]:
import json
import pandas as pd
from openai import OpenAI
import os

def retrieve_all_batches(batch_jobs_info):
    """
    Retrieve and process all completed batches.

    For each batch id the batch is looked up; batches that are not
    "completed" are skipped. The output file of a completed batch is parsed
    line by line, the row index is recovered from each ``custom_id`` and the
    reply (or an "ERROR: ..." string) is collected. The annotations are then
    merged into ``data/<party>.csv`` and written to
    ``data/annotated_data/run_<instruction>_batch/<party>_annotated.csv``.
    Any exception raised while handling one batch is reported and the loop
    moves on to the next batch.

    Args:
        batch_jobs_info: Dictionary with batch_id as keys and metadata as
            values; each metadata dict must provide 'party' and 'instruction'.
    """
    import re  # local import: only needed to strip the dataset suffix below

    client = OpenAI()

    for batch_id, metadata in batch_jobs_info.items():
        print(f"\nProcessing batch: {batch_id}")
        print(f"  Party: {metadata['party']}")
        print(f"  Instruction: {metadata['instruction']}")

        try:
            # Get batch status
            batch = client.batches.retrieve(batch_id)
            print(f"  Status: {batch.status}")

            if batch.status != "completed":
                print(f"  ⏳ Skipping - not completed yet")
                continue

            if not batch.output_file_id:
                # Nothing to download; say so instead of failing on a None id.
                print(f"  ⚠️  Batch has no output file")
                continue

            # Download results
            print(f"  Downloading results...")
            file_response = client.files.content(batch.output_file_id)

            # Parse results
            results_dict = {}
            # Split on '\n' only: str.splitlines() would also break on
            # characters such as U+2028 that can occur inside a JSON string.
            for line in file_response.text.split('\n'):
                if not line.strip():
                    # Empty file or trailing newline.
                    continue

                result = json.loads(line)

                # custom_id format: "<party>_row_<index>". Split from the
                # right so "_row_" inside the party name cannot interfere.
                row_index = int(result['custom_id'].rsplit('_row_', 1)[1])

                # 'response' may be null when the request produced no response.
                response = result.get('response') or {}

                if response.get('status_code') == 200:
                    content = response['body']['choices'][0]['message']['content']
                    results_dict[row_index] = content
                else:
                    # The top-level 'error' key may be present but null, so a
                    # .get() default would never apply. Fall back to the error
                    # in the response body, then to a generic message.
                    body = response.get('body')
                    body_error = body.get('error') if isinstance(body, dict) else None
                    error = result.get('error') or body_error or 'Unknown error'
                    results_dict[row_index] = f"ERROR: {error}"
                    print(f"  ⚠️  Error in row {row_index}: {error}")

            print(f"  ✓ Retrieved {len(results_dict)} results")

            # Strip the "_context_<YYYYMMDD>_<HHMMSS>" dataset suffix for any
            # export timestamp, not just one hard-coded run.
            party_name = re.sub(r'_context_\d{8}_\d{6}$', '', metadata['party'])
            instruction = metadata['instruction']

            # Load original CSV
            original_path = f"data/{party_name}.csv"
            if not os.path.exists(original_path):
                print(f"  ⚠️  Original file not found: {original_path}")
                continue

            df = pd.read_csv(original_path)

            # Add annotations column if it doesn't exist, and always make it
            # object dtype so string annotations can be stored even when an
            # existing column was read back as all-NaN floats.
            if 'annotations' not in df.columns:
                df['annotations'] = pd.NA
            df['annotations'] = df['annotations'].astype('object')

            # Fill in the annotations
            unmatched = 0
            for row_idx, annotation in results_dict.items():
                if 0 <= row_idx < len(df):
                    df.at[row_idx, 'annotations'] = annotation
                else:
                    unmatched += 1
            if unmatched:
                # Report dropped results instead of losing them silently.
                print(f"  ⚠️  {unmatched} results had no matching row in {original_path}")

            # Save results
            output_dir = f"data/annotated_data/run_{instruction}_batch/"
            os.makedirs(output_dir, exist_ok=True)
            output_path = f"{output_dir}{party_name}_annotated.csv"
            df.to_csv(output_path, index=False)

            print(f"  ✓ Saved to: {output_path}")

        except Exception as e:
            # Per-batch boundary: report the failure and keep going so one bad
            # batch does not block the remaining ones.
            print(f"  ❌ Error processing batch {batch_id}: {e}")
            continue

# Usage with your completed batches
# Mapping of batch id -> metadata ('instruction' and 'party') for the batches
# to retrieve; this rebinds the module-level name `batch_jobs`.
batch_jobs = {
    'batch_68dbce64258c8190940d7e914c01c538': {'instruction': 'llm_optimized_1', 'party': 'sswb_context_20250927_112401'},
    'batch_68dbce5ee9bc8190977449e043896a4f': {'instruction': 'llm_optimized_1', 'party': 'spd_context_20250927_112401'},
    'batch_68dbce599b4081908daf35e778368d67': {'instruction': 'llm_optimized_1', 'party': 'linke_context_20250927_112401'},
    'batch_68dbce54f8a08190ac88a6c40aba9cf0': {'instruction': 'llm_optimized_1', 'party': 'gruene_context_20250927_112401'},
    'batch_68dbce469fd881909e67fea139a5f01d': {'instruction': 'llm_optimized_1', 'party': 'fdp_context_20250927_112401'},
    'batch_68dbce43fde88190905beabd1686b8ed': {'instruction': 'llm_optimized_1', 'party': 'cdu_context_20250927_112401'},
    'batch_68dbce4088a08190b35c7ae2884916e1': {'instruction': 'llm_optimized_1', 'party': 'bsw_context_20250927_112401'},
    'batch_68dbce3ed84881909c9ed03663523e79': {'instruction': 'llm_optimized_1', 'party': 'afd_context_20250927_112401'},
}

# Run the retrieval
retrieve_all_batches(batch_jobs)


Processing batch: batch_68dbce64258c8190940d7e914c01c538
  Party: sswb_context_20250927_112401
  Instruction: llm_optimized_1
  Status: completed
  Downloading results...
  ✓ Retrieved 2336 results
  ✓ Saved to: data/annotated_data/run_llm_optimized_1_batch/sswb_annotated.csv

Processing batch: batch_68dbce5ee9bc8190977449e043896a4f
  Party: spd_context_20250927_112401
  Instruction: llm_optimized_1
  Status: completed
  Downloading results...
  ✓ Retrieved 2140 results
  ✓ Saved to: data/annotated_data/run_llm_optimized_1_batch/spd_annotated.csv

Processing batch: batch_68dbce599b4081908daf35e778368d67
  Party: linke_context_20250927_112401
  Instruction: llm_optimized_1
  Status: completed
  Downloading results...
  ✓ Retrieved 2449 results
  ✓ Saved to: data/annotated_data/run_llm_optimized_1_batch/linke_annotated.csv

Processing batch: batch_68dbce54f8a08190ac88a6c40aba9cf0
  Party: gruene_context_20250927_112401
  Instruction: llm_optimized_1
  Status: completed
  Downloading res

In [41]:
def flatten_annotations(df):
    """
    Add flattened JSON columns to existing dataframe.

    Each value of the 'annotations' column is parsed as JSON once. The
    'quasi_sentence', 'reasoning' and 'category' fields of the parsed object
    are written to columns of the same name. Rows whose annotation is missing,
    is not valid JSON, or is not a JSON object get None in all three columns.

    Args:
        df: Dataframe with an 'annotations' column; modified in place.

    Returns:
        The same dataframe, with the three columns added or overwritten.
    """
    def parse_annotation(annotation_json):
        # Only swallow the errors json.loads raises for bad input: TypeError
        # for non-string values such as NaN or None, ValueError for malformed
        # JSON. KeyboardInterrupt and real bugs still propagate.
        try:
            data = json.loads(annotation_json)
        except (TypeError, ValueError):
            return {}
        # Valid JSON that is not an object (e.g. a bare number) has no fields.
        return data if isinstance(data, dict) else {}

    # Parse every annotation once and reuse the result for all three fields.
    parsed = df['annotations'].apply(parse_annotation)

    # Add new columns
    for field in ('quasi_sentence', 'reasoning', 'category'):
        df[field] = parsed.apply(lambda record, field=field: record.get(field, None))

    return df

import glob

# Flattening cell: read every annotated CSV from input_dir, add the flattened
# annotation columns, and write a file with the same name into output_dir.
input_dir = "data/annotated_data/run_llm_optimized_1_batch/"
output_dir = "data/annotated_data/run_llm_optimized_1_batch_flattened/"
os.makedirs(output_dir, exist_ok=True)

# Process all CSV files
for file_path in glob.glob(f"{input_dir}*.csv"):
    filename = os.path.basename(file_path)
    print(f"Processing {filename}...")
    
    df = pd.read_csv(file_path)
    # flatten_annotations adds the columns to df and returns it.
    df_flattened = flatten_annotations(df)
    
    # Keep the original file name; only the directory changes.
    output_path = f"{output_dir}{filename}"
    df_flattened.to_csv(output_path, index=False)
    print(f"  ✓ Saved to {output_path}")

print("\nDone! All files flattened.")

Processing gruene_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/gruene_annotated.csv
Processing linke_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/linke_annotated.csv
Processing cdu_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/cdu_annotated.csv
Processing bsw_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/bsw_annotated.csv
Processing spd_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/spd_annotated.csv
Processing fdp_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/fdp_annotated.csv
Processing sswb_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/sswb_annotated.csv
Processing afd_annotated.csv...
  ✓ Saved to data/annotated_data/run_llm_optimized_1_batch_flattened/afd_annotated.csv

Done! All files flattened.
