# Improved Parallel De-identification with Ollama (Part 2)

This notebook is a continuation of `improved_parallel_deid.ipynb`. It contains the remaining implementation of the main processing function and execution code.

In [None]:
# Import the functions from the first notebook
%run improved_parallel_deid.ipynb

## Complete Main Processing Function

The `process_large_csv_complete` function below is the full implementation of the `process_large_csv` function started in the first notebook.

In [None]:
def _deidentify_rows_threaded(row_data, total_in_batch):
    """Run process_row_parallel over row_data in a thread pool; return {row_index: text}."""
    processed_results = {}
    completed_count = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks up front; results are keyed by row index so the
        # completion order does not matter.
        futures = [executor.submit(process_row_parallel, data) for data in row_data]

        for future in as_completed(futures):
            original_index, processed_text = future.result()
            processed_results[original_index] = processed_text
            completed_count += 1
            print(f"\r    - Row {completed_count}/{total_in_batch}", end="")

    return processed_results


def _write_batch_atomically(batch_df, output_path):
    """Write batch_df to output_path via a temporary file and an atomic rename."""
    tmp_path = f"{output_path}.tmp"
    try:
        batch_df.to_csv(tmp_path, index=False)
        # The final name only ever appears once the file is complete, so the
        # "output exists -> skip batch" resume check never sees a partial file.
        os.replace(tmp_path, output_path)
    except Exception:
        # Best-effort cleanup of the partial temp file; the original error is
        # what the caller needs to see, so a failed cleanup is ignored.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        raise


def process_large_csv_complete(input_path, output_prefix, columns_to_clean):
    """
    De-identify the given columns of a potentially large CSV, batch by batch.

    The file is streamed in chunks of MAX_ROWS_PER_BATCH rows. Each chunk is
    processed with the approach selected by IMPLEMENTATION_APPROACH ("queue",
    "process_pool", or a thread pool for any other value) and saved next to
    the input file as '<output_prefix>_part_<N>.csv'. A batch whose output
    file already exists is skipped, so an interrupted run can be resumed.

    Args:
        input_path: Path of the CSV file to read.
        output_prefix: File-name prefix for the per-batch output files.
        columns_to_clean: Names of the columns to de-identify. Columns missing
            from a batch are reported and skipped.

    Returns:
        None. Progress and a final summary are printed.

    Notes:
        - The total row count is obtained by counting physical lines, so it
          (and the displayed batch count) is only an estimate when quoted
          fields contain embedded newlines.
        - Each batch is written to a temporary file and renamed into place, so
          an interrupted write never leaves a truncated file that a later run
          would mistake for a finished batch.
        - An exception raised while processing a row propagates to the caller;
          the current batch is then not saved.
    """
    if not os.path.exists(input_path) or input_path == "/path/to/your/file.csv":
        print("ERROR: Input file not found or path not set.")
        print("Please update the 'INPUT_CSV_PATH' variable in the Configuration section.")
        return

    # Output files are saved in the same directory as the input file.
    output_dir = os.path.dirname(os.path.abspath(input_path))

    try:
        print(f"Reading and preparing CSV from '{input_path}'...")
        with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
            # -1 for the header; clamp so an empty file does not report -1 rows.
            total_rows = max(sum(1 for _ in f) - 1, 0)
        num_batches = math.ceil(total_rows / MAX_ROWS_PER_BATCH)
        df_iterator = pd.read_csv(input_path, chunksize=MAX_ROWS_PER_BATCH, on_bad_lines='warn')
    except Exception as e:
        print(f"Error reading input file: {e}")
        return

    print(f"Total rows: {total_rows}. This will be processed in {num_batches} batch(es).")
    print(f"Output files will be saved in: {output_dir}")
    print(f"Each text will undergo {DEIDENTIFICATION_PASSES} pass(es) through the LLM.")
    print(f"Using implementation approach: {IMPLEMENTATION_APPROACH}")
    print(f"Maximum concurrent requests to Ollama: {MAX_CONCURRENT_REQUESTS}")

    # Track batch outcomes for the summary.
    processed_batches = []
    skipped_batches = []
    failed_batches = []

    # The chunked reader holds the input file open; the context manager closes
    # it even if processing raises part-way through.
    with df_iterator:
        for batch_num, batch_df in enumerate(df_iterator, start=1):
            output_path = os.path.join(output_dir, f"{output_prefix}_part_{batch_num}.csv")

            if os.path.exists(output_path):
                print(f"\nOutput file '{output_path}' already exists. Skipping Batch {batch_num}.")
                skipped_batches.append(batch_num)
                continue

            print(f"\n--- Processing Batch {batch_num}/{num_batches} ---")

            for column_name in columns_to_clean:
                if column_name not in batch_df.columns:
                    print(f"  - WARNING: Column '{column_name}' not found in this batch. Skipping.")
                    continue

                print(f"  - De-identifying column: '{column_name}' using {IMPLEMENTATION_APPROACH} approach")

                total_in_batch = len(batch_df)

                # (row_index, text_content) pairs for the workers.
                row_data = list(batch_df[column_name].items())

                if IMPLEMENTATION_APPROACH == "queue":
                    processed_results = process_with_queue(row_data, total_in_batch)
                elif IMPLEMENTATION_APPROACH == "process_pool":
                    processed_results = process_with_process_pool(row_data, total_in_batch)
                else:  # Default: ThreadPoolExecutor with semaphore or rate limiting
                    processed_results = _deidentify_rows_threaded(row_data, total_in_batch)

                print()  # Newline after the progress line for this column

                # Put the results back in the original row order.
                processed_data = [processed_results[idx] for idx in batch_df.index]

                if REPLACE_ORIGINAL_COLUMN:
                    batch_df[column_name] = processed_data
                else:
                    batch_df[f"{column_name}_deidentified"] = processed_data

            try:
                _write_batch_atomically(batch_df, output_path)
                print(f"  - Saved batch to: '{output_path}'")
                processed_batches.append(batch_num)
            except Exception as e:
                print(f"  - ERROR: Could not save batch {batch_num}: {e}")
                failed_batches.append(batch_num)

    print("\n--- Processing Complete ---")
    if failed_batches:
        # Do not claim success when some batches could not be written.
        print(f"WARNING: {len(failed_batches)} batch(es) could not be saved: "
              f"{', '.join(map(str, failed_batches))}. Re-run to retry them.")
    else:
        print(f"All batches have been processed and saved with prefix '{output_prefix}'.")

    if processed_batches:
        print(f"\nProcessed batches: {', '.join(map(str, processed_batches))}")
    if skipped_batches:
        print(f"Skipped batches (already existed): {', '.join(map(str, skipped_batches))}")

    print(f"\nOutput files are located in: {output_dir}")
    print(f"File naming pattern: {output_prefix}_part_X.csv where X is the batch number")

    # List only files that follow the batch naming pattern, so unrelated files
    # that merely share the prefix (e.g. the input file) are not reported.
    part_prefix = f"{output_prefix}_part_"
    existing_output_files = [
        name for name in os.listdir(output_dir)
        if name.startswith(part_prefix) and name.endswith('.csv')
    ]
    if existing_output_files:
        print(f"\nFound {len(existing_output_files)} output files:")
        for file in sorted(existing_output_files):
            file_path = os.path.join(output_dir, file)
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB
            print(f"  - {file} ({file_size:.2f} MB)")
    else:
        print(f"\nNo output files found with prefix '{output_prefix}' in {output_dir}")

## 6. Run the Process

Execute the main function. This will start the process using the file and columns you specified in the Configuration section of the first notebook. If you run this cell a second time, any batch whose output file already exists is skipped, so only the missing batches are processed.

In [None]:
# Start de-identification with the input path, output prefix and column list
# configured in the first notebook.
process_large_csv_complete(INPUT_CSV_PATH, OUTPUT_PREFIX, COLUMNS_TO_CLEAN)

## 7. Troubleshooting

If you're still experiencing issues with Ollama and parallel processing, here are some additional approaches to try:

1. **Further reduce concurrency**: Try setting `MAX_CONCURRENT_REQUESTS` to 1 or 2
2. **Increase delay between requests**: Modify the rate limiter to add more delay (for example, lower `RATE_LIMIT_CALLS` or raise `RATE_LIMIT_PERIOD`)
3. **Use a different model**: Some models may handle concurrent requests better than others
4. **Run Ollama with more resources**: If possible, allocate more CPU/memory to Ollama
5. **Use a different implementation approach**: Change `IMPLEMENTATION_APPROACH` to try each of the approaches (semaphore, rate_limit, queue, process_pool) and see which works best for your system

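You can also adjust how many requests Ollama itself handles in parallel: recent versions read the `OLLAMA_NUM_PARALLEL` environment variable when the server starts (check the documentation for your version), which may improve handling of concurrent requests.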

In [None]:
# Example of modifying settings for troubleshooting
def troubleshoot_with_minimal_concurrency(max_concurrent=1, rate_limit_calls=1, rate_limit_period=2):
    """
    Lower the concurrency settings and rebuild the semaphore and rate limiter.

    Rebinds the module-level MAX_CONCURRENT_REQUESTS, RATE_LIMIT_CALLS and
    RATE_LIMIT_PERIOD, then replaces `api_semaphore` and `rate_limiter` with
    new objects built from those values. The defaults give one request at a
    time and 1 call every 2 seconds.

    Args:
        max_concurrent: New value for MAX_CONCURRENT_REQUESTS; must be >= 1.
        rate_limit_calls: New value for RATE_LIMIT_CALLS.
        rate_limit_period: New value for RATE_LIMIT_PERIOD.

    Raises:
        ValueError: If max_concurrent is below 1. A semaphore with no permits
            would block every request, so nothing is changed in that case.
    """
    global MAX_CONCURRENT_REQUESTS, RATE_LIMIT_CALLS, RATE_LIMIT_PERIOD
    global api_semaphore, rate_limiter

    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    # Remember the previous values so the change can be reported below.
    original_max_concurrent = MAX_CONCURRENT_REQUESTS
    original_rate_limit_calls = RATE_LIMIT_CALLS
    original_rate_limit_period = RATE_LIMIT_PERIOD

    MAX_CONCURRENT_REQUESTS = max_concurrent
    RATE_LIMIT_CALLS = rate_limit_calls
    RATE_LIMIT_PERIOD = rate_limit_period

    # The existing semaphore and rate limiter were built from the old values,
    # so they have to be recreated for the new settings to take effect.
    api_semaphore = threading.Semaphore(MAX_CONCURRENT_REQUESTS)
    rate_limiter = RateLimiter(RATE_LIMIT_CALLS, RATE_LIMIT_PERIOD)

    print("Applied minimal concurrency settings:")
    print(f"  - MAX_CONCURRENT_REQUESTS: {MAX_CONCURRENT_REQUESTS} (was {original_max_concurrent})")
    print(f"  - RATE_LIMIT_CALLS: {RATE_LIMIT_CALLS} (was {original_rate_limit_calls})")
    print(f"  - RATE_LIMIT_PERIOD: {RATE_LIMIT_PERIOD} (was {original_rate_limit_period})")
    print("\nNow you can run the process_large_csv_complete function again with these settings.")

In [None]:
# Uncomment and run this cell if you're still experiencing issues
# troubleshoot_with_minimal_concurrency()