### We initially built the solution with pandas and other libraries to prototype the approach. Once it was working, we re-implemented it in Spark.
##### We also tested batch (chunked) processing and its effect on performance.

In [4]:
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm
import numpy as np

def process_batch(batch_df):
    """Transform one chunk of raw job postings into the cleaned output schema.

    The input frame is read-only: no columns are added to or changed on
    ``batch_df``.

    Parameters:
    batch_df (pandas.DataFrame): Raw rows. Must contain the columns
        'Salary Range', 'Job Description', 'skills', 'Company Profile',
        'location', 'Country', 'Experience' and 'Job Posting Date'.

    Returns:
    pandas.DataFrame: One row per input row (same index) with the columns
        job_description, job_skills, job_industry, job_location,
        experience_level, min_salary, max_salary and job_posting_date.
        Salary ranges that cannot be parsed yield NaN in both salary
        columns; empty strings and the literal 'NaN' become ``pd.NA``.
    """
    missing = float('nan')

    def parse_amount(text):
        """Convert one salary bound such as '59K' or '1.5K' to a float."""
        text = text.strip()
        if text.endswith('K'):
            # Scale numerically rather than appending '000', so that
            # decimal thousands like '1.5K' become 1500 and not 1.5.
            return float(text[:-1]) * 1000
        return float(text)

    def extract_salary_range(salary_str):
        """Split a '<min>-<max>' salary string into a (min, max) pair."""
        cleaned = str(salary_str)
        for symbol in ('$', '£', '€', ','):
            cleaned = cleaned.replace(symbol, '')
        parts = cleaned.split('-')
        if len(parts) != 2:
            return missing, missing
        try:
            return parse_amount(parts[0]), parse_amount(parts[1])
        except ValueError:
            # Only a malformed number is an expected failure here; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            return missing, missing

    # Parse once per row into plain tuples; building a Series per row via
    # DataFrame.apply is much slower and breaks on an empty batch.
    salary_pairs = [extract_salary_range(value) for value in batch_df['Salary Range']]
    min_salary = pd.Series([low for low, _ in salary_pairs],
                           index=batch_df.index, dtype='float64')
    max_salary = pd.Series([high for _, high in salary_pairs],
                           index=batch_df.index, dtype='float64')

    # Create new dataframe with selected columns
    new_batch = pd.DataFrame({
        'job_description': batch_df['Job Description'],
        'job_skills': batch_df['skills'],
        'job_industry': batch_df['Company Profile'],
        'job_location': batch_df['location'] + ', ' + batch_df['Country'],
        'experience_level': batch_df['Experience'],
        'min_salary': min_salary,
        'max_salary': max_salary,
        'job_posting_date': pd.to_datetime(batch_df['Job Posting Date']).dt.date
    })

    # Normalise placeholder strings to a real missing-value marker
    new_batch = new_batch.replace('', pd.NA)
    new_batch = new_batch.replace('NaN', pd.NA)

    return new_batch

def process_csv(input_file, output_file, batch_size=1000):
    """
    Process CSV file in batches to select and rename specific columns.

    The input is read in chunks of ``batch_size`` rows, each chunk is
    transformed by ``process_batch``, and the results are concatenated and
    written to ``output_file``. Progress and summary statistics are printed
    to stdout. Note that all processed chunks are held in memory until the
    final concatenation.

    Parameters:
    input_file (str): Path to input CSV file
    output_file (str): Path to output CSV file
    batch_size (int): Number of rows to process in each batch (must be >= 1)

    Returns:
    pandas.DataFrame or None: The combined processed data, or None if any
        step failed (the error is printed rather than raised).
    """
    print(f"\n Starting CSV processing at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Reading file: {input_file}")

    try:
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        # Count physical lines to size the progress bar. Binary mode avoids
        # locale-dependent decode errors, and the context manager closes the
        # handle. This is only an estimate: quoted fields with embedded
        # newlines span several lines but are a single CSV record.
        with open(input_file, 'rb') as raw_file:
            total_rows = max(sum(1 for _ in raw_file) - 1, 0)  # subtract header row
        print(f" Total rows to process: {total_rows:,}")

        processed_dfs = []
        rows_processed = 0

        # Calculate number of batches for progress bar
        n_batches = int(np.ceil(total_rows / batch_size))

        print("\n Processing batches:")
        # The chunked reader is a context manager, so the underlying file is
        # closed even if a batch fails part-way through.
        with pd.read_csv(input_file, chunksize=batch_size) as chunks:
            progress = tqdm(chunks, total=n_batches, desc="Processing")
            for batch_count, chunk in enumerate(progress, start=1):
                processed_dfs.append(process_batch(chunk))
                rows_processed += len(chunk)

                # Print batch statistics every 5 batches, reporting the rows
                # actually read rather than batch_count * batch_size.
                if batch_count % 5 == 0:
                    print(f"\n Processed {rows_processed:,} rows...")

        print("\n Combining all batches...")
        final_df = pd.concat(processed_dfs, ignore_index=True)

        # Print final statistics
        print("\n Processing Statistics:")
        print(f"Total records processed: {len(final_df):,}")
        print(f"Records with valid salary range: {final_df['min_salary'].notna().sum():,}")
        print(f"Unique locations: {final_df['job_location'].nunique():,}")
        print(f"Date range: {final_df['job_posting_date'].min()} to {final_df['job_posting_date'].max()}")

        print(f"\n Saving processed data to {output_file}")
        # Save to new CSV file
        final_df.to_csv(output_file, index=False)

        print(f" Processing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        return final_df

    except Exception as e:
        # Top-level boundary for this script: report the failure and signal
        # it to the caller with None instead of propagating.
        print(f" Error processing CSV file: {str(e)}")
        return None

# Example usage: run the full pipeline when this cell/script is executed
# directly. The names assigned here are module-level, so they remain
# available afterwards (e.g. `processed_df` for interactive inspection).
if __name__ == "__main__":
    input_file = "/home/ubuntu/job_descriptions.csv"  # Source CSV; replace with your input file path
    output_file = "/home/ubuntu/job_descriptions_modified.csv"   # Destination CSV; replace with your desired output path
    
    # batch_size is the number of rows read per chunk; adjust it to your system's memory capacity.
    # process_csv returns the combined DataFrame, or None if processing failed.
    processed_df = process_csv(input_file, output_file, batch_size=1000)


🚀 Starting CSV processing at 2024-11-24 10:35:25
📂 Reading file: /home/ubuntu/job_descriptions.csv
📊 Total rows to process: 1,615,940

⏳ Processing batches:


Processing:   0%|          | 8/1616 [00:00<01:33, 17.25it/s]


📈 Processed 5,000 rows...


Processing:   1%|          | 12/1616 [00:00<01:35, 16.76it/s]


📈 Processed 10,000 rows...


Processing:   1%|          | 18/1616 [00:01<01:32, 17.19it/s]


📈 Processed 15,000 rows...


Processing:   1%|▏         | 22/1616 [00:01<01:27, 18.29it/s]


📈 Processed 20,000 rows...


Processing:   2%|▏         | 28/1616 [00:01<01:35, 16.60it/s]


📈 Processed 25,000 rows...


Processing:   2%|▏         | 32/1616 [00:01<01:28, 17.99it/s]


📈 Processed 30,000 rows...


Processing:   2%|▏         | 38/1616 [00:02<01:28, 17.79it/s]


📈 Processed 35,000 rows...


Processing:   3%|▎         | 42/1616 [00:02<01:32, 16.97it/s]


📈 Processed 40,000 rows...


Processing:   3%|▎         | 48/1616 [00:02<01:34, 16.62it/s]


📈 Processed 45,000 rows...


Processing:   3%|▎         | 52/1616 [00:03<01:35, 16.38it/s]


📈 Processed 50,000 rows...


Processing:   4%|▎         | 58/1616 [00:03<01:26, 18.08it/s]


📈 Processed 55,000 rows...


Processing:   4%|▍         | 62/1616 [00:03<01:29, 17.31it/s]


📈 Processed 60,000 rows...


Processing:   4%|▍         | 68/1616 [00:03<01:29, 17.27it/s]


📈 Processed 65,000 rows...


Processing:   4%|▍         | 72/1616 [00:04<01:30, 17.02it/s]


📈 Processed 70,000 rows...


Processing:   5%|▍         | 78/1616 [00:04<01:31, 16.79it/s]


📈 Processed 75,000 rows...


Processing:   5%|▌         | 82/1616 [00:04<01:34, 16.24it/s]


📈 Processed 80,000 rows...


Processing:   5%|▌         | 88/1616 [00:05<01:23, 18.21it/s]


📈 Processed 85,000 rows...


Processing:   6%|▌         | 92/1616 [00:05<01:27, 17.38it/s]


📈 Processed 90,000 rows...


Processing:   6%|▌         | 98/1616 [00:05<01:26, 17.56it/s]


📈 Processed 95,000 rows...


Processing:   6%|▋         | 102/1616 [00:05<01:31, 16.62it/s]


📈 Processed 100,000 rows...


Processing:   7%|▋         | 106/1616 [00:06<02:02, 12.35it/s]


📈 Processed 105,000 rows...


Processing:   7%|▋         | 112/1616 [00:06<01:37, 15.38it/s]


📈 Processed 110,000 rows...


Processing:   7%|▋         | 116/1616 [00:07<01:45, 14.20it/s]


📈 Processed 115,000 rows...


Processing:   8%|▊         | 122/1616 [00:07<01:44, 14.27it/s]


📈 Processed 120,000 rows...


Processing:   8%|▊         | 128/1616 [00:07<01:42, 14.54it/s]


📈 Processed 125,000 rows...


Processing:   8%|▊         | 132/1616 [00:08<01:48, 13.71it/s]


📈 Processed 130,000 rows...


Processing:   8%|▊         | 136/1616 [00:08<01:35, 15.57it/s]


📈 Processed 135,000 rows...


Processing:   9%|▉         | 142/1616 [00:08<01:57, 12.49it/s]


📈 Processed 140,000 rows...


Processing:   9%|▉         | 148/1616 [00:09<01:34, 15.57it/s]


📈 Processed 145,000 rows...


Processing:   9%|▉         | 152/1616 [00:09<01:41, 14.45it/s]


📈 Processed 150,000 rows...


Processing:  10%|▉         | 158/1616 [00:10<01:41, 14.41it/s]


📈 Processed 155,000 rows...


Processing:  10%|█         | 162/1616 [00:10<01:44, 13.93it/s]


📈 Processed 160,000 rows...


Processing:  10%|█         | 166/1616 [00:10<01:29, 16.23it/s]


📈 Processed 165,000 rows...


Processing:  11%|█         | 170/1616 [00:10<01:37, 14.85it/s]


📈 Processed 170,000 rows...


Processing:  11%|█         | 178/1616 [00:11<01:29, 16.09it/s]


📈 Processed 175,000 rows...


Processing:  11%|█▏        | 182/1616 [00:11<01:36, 14.91it/s]


📈 Processed 180,000 rows...


Processing:  12%|█▏        | 188/1616 [00:12<01:34, 15.12it/s]


📈 Processed 185,000 rows...


Processing:  12%|█▏        | 192/1616 [00:12<01:42, 13.84it/s]


📈 Processed 190,000 rows...


Processing:  12%|█▏        | 196/1616 [00:12<01:51, 12.78it/s]


📈 Processed 195,000 rows...


Processing:  12%|█▎        | 202/1616 [00:13<01:26, 16.29it/s]


📈 Processed 200,000 rows...


Processing:  13%|█▎        | 206/1616 [00:13<01:32, 15.32it/s]


📈 Processed 205,000 rows...


Processing:  13%|█▎        | 212/1616 [00:13<01:33, 14.96it/s]


📈 Processed 210,000 rows...


Processing:  13%|█▎        | 218/1616 [00:14<01:33, 14.91it/s]


📈 Processed 215,000 rows...


Processing:  14%|█▎        | 222/1616 [00:14<01:38, 14.18it/s]


📈 Processed 220,000 rows...


Processing:  14%|█▍        | 226/1616 [00:14<01:26, 16.14it/s]


📈 Processed 225,000 rows...


Processing:  14%|█▍        | 230/1616 [00:15<01:32, 14.98it/s]


📈 Processed 230,000 rows...


Processing:  15%|█▍        | 238/1616 [00:15<01:24, 16.36it/s]


📈 Processed 235,000 rows...


Processing:  15%|█▍        | 242/1616 [00:15<01:42, 13.37it/s]


📈 Processed 240,000 rows...


Processing:  15%|█▌        | 248/1616 [00:16<01:36, 14.19it/s]


📈 Processed 245,000 rows...


Processing:  15%|█▌        | 250/1616 [00:16<01:29, 15.21it/s]


📈 Processed 250,000 rows...


Processing:  16%|█▌        | 256/1616 [00:17<01:53, 12.02it/s]


📈 Processed 255,000 rows...


Processing:  16%|█▌        | 262/1616 [00:17<01:25, 15.75it/s]


📈 Processed 260,000 rows...


Processing:  16%|█▋        | 266/1616 [00:17<01:31, 14.82it/s]


📈 Processed 265,000 rows...


Processing:  17%|█▋        | 272/1616 [00:18<01:29, 15.10it/s]


📈 Processed 270,000 rows...


Processing:  17%|█▋        | 278/1616 [00:18<01:28, 15.12it/s]


📈 Processed 275,000 rows...


Processing:  17%|█▋        | 282/1616 [00:18<01:35, 14.00it/s]


📈 Processed 280,000 rows...


Processing:  18%|█▊        | 286/1616 [00:19<01:21, 16.27it/s]


📈 Processed 285,000 rows...


Processing:  18%|█▊        | 292/1616 [00:19<01:39, 13.31it/s]


📈 Processed 290,000 rows...


Processing:  18%|█▊        | 298/1616 [00:19<01:20, 16.37it/s]


📈 Processed 295,000 rows...


Processing:  19%|█▊        | 302/1616 [00:20<01:30, 14.54it/s]


📈 Processed 300,000 rows...


Processing:  19%|█▉        | 308/1616 [00:20<01:33, 14.03it/s]


📈 Processed 305,000 rows...


Processing:  19%|█▉        | 312/1616 [00:20<01:38, 13.29it/s]


📈 Processed 310,000 rows...


Processing:  20%|█▉        | 316/1616 [00:21<01:43, 12.57it/s]


📈 Processed 315,000 rows...


Processing:  20%|█▉        | 322/1616 [00:21<01:19, 16.24it/s]


📈 Processed 320,000 rows...


Processing:  20%|██        | 326/1616 [00:21<01:30, 14.33it/s]


📈 Processed 325,000 rows...


Processing:  21%|██        | 332/1616 [00:22<01:22, 15.50it/s]


📈 Processed 330,000 rows...


Processing:  21%|██        | 338/1616 [00:22<01:22, 15.57it/s]


📈 Processed 335,000 rows...


Processing:  21%|██        | 342/1616 [00:23<01:27, 14.57it/s]


📈 Processed 340,000 rows...


Processing:  21%|██▏       | 346/1616 [00:23<01:17, 16.38it/s]


📈 Processed 345,000 rows...


Processing:  22%|██▏       | 350/1616 [00:23<01:26, 14.65it/s]


📈 Processed 350,000 rows...


Processing:  22%|██▏       | 358/1616 [00:24<01:17, 16.16it/s]


📈 Processed 355,000 rows...


Processing:  22%|██▏       | 362/1616 [00:24<01:24, 14.86it/s]


📈 Processed 360,000 rows...


Processing:  23%|██▎       | 368/1616 [00:24<01:25, 14.66it/s]


📈 Processed 365,000 rows...


Processing:  23%|██▎       | 372/1616 [00:25<01:30, 13.67it/s]


📈 Processed 370,000 rows...


Processing:  23%|██▎       | 376/1616 [00:25<01:17, 16.02it/s]


📈 Processed 375,000 rows...


Processing:  24%|██▎       | 380/1616 [00:25<01:21, 15.13it/s]


📈 Processed 380,000 rows...


Processing:  24%|██▍       | 388/1616 [00:26<01:14, 16.42it/s]


📈 Processed 385,000 rows...


Processing:  24%|██▍       | 392/1616 [00:26<01:22, 14.80it/s]


📈 Processed 390,000 rows...


Processing:  25%|██▍       | 398/1616 [00:26<01:23, 14.54it/s]


📈 Processed 395,000 rows...


Processing:  25%|██▍       | 402/1616 [00:27<01:26, 13.99it/s]


📈 Processed 400,000 rows...


Processing:  25%|██▌       | 406/1616 [00:27<01:14, 16.15it/s]


📈 Processed 405,000 rows...


Processing:  25%|██▌       | 412/1616 [00:27<01:31, 13.17it/s]


📈 Processed 410,000 rows...


Processing:  26%|██▌       | 418/1616 [00:28<01:14, 15.98it/s]


📈 Processed 415,000 rows...


Processing:  26%|██▌       | 422/1616 [00:28<01:21, 14.56it/s]


📈 Processed 420,000 rows...


Processing:  26%|██▋       | 428/1616 [00:28<01:17, 15.27it/s]


📈 Processed 425,000 rows...


Processing:  27%|██▋       | 430/1616 [00:29<01:13, 16.08it/s]


📈 Processed 430,000 rows...


Processing:  27%|██▋       | 436/1616 [00:29<01:34, 12.47it/s]


📈 Processed 435,000 rows...


Processing:  27%|██▋       | 442/1616 [00:29<01:12, 16.29it/s]


📈 Processed 440,000 rows...


Processing:  28%|██▊       | 446/1616 [00:30<01:15, 15.52it/s]


📈 Processed 445,000 rows...


Processing:  28%|██▊       | 452/1616 [00:30<01:13, 15.73it/s]


📈 Processed 450,000 rows...


Processing:  28%|██▊       | 458/1616 [00:31<01:16, 15.08it/s]


📈 Processed 455,000 rows...


Processing:  29%|██▊       | 462/1616 [00:31<01:20, 14.25it/s]


📈 Processed 460,000 rows...


Processing:  29%|██▉       | 466/1616 [00:31<01:09, 16.45it/s]


📈 Processed 465,000 rows...


Processing:  29%|██▉       | 470/1616 [00:31<01:16, 14.93it/s]


📈 Processed 470,000 rows...


Processing:  30%|██▉       | 478/1616 [00:32<01:09, 16.27it/s]


📈 Processed 475,000 rows...


Processing:  30%|██▉       | 482/1616 [00:32<01:16, 14.81it/s]


📈 Processed 480,000 rows...


Processing:  30%|███       | 488/1616 [00:33<01:14, 15.12it/s]


📈 Processed 485,000 rows...


Processing:  30%|███       | 492/1616 [00:33<01:21, 13.87it/s]


📈 Processed 490,000 rows...


Processing:  31%|███       | 496/1616 [00:33<01:28, 12.68it/s]


📈 Processed 495,000 rows...


Processing:  31%|███       | 502/1616 [00:34<01:08, 16.33it/s]


📈 Processed 500,000 rows...


Processing:  31%|███▏      | 506/1616 [00:34<01:10, 15.68it/s]


📈 Processed 505,000 rows...


Processing:  32%|███▏      | 512/1616 [00:34<01:12, 15.22it/s]


📈 Processed 510,000 rows...


Processing:  32%|███▏      | 518/1616 [00:35<01:12, 15.20it/s]


📈 Processed 515,000 rows...


Processing:  32%|███▏      | 522/1616 [00:35<01:23, 13.07it/s]


📈 Processed 520,000 rows...


Processing:  33%|███▎      | 526/1616 [00:35<01:10, 15.57it/s]


📈 Processed 525,000 rows...


Processing:  33%|███▎      | 530/1616 [00:36<01:14, 14.57it/s]


📈 Processed 530,000 rows...


Processing:  33%|███▎      | 538/1616 [00:36<01:08, 15.67it/s]


📈 Processed 535,000 rows...


Processing:  34%|███▎      | 542/1616 [00:36<01:09, 15.36it/s]


📈 Processed 540,000 rows...


Processing:  34%|███▍      | 548/1616 [00:37<01:08, 15.54it/s]


📈 Processed 545,000 rows...


Processing:  34%|███▍      | 550/1616 [00:37<01:05, 16.39it/s]


📈 Processed 550,000 rows...


Processing:  34%|███▍      | 556/1616 [00:37<01:27, 12.15it/s]


📈 Processed 555,000 rows...


Processing:  35%|███▍      | 562/1616 [00:38<01:09, 15.20it/s]


📈 Processed 560,000 rows...


Processing:  35%|███▌      | 566/1616 [00:38<01:12, 14.41it/s]


📈 Processed 565,000 rows...


Processing:  35%|███▌      | 572/1616 [00:39<01:10, 14.91it/s]


📈 Processed 570,000 rows...


Processing:  36%|███▌      | 578/1616 [00:39<01:08, 15.13it/s]


📈 Processed 575,000 rows...


Processing:  36%|███▌      | 580/1616 [00:39<01:04, 16.05it/s]


📈 Processed 580,000 rows...


Processing:  36%|███▋      | 586/1616 [00:40<01:28, 11.59it/s]


📈 Processed 585,000 rows...


Processing:  37%|███▋      | 592/1616 [00:40<01:06, 15.43it/s]


📈 Processed 590,000 rows...


Processing:  37%|███▋      | 596/1616 [00:40<01:12, 14.17it/s]


📈 Processed 595,000 rows...


Processing:  37%|███▋      | 602/1616 [00:41<01:10, 14.30it/s]


📈 Processed 600,000 rows...


Processing:  38%|███▊      | 608/1616 [00:41<01:13, 13.80it/s]


📈 Processed 605,000 rows...


Processing:  38%|███▊      | 612/1616 [00:42<01:13, 13.75it/s]


📈 Processed 610,000 rows...


Processing:  38%|███▊      | 616/1616 [00:42<01:02, 15.93it/s]


📈 Processed 615,000 rows...


Processing:  38%|███▊      | 622/1616 [00:42<01:16, 12.92it/s]


📈 Processed 620,000 rows...


Processing:  39%|███▉      | 628/1616 [00:43<01:00, 16.41it/s]


📈 Processed 625,000 rows...


Processing:  39%|███▉      | 632/1616 [00:43<01:09, 14.11it/s]


📈 Processed 630,000 rows...


Processing:  39%|███▉      | 638/1616 [00:43<01:08, 14.24it/s]


📈 Processed 635,000 rows...


Processing:  40%|███▉      | 642/1616 [00:44<01:11, 13.70it/s]


📈 Processed 640,000 rows...


Processing:  40%|███▉      | 646/1616 [00:44<01:00, 15.96it/s]


📈 Processed 645,000 rows...


Processing:  40%|████      | 650/1616 [00:44<01:07, 14.40it/s]


📈 Processed 650,000 rows...


Processing:  41%|████      | 658/1616 [00:45<01:00, 15.94it/s]


📈 Processed 655,000 rows...


Processing:  41%|████      | 662/1616 [00:45<01:06, 14.37it/s]


📈 Processed 660,000 rows...


Processing:  41%|████▏     | 668/1616 [00:45<01:03, 14.91it/s]


📈 Processed 665,000 rows...


Processing:  41%|████▏     | 670/1616 [00:46<00:59, 15.90it/s]


📈 Processed 670,000 rows...


Processing:  42%|████▏     | 676/1616 [00:46<01:15, 12.48it/s]


📈 Processed 675,000 rows...


Processing:  42%|████▏     | 682/1616 [00:46<00:58, 16.02it/s]


📈 Processed 680,000 rows...


Processing:  42%|████▏     | 686/1616 [00:47<01:06, 14.08it/s]


📈 Processed 685,000 rows...


Processing:  43%|████▎     | 692/1616 [00:47<01:02, 14.72it/s]


📈 Processed 690,000 rows...


Processing:  43%|████▎     | 698/1616 [00:48<01:09, 13.18it/s]


📈 Processed 695,000 rows...


Processing:  43%|████▎     | 700/1616 [00:48<01:03, 14.41it/s]


📈 Processed 700,000 rows...


Processing:  44%|████▎     | 706/1616 [00:48<01:15, 12.07it/s]


📈 Processed 705,000 rows...


Processing:  44%|████▍     | 712/1616 [00:49<00:57, 15.82it/s]


📈 Processed 710,000 rows...


Processing:  44%|████▍     | 716/1616 [00:49<01:03, 14.08it/s]


📈 Processed 715,000 rows...


Processing:  45%|████▍     | 722/1616 [00:49<00:59, 15.11it/s]


📈 Processed 720,000 rows...


Processing:  45%|████▌     | 728/1616 [00:50<01:03, 13.90it/s]


📈 Processed 725,000 rows...


Processing:  45%|████▌     | 732/1616 [00:50<01:06, 13.22it/s]


📈 Processed 730,000 rows...


Processing:  46%|████▌     | 736/1616 [00:50<00:56, 15.59it/s]


📈 Processed 735,000 rows...


Processing:  46%|████▌     | 740/1616 [00:51<00:59, 14.81it/s]


📈 Processed 740,000 rows...


Processing:  46%|████▋     | 748/1616 [00:51<00:53, 16.15it/s]


📈 Processed 745,000 rows...


Processing:  47%|████▋     | 752/1616 [00:52<01:00, 14.29it/s]


📈 Processed 750,000 rows...


Processing:  47%|████▋     | 758/1616 [00:52<00:58, 14.55it/s]


📈 Processed 755,000 rows...


Processing:  47%|████▋     | 760/1616 [00:52<00:54, 15.57it/s]


📈 Processed 760,000 rows...


Processing:  47%|████▋     | 766/1616 [00:53<01:10, 11.98it/s]


📈 Processed 765,000 rows...


Processing:  48%|████▊     | 772/1616 [00:53<00:55, 15.30it/s]


📈 Processed 770,000 rows...


Processing:  48%|████▊     | 776/1616 [00:53<00:57, 14.56it/s]


📈 Processed 775,000 rows...


Processing:  48%|████▊     | 782/1616 [00:54<00:54, 15.44it/s]


📈 Processed 780,000 rows...


Processing:  49%|████▉     | 788/1616 [00:54<00:55, 15.04it/s]


📈 Processed 785,000 rows...


Processing:  49%|████▉     | 792/1616 [00:55<00:59, 13.90it/s]


📈 Processed 790,000 rows...


Processing:  49%|████▉     | 796/1616 [00:55<00:50, 16.12it/s]


📈 Processed 795,000 rows...


Processing:  50%|████▉     | 800/1616 [00:55<00:58, 13.97it/s]


📈 Processed 800,000 rows...


Processing:  50%|█████     | 808/1616 [00:56<00:51, 15.82it/s]


📈 Processed 805,000 rows...


Processing:  50%|█████     | 812/1616 [00:56<00:55, 14.36it/s]


📈 Processed 810,000 rows...


Processing:  51%|█████     | 818/1616 [00:56<00:56, 14.11it/s]


📈 Processed 815,000 rows...


Processing:  51%|█████     | 822/1616 [00:57<00:58, 13.53it/s]


📈 Processed 820,000 rows...


Processing:  51%|█████     | 826/1616 [00:57<00:49, 16.04it/s]


📈 Processed 825,000 rows...


Processing:  51%|█████▏    | 830/1616 [00:57<00:55, 14.29it/s]


📈 Processed 830,000 rows...


Processing:  52%|█████▏    | 838/1616 [00:58<00:51, 15.20it/s]


📈 Processed 835,000 rows...


Processing:  52%|█████▏    | 842/1616 [00:58<00:55, 13.89it/s]


📈 Processed 840,000 rows...


Processing:  52%|█████▏    | 848/1616 [00:59<00:53, 14.24it/s]


📈 Processed 845,000 rows...


Processing:  53%|█████▎    | 852/1616 [00:59<00:57, 13.37it/s]


📈 Processed 850,000 rows...


Processing:  53%|█████▎    | 856/1616 [00:59<00:48, 15.75it/s]


📈 Processed 855,000 rows...


Processing:  53%|█████▎    | 860/1616 [00:59<00:51, 14.57it/s]


📈 Processed 860,000 rows...


Processing:  54%|█████▎    | 868/1616 [01:00<00:47, 15.90it/s]


📈 Processed 865,000 rows...


Processing:  54%|█████▍    | 872/1616 [01:00<00:53, 13.99it/s]


📈 Processed 870,000 rows...


Processing:  54%|█████▍    | 878/1616 [01:01<00:52, 14.00it/s]


📈 Processed 875,000 rows...


Processing:  55%|█████▍    | 882/1616 [01:01<00:56, 13.05it/s]


📈 Processed 880,000 rows...


Processing:  55%|█████▍    | 886/1616 [01:01<00:56, 12.98it/s]


📈 Processed 885,000 rows...


Processing:  55%|█████▌    | 892/1616 [01:02<00:45, 15.99it/s]


📈 Processed 890,000 rows...


Processing:  55%|█████▌    | 896/1616 [01:02<00:48, 14.92it/s]


📈 Processed 895,000 rows...


Processing:  56%|█████▌    | 902/1616 [01:02<00:44, 16.00it/s]


📈 Processed 900,000 rows...


Processing:  56%|█████▌    | 908/1616 [01:03<00:44, 15.90it/s]


📈 Processed 905,000 rows...


Processing:  56%|█████▋    | 912/1616 [01:03<00:45, 15.57it/s]


📈 Processed 910,000 rows...


Processing:  57%|█████▋    | 916/1616 [01:03<00:41, 16.97it/s]


📈 Processed 915,000 rows...


Processing:  57%|█████▋    | 922/1616 [01:04<00:47, 14.68it/s]


📈 Processed 920,000 rows...


Processing:  57%|█████▋    | 928/1616 [01:04<00:40, 17.02it/s]


📈 Processed 925,000 rows...


Processing:  58%|█████▊    | 932/1616 [01:04<00:42, 15.91it/s]


📈 Processed 930,000 rows...


Processing:  58%|█████▊    | 938/1616 [01:05<00:41, 16.38it/s]


📈 Processed 935,000 rows...


Processing:  58%|█████▊    | 942/1616 [01:05<00:44, 15.05it/s]


📈 Processed 940,000 rows...


Processing:  59%|█████▊    | 946/1616 [01:05<00:46, 14.37it/s]


📈 Processed 945,000 rows...


Processing:  59%|█████▉    | 952/1616 [01:06<00:39, 16.93it/s]


📈 Processed 950,000 rows...


Processing:  59%|█████▉    | 956/1616 [01:06<00:42, 15.49it/s]


📈 Processed 955,000 rows...


Processing:  60%|█████▉    | 962/1616 [01:06<00:40, 16.07it/s]


📈 Processed 960,000 rows...


Processing:  60%|█████▉    | 968/1616 [01:07<00:39, 16.41it/s]


📈 Processed 965,000 rows...


Processing:  60%|██████    | 972/1616 [01:07<00:40, 15.90it/s]


📈 Processed 970,000 rows...


Processing:  60%|██████    | 976/1616 [01:07<00:37, 17.25it/s]


📈 Processed 975,000 rows...


Processing:  61%|██████    | 982/1616 [01:07<00:42, 15.09it/s]


📈 Processed 980,000 rows...


Processing:  61%|██████    | 988/1616 [01:08<00:38, 16.31it/s]


📈 Processed 985,000 rows...


Processing:  61%|██████▏   | 992/1616 [01:08<00:39, 15.68it/s]


📈 Processed 990,000 rows...


Processing:  62%|██████▏   | 998/1616 [01:08<00:38, 16.23it/s]


📈 Processed 995,000 rows...


Processing:  62%|██████▏   | 1002/1616 [01:09<00:39, 15.45it/s]


📈 Processed 1,000,000 rows...


Processing:  62%|██████▏   | 1006/1616 [01:09<00:41, 14.75it/s]


📈 Processed 1,005,000 rows...


Processing:  63%|██████▎   | 1012/1616 [01:09<00:36, 16.55it/s]


📈 Processed 1,010,000 rows...


Processing:  63%|██████▎   | 1016/1616 [01:10<00:37, 15.95it/s]


📈 Processed 1,015,000 rows...


Processing:  63%|██████▎   | 1022/1616 [01:10<00:37, 15.67it/s]


📈 Processed 1,020,000 rows...


Processing:  64%|██████▎   | 1028/1616 [01:10<00:36, 16.04it/s]


📈 Processed 1,025,000 rows...


Processing:  64%|██████▍   | 1032/1616 [01:11<00:37, 15.72it/s]


📈 Processed 1,030,000 rows...


Processing:  64%|██████▍   | 1036/1616 [01:11<00:35, 16.52it/s]


📈 Processed 1,035,000 rows...


Processing:  64%|██████▍   | 1042/1616 [01:11<00:38, 14.77it/s]


📈 Processed 1,040,000 rows...


Processing:  65%|██████▍   | 1048/1616 [01:12<00:32, 17.25it/s]


📈 Processed 1,045,000 rows...


Processing:  65%|██████▌   | 1052/1616 [01:12<00:34, 16.45it/s]


📈 Processed 1,050,000 rows...


Processing:  65%|██████▌   | 1058/1616 [01:12<00:32, 17.04it/s]


📈 Processed 1,055,000 rows...


Processing:  66%|██████▌   | 1062/1616 [01:13<00:35, 15.71it/s]


📈 Processed 1,060,000 rows...


Processing:  66%|██████▌   | 1066/1616 [01:13<00:36, 15.21it/s]


📈 Processed 1,065,000 rows...


Processing:  66%|██████▋   | 1072/1616 [01:13<00:30, 17.65it/s]


📈 Processed 1,070,000 rows...


Processing:  67%|██████▋   | 1076/1616 [01:13<00:32, 16.63it/s]


📈 Processed 1,075,000 rows...


Processing:  67%|██████▋   | 1082/1616 [01:14<00:31, 16.79it/s]


📈 Processed 1,080,000 rows...


Processing:  67%|██████▋   | 1088/1616 [01:14<00:32, 16.01it/s]


📈 Processed 1,085,000 rows...


Processing:  68%|██████▊   | 1092/1616 [01:14<00:33, 15.62it/s]


📈 Processed 1,090,000 rows...


Processing:  68%|██████▊   | 1096/1616 [01:15<00:30, 17.27it/s]


📈 Processed 1,095,000 rows...


Processing:  68%|██████▊   | 1102/1616 [01:15<00:35, 14.48it/s]


📈 Processed 1,100,000 rows...


Processing:  69%|██████▊   | 1108/1616 [01:15<00:29, 17.11it/s]


📈 Processed 1,105,000 rows...


Processing:  69%|██████▉   | 1112/1616 [01:16<00:32, 15.31it/s]


📈 Processed 1,110,000 rows...


Processing:  69%|██████▉   | 1118/1616 [01:16<00:31, 16.00it/s]


📈 Processed 1,115,000 rows...


Processing:  69%|██████▉   | 1122/1616 [01:16<00:32, 15.13it/s]


📈 Processed 1,120,000 rows...


Processing:  70%|██████▉   | 1126/1616 [01:17<00:33, 14.66it/s]


📈 Processed 1,125,000 rows...


Processing:  70%|███████   | 1132/1616 [01:17<00:27, 17.31it/s]


📈 Processed 1,130,000 rows...


Processing:  70%|███████   | 1136/1616 [01:17<00:29, 16.27it/s]


📈 Processed 1,135,000 rows...


Processing:  71%|███████   | 1142/1616 [01:18<00:29, 15.83it/s]


📈 Processed 1,140,000 rows...


Processing:  71%|███████   | 1148/1616 [01:18<00:31, 15.06it/s]


📈 Processed 1,145,000 rows...


Processing:  71%|███████▏  | 1152/1616 [01:18<00:30, 15.17it/s]


📈 Processed 1,150,000 rows...


Processing:  72%|███████▏  | 1156/1616 [01:19<00:27, 16.80it/s]


📈 Processed 1,155,000 rows...


Processing:  72%|███████▏  | 1162/1616 [01:19<00:31, 14.52it/s]


📈 Processed 1,160,000 rows...


Processing:  72%|███████▏  | 1168/1616 [01:19<00:26, 16.75it/s]


📈 Processed 1,165,000 rows...


Processing:  73%|███████▎  | 1172/1616 [01:20<00:30, 14.54it/s]


📈 Processed 1,170,000 rows...


Processing:  73%|███████▎  | 1178/1616 [01:20<00:29, 14.83it/s]


📈 Processed 1,175,000 rows...


Processing:  73%|███████▎  | 1182/1616 [01:20<00:29, 14.93it/s]


📈 Processed 1,180,000 rows...


Processing:  73%|███████▎  | 1186/1616 [01:21<00:25, 16.76it/s]


📈 Processed 1,185,000 rows...


Processing:  74%|███████▍  | 1192/1616 [01:21<00:28, 14.71it/s]


📈 Processed 1,190,000 rows...


Processing:  74%|███████▍  | 1198/1616 [01:21<00:24, 17.36it/s]


📈 Processed 1,195,000 rows...


Processing:  74%|███████▍  | 1202/1616 [01:22<00:25, 16.38it/s]


📈 Processed 1,200,000 rows...


Processing:  75%|███████▍  | 1208/1616 [01:22<00:25, 16.06it/s]


📈 Processed 1,205,000 rows...


Processing:  75%|███████▌  | 1212/1616 [01:22<00:25, 15.64it/s]


📈 Processed 1,210,000 rows...


Processing:  75%|███████▌  | 1216/1616 [01:22<00:23, 17.19it/s]


📈 Processed 1,215,000 rows...


Processing:  76%|███████▌  | 1222/1616 [01:23<00:26, 14.79it/s]


📈 Processed 1,220,000 rows...


Processing:  76%|███████▌  | 1228/1616 [01:23<00:22, 17.06it/s]


📈 Processed 1,225,000 rows...


Processing:  76%|███████▌  | 1232/1616 [01:23<00:23, 16.17it/s]


📈 Processed 1,230,000 rows...


Processing:  77%|███████▋  | 1238/1616 [01:24<00:23, 16.02it/s]


📈 Processed 1,235,000 rows...


Processing:  77%|███████▋  | 1242/1616 [01:24<00:24, 15.28it/s]


📈 Processed 1,240,000 rows...


Processing:  77%|███████▋  | 1246/1616 [01:24<00:21, 17.00it/s]


📈 Processed 1,245,000 rows...


Processing:  77%|███████▋  | 1252/1616 [01:25<00:24, 14.69it/s]


📈 Processed 1,250,000 rows...


Processing:  78%|███████▊  | 1258/1616 [01:25<00:21, 16.89it/s]


📈 Processed 1,255,000 rows...


Processing:  78%|███████▊  | 1262/1616 [01:25<00:22, 15.94it/s]


📈 Processed 1,260,000 rows...


Processing:  78%|███████▊  | 1268/1616 [01:26<00:21, 16.12it/s]


📈 Processed 1,265,000 rows...


Processing:  79%|███████▊  | 1272/1616 [01:26<00:21, 15.72it/s]


📈 Processed 1,270,000 rows...


Processing:  79%|███████▉  | 1276/1616 [01:26<00:19, 17.05it/s]


📈 Processed 1,275,000 rows...


Processing:  79%|███████▉  | 1282/1616 [01:27<00:22, 14.77it/s]


📈 Processed 1,280,000 rows...


Processing:  80%|███████▉  | 1288/1616 [01:27<00:18, 17.27it/s]


📈 Processed 1,285,000 rows...


Processing:  80%|███████▉  | 1292/1616 [01:27<00:20, 15.92it/s]


📈 Processed 1,290,000 rows...


Processing:  80%|████████  | 1298/1616 [01:28<00:19, 16.36it/s]


📈 Processed 1,295,000 rows...


Processing:  80%|████████  | 1300/1616 [01:28<00:19, 16.28it/s]


📈 Processed 1,300,000 rows...


Processing:  81%|████████  | 1306/1616 [01:28<00:23, 13.18it/s]


📈 Processed 1,305,000 rows...


Processing:  81%|████████  | 1312/1616 [01:29<00:18, 16.07it/s]


📈 Processed 1,310,000 rows...


Processing:  81%|████████▏ | 1316/1616 [01:29<00:19, 15.20it/s]


📈 Processed 1,315,000 rows...


Processing:  82%|████████▏ | 1322/1616 [01:29<00:18, 15.95it/s]


📈 Processed 1,320,000 rows...


Processing:  82%|████████▏ | 1328/1616 [01:30<00:17, 16.09it/s]


📈 Processed 1,325,000 rows...


Processing:  82%|████████▏ | 1332/1616 [01:30<00:18, 15.35it/s]


📈 Processed 1,330,000 rows...


Processing:  83%|████████▎ | 1336/1616 [01:30<00:16, 16.89it/s]


📈 Processed 1,335,000 rows...


Processing:  83%|████████▎ | 1342/1616 [01:31<00:19, 14.30it/s]


📈 Processed 1,340,000 rows...


Processing:  83%|████████▎ | 1348/1616 [01:31<00:16, 16.34it/s]


📈 Processed 1,345,000 rows...


Processing:  84%|████████▎ | 1352/1616 [01:31<00:16, 15.62it/s]


📈 Processed 1,350,000 rows...


Processing:  84%|████████▍ | 1358/1616 [01:32<00:16, 15.79it/s]


📈 Processed 1,355,000 rows...


Processing:  84%|████████▍ | 1362/1616 [01:32<00:16, 15.27it/s]


📈 Processed 1,360,000 rows...


Processing:  85%|████████▍ | 1366/1616 [01:32<00:14, 16.85it/s]


📈 Processed 1,365,000 rows...


Processing:  85%|████████▍ | 1372/1616 [01:33<00:16, 14.52it/s]


📈 Processed 1,370,000 rows...


Processing:  85%|████████▌ | 1378/1616 [01:33<00:14, 16.25it/s]


📈 Processed 1,375,000 rows...


Processing:  86%|████████▌ | 1382/1616 [01:33<00:17, 13.30it/s]


📈 Processed 1,380,000 rows...


Processing:  86%|████████▌ | 1388/1616 [01:34<00:16, 13.49it/s]


📈 Processed 1,385,000 rows...


Processing:  86%|████████▌ | 1392/1616 [01:34<00:17, 13.14it/s]


📈 Processed 1,390,000 rows...


Processing:  86%|████████▋ | 1396/1616 [01:34<00:14, 15.38it/s]


📈 Processed 1,395,000 rows...


Processing:  87%|████████▋ | 1400/1616 [01:35<00:15, 13.93it/s]


📈 Processed 1,400,000 rows...


Processing:  87%|████████▋ | 1408/1616 [01:35<00:13, 15.55it/s]


📈 Processed 1,405,000 rows...


Processing:  87%|████████▋ | 1412/1616 [01:35<00:14, 13.85it/s]


📈 Processed 1,410,000 rows...


Processing:  88%|████████▊ | 1418/1616 [01:36<00:13, 14.28it/s]


📈 Processed 1,415,000 rows...


Processing:  88%|████████▊ | 1420/1616 [01:36<00:12, 15.37it/s]


📈 Processed 1,420,000 rows...


Processing:  88%|████████▊ | 1426/1616 [01:37<00:16, 11.18it/s]


📈 Processed 1,425,000 rows...


Processing:  89%|████████▊ | 1432/1616 [01:37<00:12, 14.72it/s]


📈 Processed 1,430,000 rows...


Processing:  89%|████████▉ | 1436/1616 [01:37<00:12, 13.86it/s]


📈 Processed 1,435,000 rows...


Processing:  89%|████████▉ | 1442/1616 [01:38<00:11, 14.67it/s]


📈 Processed 1,440,000 rows...


Processing:  90%|████████▉ | 1448/1616 [01:38<00:12, 13.14it/s]


📈 Processed 1,445,000 rows...


Processing:  90%|████████▉ | 1452/1616 [01:39<00:13, 12.15it/s]


📈 Processed 1,450,000 rows...


Processing:  90%|█████████ | 1456/1616 [01:39<00:10, 14.59it/s]


📈 Processed 1,455,000 rows...


Processing:  90%|█████████ | 1460/1616 [01:39<00:11, 13.18it/s]


📈 Processed 1,460,000 rows...


Processing:  91%|█████████ | 1468/1616 [01:40<00:09, 15.33it/s]


📈 Processed 1,465,000 rows...


Processing:  91%|█████████ | 1472/1616 [01:40<00:10, 13.60it/s]


📈 Processed 1,470,000 rows...


Processing:  91%|█████████▏| 1478/1616 [01:41<00:09, 14.46it/s]


📈 Processed 1,475,000 rows...


Processing:  92%|█████████▏| 1480/1616 [01:41<00:08, 15.48it/s]


📈 Processed 1,480,000 rows...


Processing:  92%|█████████▏| 1486/1616 [01:41<00:10, 12.21it/s]


📈 Processed 1,485,000 rows...


Processing:  92%|█████████▏| 1492/1616 [01:42<00:07, 15.83it/s]


📈 Processed 1,490,000 rows...


Processing:  93%|█████████▎| 1496/1616 [01:42<00:08, 13.36it/s]


📈 Processed 1,495,000 rows...


Processing:  93%|█████████▎| 1502/1616 [01:42<00:08, 13.85it/s]


📈 Processed 1,500,000 rows...


Processing:  93%|█████████▎| 1508/1616 [01:43<00:08, 13.47it/s]


📈 Processed 1,505,000 rows...


Processing:  94%|█████████▎| 1512/1616 [01:43<00:08, 11.59it/s]


📈 Processed 1,510,000 rows...


Processing:  94%|█████████▍| 1516/1616 [01:43<00:07, 14.04it/s]


📈 Processed 1,515,000 rows...


Processing:  94%|█████████▍| 1522/1616 [01:44<00:06, 14.56it/s]


📈 Processed 1,520,000 rows...


Processing:  95%|█████████▍| 1528/1616 [01:44<00:05, 14.79it/s]


📈 Processed 1,525,000 rows...


Processing:  95%|█████████▍| 1532/1616 [01:45<00:06, 12.50it/s]


📈 Processed 1,530,000 rows...


Processing:  95%|█████████▌| 1538/1616 [01:45<00:05, 14.00it/s]


📈 Processed 1,535,000 rows...


Processing:  95%|█████████▌| 1540/1616 [01:45<00:05, 15.04it/s]


📈 Processed 1,540,000 rows...


Processing:  96%|█████████▌| 1546/1616 [01:46<00:06, 11.52it/s]


📈 Processed 1,545,000 rows...


Processing:  96%|█████████▌| 1552/1616 [01:46<00:04, 15.33it/s]


📈 Processed 1,550,000 rows...


Processing:  96%|█████████▋| 1558/1616 [01:47<00:04, 14.28it/s]


📈 Processed 1,555,000 rows...


Processing:  97%|█████████▋| 1562/1616 [01:47<00:04, 12.51it/s]


📈 Processed 1,560,000 rows...


Processing:  97%|█████████▋| 1568/1616 [01:48<00:03, 12.80it/s]


📈 Processed 1,565,000 rows...


Processing:  97%|█████████▋| 1572/1616 [01:48<00:03, 11.54it/s]


📈 Processed 1,570,000 rows...


Processing:  98%|█████████▊| 1576/1616 [01:48<00:02, 14.28it/s]


📈 Processed 1,575,000 rows...


Processing:  98%|█████████▊| 1582/1616 [01:49<00:02, 14.46it/s]


📈 Processed 1,580,000 rows...


Processing:  98%|█████████▊| 1588/1616 [01:49<00:01, 14.49it/s]


📈 Processed 1,585,000 rows...


Processing:  99%|█████████▊| 1592/1616 [01:49<00:01, 13.73it/s]


📈 Processed 1,590,000 rows...


Processing:  99%|█████████▉| 1598/1616 [01:50<00:01, 14.06it/s]


📈 Processed 1,595,000 rows...


Processing:  99%|█████████▉| 1600/1616 [01:50<00:01, 15.28it/s]


📈 Processed 1,600,000 rows...


Processing:  99%|█████████▉| 1606/1616 [01:51<00:00, 11.49it/s]


📈 Processed 1,605,000 rows...


Processing: 100%|█████████▉| 1612/1616 [01:51<00:00, 15.42it/s]


📈 Processed 1,610,000 rows...


Processing: 100%|██████████| 1616/1616 [01:51<00:00, 14.46it/s]



📈 Processed 1,615,000 rows...

🔄 Combining all batches...

📊 Processing Statistics:
Total records processed: 1,615,940
Records with valid salary range: 1,615,940
Unique locations: 216
Date range: 2021-09-15 to 2023-09-15

💾 Saving processed data to /home/ubuntu/job_descriptions_modified.csv
✅ Processing completed at 2024-11-24 10:37:45
