### Import Libraries
In this section, we import the necessary libraries for our data processing task.

In [2]:
from pyspark import SparkContext, SparkConf
import re
import math

### Create Cleaning Pipeline
This function initializes a data cleaning pipeline using Spark. It reads raw text data, groups lines into job postings, and applies text cleaning and field extraction. The pipeline processes the data in several steps:
- **Map to Job Postings**: Groups lines into individual job postings based on start and end markers.
- **Clean Text**: Converts text to lowercase, removes punctuation, and standardizes common terms.
- **Extract Fields**: Uses regular expressions to extract specific fields like job title, description, skills, and salary from each posting.
- **Remove Duplicates**: Ensures each job posting is unique by removing duplicates.
- **Calculate Statistics**: Computes statistics such as the total number of postings and those with missing values.

In [3]:
def create_cleaning_pipeline(sc, input_file):
    """
    Enhanced cleaning pipeline with better preprocessing
    """
    sc.setLogLevel("WARN")
    
    # Read the raw text file
    raw_data = sc.textFile(input_file, minPartitions=100)

    # Step 1: MAP - Group lines into job postings
    def map_to_job_postings(lines):
        current_posting = []
        for line in lines:
            if "<<<START_JOB_POSTING>>>" in line:
                if current_posting:
                    yield "\n".join(current_posting)
                current_posting = []
            elif "<<<END_JOB_POSTING>>>" in line:
                if current_posting:
                    yield "\n".join(current_posting)
                current_posting = []
            else:
                current_posting.append(line)
    
    job_postings = raw_data.mapPartitions(map_to_job_postings)
    
    # Step 2: MAP - Clean text and extract fields
    def clean_text(text):
        if not text:
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        replacements = {
            'yrs': 'years',
            'yr': 'year',
            'exp': 'experience',
            'min': 'minimum',
            'max': 'maximum'
        }
        for old, new in replacements.items():
            text = text.replace(old, new)
        return ' '.join(text.split())
    
    def extract_fields(posting):
        try:
            # Extract all required fields
            title_match = re.search(r'\n(.*?) at (.*?)\nPosted', posting)
            job_title = clean_text(title_match.group(1)) if title_match else None
            
            # Extract description (everything between ABOUT THE ROLE and REQUIREMENTS)
            desc_match = re.search(r'ABOUT THE ROLE\n-+\n(.*?)\n\nREQUIREMENTS', posting, re.DOTALL)
            job_description = clean_text(desc_match.group(1)) if desc_match else None
            
            # Extract skills
            skills_match = re.search(r'SKILLS NEEDED\n-+\n(.*?)\n\n', posting, re.DOTALL)
            job_skills = clean_text(skills_match.group(1)) if skills_match else None
            
            # Extract industry
            industry_match = re.search(r'"Industry":"([^"]+)"', posting)
            job_industry = clean_text(industry_match.group(1)) if industry_match else None
            
            # Extract location
            location_match = re.search(r'Location: (.*?)\n', posting)
            job_location = clean_text(location_match.group(1)) if location_match else None
            
            # Extract experience
            exp_match = re.search(r'Experience: (.*?)\n', posting)
            experience_level = clean_text(exp_match.group(1)) if exp_match else None
            
            # Extract salary range
            salary_match = re.search(r'Salary: (.*?)\n', posting)
            salary_str = salary_match.group(1) if salary_match else ""
            nums = re.findall(r'\d+', salary_str)
            min_salary = int(nums[0]) * 1000 if len(nums) >= 2 else None
            max_salary = int(nums[1]) * 1000 if len(nums) >= 2 else None
            
            # Create tuple with all fields
            result = (
                job_title,
                job_description,
                job_skills,
                job_industry,
                job_location,
                experience_level,
                min_salary,
                max_salary
            )
            
            # Check if any required fields are missing
            if any(x is None for x in result):
                return None
                
            return result
            
        except Exception as e:
            return None
    
    # Extract and clean fields
    cleaned_postings = job_postings.map(extract_fields).filter(lambda x: x is not None)
    
    # Remove duplicates based on all fields
    unique_postings = cleaned_postings.distinct()
    
    # Calculate statistics
    total_posts = unique_postings.count()
    posts_with_missing = cleaned_postings.filter(lambda x: None in x).count()
    
    # Calculate salary statistics
    salary_stats = unique_postings.map(
        lambda x: (x[6], x[7])  # min and max salary
    ).filter(
        lambda x: x[0] is not None and x[1] is not None
    ).mapValues(
        lambda x: (x, 1)
    ).reduceByKey(
        lambda x, y: (x[0] + y[0], x[1] + y[1])
    ).mapValues(
        lambda x: x[0] / x[1]  # average salary
    )
    
    print(f"\nProcessing Summary:")

    print_cleaning_statistics(raw_data, job_postings, cleaned_postings, unique_postings)
    
    return unique_postings.collect(), salary_stats.collect()


In [4]:
# Create a new Spark context
conf = SparkConf().setAppName("JobDataCleaning") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory", "2g") \
    .set("spark.default.parallelism", "100")
sc = SparkContext(conf=conf)

# Process data
input_file = "hdfs:///user/ubuntu/job_descriptions_unstructured.txt"
final_data, stats = create_cleaning_pipeline(sc, input_file)


In [1]:
def print_cleaning_statistics(raw_data, job_postings, cleaned_postings, unique_postings):

    print("\n=== Data Cleaning Statistics ===")
    print(f"Initial raw lines: {raw_data.count():,}")
    print(f"Initial job postings: {job_postings.count():,}")
    
    # Salary statistics
    salary_stats = unique_postings.filter(lambda x: x[6] is not None and x[7] is not None)
    salary_count = salary_stats.count()
    avg_min_salary = salary_stats.map(lambda x: x[6]).mean()
    avg_max_salary = salary_stats.map(lambda x: x[7]).mean()
    print(f"\nSalary Information:")
    print(f"Postings with salary information: {salary_count:,}")
    print(f"Average minimum salary: ${avg_min_salary:,.2f}")
    print(f"Average maximum salary: ${avg_max_salary:,.2f}")
    
    print("\n=== Processing Complete ===")

In [None]:
# Define columns for the CSV header
columns = [
    "job_title",
    "job_description",
    "job_skills",
    "job_industry",
    "job_location",
    "experience_level",
    "min_salary",
    "max_salary"
]

# Save the cleaned data to CSV
def row_to_csv(row):
    # Convert each field to string and handle None values
    csv_fields = [str(field) if field is not None else '' for field in row]
    # Join fields with commas and properly escape any commas within fields
    return ','.join(f'"{field}"' for field in csv_fields)

# Create header
header = ','.join(columns)

### Write Data to CSV
This section handles the export of processed data to a CSV file. It writes the cleaned job data to a specified output path, ensuring that the data is saved in a structured format for further analysis or use.

In [None]:
import os
import math
from datetime import datetime
    
def write_data_to_csv(sc, final_data, header, output_path, batch_size=5000):
    
    print(f"\nStarting CSV export at {datetime.now().strftime('%H:%M:%S')}")
    print(f"Total records to process: {len(final_data):,}")
    
    # Ensure output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Remove existing file if it exists
    if os.path.exists(output_path):
        os.remove(output_path)
    
    # Write header directly to file
    with open(output_path, 'w') as f:
        f.write(header + '\n')
    
    # Process batches
    num_batches = math.ceil(len(final_data) / batch_size)
    print(f"Processing in {num_batches} batches of {batch_size} records each")
    
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(final_data))
        batch_data = final_data[start_idx:end_idx]
        
        print(f"\nProcessing batch {i+1}/{num_batches} ({start_idx:,} to {end_idx:,} records)")
        
        # Convert batch to CSV format
        batch_rdd = sc.parallelize(batch_data).map(row_to_csv)
        
        # Save batch directly to main file
        batch_rows = batch_rdd.collect()
        with open(output_path, 'a') as f:
            f.write('\n'.join(batch_rows) + '\n')
        
        print(f"Batch {i+1} complete - {len(batch_rows):,} records written")
    
    print(f"\nCSV export completed at {datetime.now().strftime('%H:%M:%S')}")
    print(f"Data saved to: {output_path}")
    
    # Verify file
    with open(output_path, 'r') as f:
        total_lines = sum(1 for _ in f)
    print(f"Total lines in output file: {total_lines:,} (including header)")

def write_stats_to_csv(sc, stats, stats_header, output_path):
    
    print(f"\nProcessing salary statistics at {datetime.now().strftime('%H:%M:%S')}")
    
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Remove existing file if it exists
    if os.path.exists(output_path):
        os.remove(output_path)
    
    # Format statistics with proper number formatting
    formatted_stats = [(company, f"{float(avg_salary):,.2f}") for company, avg_salary in stats]
    
    # Calculate some summary statistics
    salaries = [float(avg_salary) for _, avg_salary in formatted_stats]
    if salaries:
        avg_salary = sum(salaries) / len(salaries)
        min_salary = min(salaries)
        max_salary = max(salaries)
        
        print("\nSalary Statistics Summary:")
        print(f"Number of companies: {len(formatted_stats):,}")
        print(f"Average salary across all companies: ${avg_salary:,.2f}")
        print(f"Minimum average salary: ${min_salary:,.2f}")
        print(f"Maximum average salary: ${max_salary:,.2f}")
        
        # Show sample of data
        print("\nSample of company statistics (first 5):")
        for company, avg in formatted_stats[:5]:
            print(f"Company: {company}, Average Salary: ${float(avg):,.2f}")
    
    # Write to file
    with open(output_path, 'w') as f:
        f.write(stats_header + '\n')
        for company, avg in formatted_stats:
            f.write(f"{company},{avg}\n")
    
    print(f"\nStatistics saved to: {output_path}")
    print(f"Total companies in statistics file: {len(formatted_stats):,}")


In [None]:
write_data_to_csv(sc, final_data, header, "cleaned_job_data_using_spark.csv")



Starting CSV export at 08:39:38
Total records to process: 1,610,355
Processing in 323 batches of 5000 records each

Processing batch 1/323 (0 to 5,000 records)


                                                                                

Batch 1 complete - 5,000 records written

Processing batch 2/323 (5,000 to 10,000 records)


                                                                                

Batch 2 complete - 5,000 records written

Processing batch 3/323 (10,000 to 15,000 records)


                                                                                

Batch 3 complete - 5,000 records written

Processing batch 4/323 (15,000 to 20,000 records)


                                                                                

Batch 4 complete - 5,000 records written

Processing batch 5/323 (20,000 to 25,000 records)


                                                                                

Batch 5 complete - 5,000 records written

Processing batch 6/323 (25,000 to 30,000 records)


                                                                                

Batch 6 complete - 5,000 records written

Processing batch 7/323 (30,000 to 35,000 records)


                                                                                

Batch 7 complete - 5,000 records written

Processing batch 8/323 (35,000 to 40,000 records)


                                                                                

Batch 8 complete - 5,000 records written

Processing batch 9/323 (40,000 to 45,000 records)


                                                                                

Batch 9 complete - 5,000 records written

Processing batch 10/323 (45,000 to 50,000 records)


                                                                                

Batch 10 complete - 5,000 records written

Processing batch 11/323 (50,000 to 55,000 records)


                                                                                

Batch 11 complete - 5,000 records written

Processing batch 12/323 (55,000 to 60,000 records)


                                                                                

Batch 12 complete - 5,000 records written

Processing batch 13/323 (60,000 to 65,000 records)


                                                                                

Batch 13 complete - 5,000 records written

Processing batch 14/323 (65,000 to 70,000 records)


                                                                                

Batch 14 complete - 5,000 records written

Processing batch 15/323 (70,000 to 75,000 records)


                                                                                

Batch 15 complete - 5,000 records written

Processing batch 16/323 (75,000 to 80,000 records)


                                                                                

Batch 16 complete - 5,000 records written

Processing batch 17/323 (80,000 to 85,000 records)


                                                                                

Batch 17 complete - 5,000 records written

Processing batch 18/323 (85,000 to 90,000 records)


                                                                                

Batch 18 complete - 5,000 records written

Processing batch 19/323 (90,000 to 95,000 records)


                                                                                

Batch 19 complete - 5,000 records written

Processing batch 20/323 (95,000 to 100,000 records)


                                                                                

Batch 20 complete - 5,000 records written

Processing batch 21/323 (100,000 to 105,000 records)


                                                                                

Batch 21 complete - 5,000 records written

Processing batch 22/323 (105,000 to 110,000 records)


                                                                                

Batch 22 complete - 5,000 records written

Processing batch 23/323 (110,000 to 115,000 records)


                                                                                

Batch 23 complete - 5,000 records written

Processing batch 24/323 (115,000 to 120,000 records)


                                                                                

Batch 24 complete - 5,000 records written

Processing batch 25/323 (120,000 to 125,000 records)


                                                                                

Batch 25 complete - 5,000 records written

Processing batch 26/323 (125,000 to 130,000 records)


                                                                                

Batch 26 complete - 5,000 records written

Processing batch 27/323 (130,000 to 135,000 records)


                                                                                

Batch 27 complete - 5,000 records written

Processing batch 28/323 (135,000 to 140,000 records)


                                                                                

Batch 28 complete - 5,000 records written

Processing batch 29/323 (140,000 to 145,000 records)


                                                                                

Batch 29 complete - 5,000 records written

Processing batch 30/323 (145,000 to 150,000 records)


                                                                                

Batch 30 complete - 5,000 records written

Processing batch 31/323 (150,000 to 155,000 records)


                                                                                

Batch 31 complete - 5,000 records written

Processing batch 32/323 (155,000 to 160,000 records)


                                                                                

Batch 32 complete - 5,000 records written

Processing batch 33/323 (160,000 to 165,000 records)


                                                                                

Batch 33 complete - 5,000 records written

Processing batch 34/323 (165,000 to 170,000 records)


                                                                                

Batch 34 complete - 5,000 records written

Processing batch 35/323 (170,000 to 175,000 records)


                                                                                

Batch 35 complete - 5,000 records written

Processing batch 36/323 (175,000 to 180,000 records)


                                                                                

Batch 36 complete - 5,000 records written

Processing batch 37/323 (180,000 to 185,000 records)


                                                                                

Batch 37 complete - 5,000 records written

Processing batch 38/323 (185,000 to 190,000 records)


                                                                                

Batch 38 complete - 5,000 records written

Processing batch 39/323 (190,000 to 195,000 records)


                                                                                

Batch 39 complete - 5,000 records written

Processing batch 40/323 (195,000 to 200,000 records)


                                                                                

Batch 40 complete - 5,000 records written

Processing batch 41/323 (200,000 to 205,000 records)


                                                                                

Batch 41 complete - 5,000 records written

Processing batch 42/323 (205,000 to 210,000 records)


                                                                                

Batch 42 complete - 5,000 records written

Processing batch 43/323 (210,000 to 215,000 records)


                                                                                

Batch 43 complete - 5,000 records written

Processing batch 44/323 (215,000 to 220,000 records)


                                                                                

Batch 44 complete - 5,000 records written

Processing batch 45/323 (220,000 to 225,000 records)


                                                                                

Batch 45 complete - 5,000 records written

Processing batch 46/323 (225,000 to 230,000 records)


                                                                                

Batch 46 complete - 5,000 records written

Processing batch 47/323 (230,000 to 235,000 records)


                                                                                

Batch 47 complete - 5,000 records written

Processing batch 48/323 (235,000 to 240,000 records)


                                                                                

Batch 48 complete - 5,000 records written

Processing batch 49/323 (240,000 to 245,000 records)


                                                                                

Batch 49 complete - 5,000 records written

Processing batch 50/323 (245,000 to 250,000 records)


                                                                                

Batch 50 complete - 5,000 records written

Processing batch 51/323 (250,000 to 255,000 records)


                                                                                

Batch 51 complete - 5,000 records written

Processing batch 52/323 (255,000 to 260,000 records)


                                                                                

Batch 52 complete - 5,000 records written

Processing batch 53/323 (260,000 to 265,000 records)


                                                                                

Batch 53 complete - 5,000 records written

Processing batch 54/323 (265,000 to 270,000 records)


                                                                                

Batch 54 complete - 5,000 records written

Processing batch 55/323 (270,000 to 275,000 records)


                                                                                

Batch 55 complete - 5,000 records written

Processing batch 56/323 (275,000 to 280,000 records)


                                                                                

Batch 56 complete - 5,000 records written

Processing batch 57/323 (280,000 to 285,000 records)


                                                                                

Batch 57 complete - 5,000 records written

Processing batch 58/323 (285,000 to 290,000 records)


                                                                                

Batch 58 complete - 5,000 records written

Processing batch 59/323 (290,000 to 295,000 records)


                                                                                

Batch 59 complete - 5,000 records written

Processing batch 60/323 (295,000 to 300,000 records)


                                                                                

Batch 60 complete - 5,000 records written

Processing batch 61/323 (300,000 to 305,000 records)


                                                                                

Batch 61 complete - 5,000 records written

Processing batch 62/323 (305,000 to 310,000 records)


                                                                                

Batch 62 complete - 5,000 records written

Processing batch 63/323 (310,000 to 315,000 records)


                                                                                

Batch 63 complete - 5,000 records written

Processing batch 64/323 (315,000 to 320,000 records)


                                                                                

Batch 64 complete - 5,000 records written

Processing batch 65/323 (320,000 to 325,000 records)


                                                                                

Batch 65 complete - 5,000 records written

Processing batch 66/323 (325,000 to 330,000 records)


                                                                                

Batch 66 complete - 5,000 records written

Processing batch 67/323 (330,000 to 335,000 records)


                                                                                

Batch 67 complete - 5,000 records written

Processing batch 68/323 (335,000 to 340,000 records)


                                                                                

Batch 68 complete - 5,000 records written

Processing batch 69/323 (340,000 to 345,000 records)


                                                                                

Batch 69 complete - 5,000 records written

Processing batch 70/323 (345,000 to 350,000 records)


                                                                                

Batch 70 complete - 5,000 records written

Processing batch 71/323 (350,000 to 355,000 records)


                                                                                

Batch 71 complete - 5,000 records written

Processing batch 72/323 (355,000 to 360,000 records)


                                                                                

Batch 72 complete - 5,000 records written

Processing batch 73/323 (360,000 to 365,000 records)


                                                                                

Batch 73 complete - 5,000 records written

Processing batch 74/323 (365,000 to 370,000 records)


                                                                                

Batch 74 complete - 5,000 records written

Processing batch 75/323 (370,000 to 375,000 records)


                                                                                

Batch 75 complete - 5,000 records written

Processing batch 76/323 (375,000 to 380,000 records)


                                                                                

Batch 76 complete - 5,000 records written

Processing batch 77/323 (380,000 to 385,000 records)


                                                                                

Batch 77 complete - 5,000 records written

Processing batch 78/323 (385,000 to 390,000 records)


                                                                                

Batch 78 complete - 5,000 records written

Processing batch 79/323 (390,000 to 395,000 records)


                                                                                

Batch 79 complete - 5,000 records written

Processing batch 80/323 (395,000 to 400,000 records)


                                                                                

Batch 80 complete - 5,000 records written

Processing batch 81/323 (400,000 to 405,000 records)


                                                                                

Batch 81 complete - 5,000 records written

Processing batch 82/323 (405,000 to 410,000 records)


                                                                                

Batch 82 complete - 5,000 records written

Processing batch 83/323 (410,000 to 415,000 records)


                                                                                

Batch 83 complete - 5,000 records written

Processing batch 84/323 (415,000 to 420,000 records)


                                                                                

Batch 84 complete - 5,000 records written

Processing batch 85/323 (420,000 to 425,000 records)


                                                                                

Batch 85 complete - 5,000 records written

Processing batch 86/323 (425,000 to 430,000 records)


                                                                                

Batch 86 complete - 5,000 records written

Processing batch 87/323 (430,000 to 435,000 records)


                                                                                

Batch 87 complete - 5,000 records written

Processing batch 88/323 (435,000 to 440,000 records)


                                                                                

Batch 88 complete - 5,000 records written

Processing batch 89/323 (440,000 to 445,000 records)


                                                                                

Batch 89 complete - 5,000 records written

Processing batch 90/323 (445,000 to 450,000 records)


                                                                                

Batch 90 complete - 5,000 records written

Processing batch 91/323 (450,000 to 455,000 records)


                                                                                

Batch 91 complete - 5,000 records written

Processing batch 92/323 (455,000 to 460,000 records)


                                                                                

Batch 92 complete - 5,000 records written

Processing batch 93/323 (460,000 to 465,000 records)


                                                                                

Batch 93 complete - 5,000 records written

Processing batch 94/323 (465,000 to 470,000 records)


                                                                                

Batch 94 complete - 5,000 records written

Processing batch 95/323 (470,000 to 475,000 records)


                                                                                

Batch 95 complete - 5,000 records written

Processing batch 96/323 (475,000 to 480,000 records)


                                                                                

Batch 96 complete - 5,000 records written

Processing batch 97/323 (480,000 to 485,000 records)


                                                                                

Batch 97 complete - 5,000 records written

Processing batch 98/323 (485,000 to 490,000 records)


                                                                                

Batch 98 complete - 5,000 records written

Processing batch 99/323 (490,000 to 495,000 records)


                                                                                

Batch 99 complete - 5,000 records written

Processing batch 100/323 (495,000 to 500,000 records)


                                                                                

Batch 100 complete - 5,000 records written

Processing batch 101/323 (500,000 to 505,000 records)


                                                                                

Batch 101 complete - 5,000 records written

Processing batch 102/323 (505,000 to 510,000 records)


                                                                                

Batch 102 complete - 5,000 records written

Processing batch 103/323 (510,000 to 515,000 records)


                                                                                

Batch 103 complete - 5,000 records written

Processing batch 104/323 (515,000 to 520,000 records)


                                                                                

Batch 104 complete - 5,000 records written

Processing batch 105/323 (520,000 to 525,000 records)


                                                                                

Batch 105 complete - 5,000 records written

Processing batch 106/323 (525,000 to 530,000 records)


                                                                                

Batch 106 complete - 5,000 records written

Processing batch 107/323 (530,000 to 535,000 records)


                                                                                

Batch 107 complete - 5,000 records written

Processing batch 108/323 (535,000 to 540,000 records)


                                                                                

Batch 108 complete - 5,000 records written

Processing batch 109/323 (540,000 to 545,000 records)


                                                                                

Batch 109 complete - 5,000 records written

Processing batch 110/323 (545,000 to 550,000 records)


                                                                                

Batch 110 complete - 5,000 records written

Processing batch 111/323 (550,000 to 555,000 records)


                                                                                

Batch 111 complete - 5,000 records written

Processing batch 112/323 (555,000 to 560,000 records)


                                                                                

Batch 112 complete - 5,000 records written

Processing batch 113/323 (560,000 to 565,000 records)


                                                                                

Batch 113 complete - 5,000 records written

Processing batch 114/323 (565,000 to 570,000 records)


                                                                                

Batch 114 complete - 5,000 records written

Processing batch 115/323 (570,000 to 575,000 records)


                                                                                

Batch 115 complete - 5,000 records written

Processing batch 116/323 (575,000 to 580,000 records)


                                                                                

Batch 116 complete - 5,000 records written

Processing batch 117/323 (580,000 to 585,000 records)


                                                                                

Batch 117 complete - 5,000 records written

Processing batch 118/323 (585,000 to 590,000 records)


                                                                                

Batch 118 complete - 5,000 records written

Processing batch 119/323 (590,000 to 595,000 records)


                                                                                

Batch 119 complete - 5,000 records written

Processing batch 120/323 (595,000 to 600,000 records)


                                                                                

Batch 120 complete - 5,000 records written

Processing batch 121/323 (600,000 to 605,000 records)


                                                                                

Batch 121 complete - 5,000 records written

Processing batch 122/323 (605,000 to 610,000 records)


                                                                                

Batch 122 complete - 5,000 records written

Processing batch 123/323 (610,000 to 615,000 records)


                                                                                

Batch 123 complete - 5,000 records written

Processing batch 124/323 (615,000 to 620,000 records)


                                                                                

Batch 124 complete - 5,000 records written

Processing batch 125/323 (620,000 to 625,000 records)


                                                                                

Batch 125 complete - 5,000 records written

Processing batch 126/323 (625,000 to 630,000 records)


                                                                                

Batch 126 complete - 5,000 records written

Processing batch 127/323 (630,000 to 635,000 records)


                                                                                

Batch 127 complete - 5,000 records written

Processing batch 128/323 (635,000 to 640,000 records)


                                                                                

Batch 128 complete - 5,000 records written

Processing batch 129/323 (640,000 to 645,000 records)


                                                                                

Batch 129 complete - 5,000 records written

Processing batch 130/323 (645,000 to 650,000 records)


                                                                                

Batch 130 complete - 5,000 records written

Processing batch 131/323 (650,000 to 655,000 records)


                                                                                

Batch 131 complete - 5,000 records written

Processing batch 132/323 (655,000 to 660,000 records)


                                                                                

Batch 132 complete - 5,000 records written

Processing batch 133/323 (660,000 to 665,000 records)


                                                                                

Batch 133 complete - 5,000 records written

Processing batch 134/323 (665,000 to 670,000 records)


                                                                                

Batch 134 complete - 5,000 records written

Processing batch 135/323 (670,000 to 675,000 records)


                                                                                

Batch 135 complete - 5,000 records written

Processing batch 136/323 (675,000 to 680,000 records)


                                                                                

Batch 136 complete - 5,000 records written

Processing batch 137/323 (680,000 to 685,000 records)


                                                                                

Batch 137 complete - 5,000 records written

Processing batch 138/323 (685,000 to 690,000 records)


                                                                                

Batch 138 complete - 5,000 records written

Processing batch 139/323 (690,000 to 695,000 records)


                                                                                

Batch 139 complete - 5,000 records written

Processing batch 140/323 (695,000 to 700,000 records)


                                                                                

Batch 140 complete - 5,000 records written

Processing batch 141/323 (700,000 to 705,000 records)


                                                                                

Batch 141 complete - 5,000 records written

Processing batch 142/323 (705,000 to 710,000 records)


                                                                                

Batch 142 complete - 5,000 records written

Processing batch 143/323 (710,000 to 715,000 records)


                                                                                

Batch 143 complete - 5,000 records written

Processing batch 144/323 (715,000 to 720,000 records)


                                                                                

Batch 144 complete - 5,000 records written

Processing batch 145/323 (720,000 to 725,000 records)


                                                                                

Batch 145 complete - 5,000 records written

Processing batch 146/323 (725,000 to 730,000 records)


                                                                                

Batch 146 complete - 5,000 records written

Processing batch 147/323 (730,000 to 735,000 records)


                                                                                

Batch 147 complete - 5,000 records written

Processing batch 148/323 (735,000 to 740,000 records)


                                                                                

Batch 148 complete - 5,000 records written

Processing batch 149/323 (740,000 to 745,000 records)


                                                                                

Batch 149 complete - 5,000 records written

Processing batch 150/323 (745,000 to 750,000 records)


                                                                                

Batch 150 complete - 5,000 records written

Processing batch 151/323 (750,000 to 755,000 records)


                                                                                

Batch 151 complete - 5,000 records written

Processing batch 152/323 (755,000 to 760,000 records)


                                                                                

Batch 152 complete - 5,000 records written

Processing batch 153/323 (760,000 to 765,000 records)


                                                                                

Batch 153 complete - 5,000 records written

Processing batch 154/323 (765,000 to 770,000 records)


                                                                                

Batch 154 complete - 5,000 records written

Processing batch 155/323 (770,000 to 775,000 records)


                                                                                

Batch 155 complete - 5,000 records written

Processing batch 156/323 (775,000 to 780,000 records)


                                                                                

Batch 156 complete - 5,000 records written

Processing batch 157/323 (780,000 to 785,000 records)


                                                                                

Batch 157 complete - 5,000 records written

Processing batch 158/323 (785,000 to 790,000 records)


                                                                                

Batch 158 complete - 5,000 records written

Processing batch 159/323 (790,000 to 795,000 records)


                                                                                

Batch 159 complete - 5,000 records written

Processing batch 160/323 (795,000 to 800,000 records)


                                                                                

Batch 160 complete - 5,000 records written

Processing batch 161/323 (800,000 to 805,000 records)


                                                                                

Batch 161 complete - 5,000 records written

Processing batch 162/323 (805,000 to 810,000 records)


                                                                                

Batch 162 complete - 5,000 records written

Processing batch 163/323 (810,000 to 815,000 records)


                                                                                

Batch 163 complete - 5,000 records written

Processing batch 164/323 (815,000 to 820,000 records)


                                                                                

Batch 164 complete - 5,000 records written

Processing batch 165/323 (820,000 to 825,000 records)


                                                                                

Batch 165 complete - 5,000 records written

Processing batch 166/323 (825,000 to 830,000 records)


                                                                                

Batch 166 complete - 5,000 records written

Processing batch 167/323 (830,000 to 835,000 records)


                                                                                

Batch 167 complete - 5,000 records written

Processing batch 168/323 (835,000 to 840,000 records)


                                                                                

Batch 168 complete - 5,000 records written

Processing batch 169/323 (840,000 to 845,000 records)


                                                                                

Batch 169 complete - 5,000 records written

Processing batch 170/323 (845,000 to 850,000 records)


                                                                                

Batch 170 complete - 5,000 records written

Processing batch 171/323 (850,000 to 855,000 records)


                                                                                

Batch 171 complete - 5,000 records written

Processing batch 172/323 (855,000 to 860,000 records)


                                                                                

Batch 172 complete - 5,000 records written

Processing batch 173/323 (860,000 to 865,000 records)


                                                                                

Batch 173 complete - 5,000 records written

Processing batch 174/323 (865,000 to 870,000 records)


                                                                                

Batch 174 complete - 5,000 records written

Processing batch 175/323 (870,000 to 875,000 records)


                                                                                

Batch 175 complete - 5,000 records written

Processing batch 176/323 (875,000 to 880,000 records)


                                                                                

Batch 176 complete - 5,000 records written

Processing batch 177/323 (880,000 to 885,000 records)


                                                                                

Batch 177 complete - 5,000 records written

Processing batch 178/323 (885,000 to 890,000 records)


                                                                                

Batch 178 complete - 5,000 records written

Processing batch 179/323 (890,000 to 895,000 records)


                                                                                

Batch 179 complete - 5,000 records written

Processing batch 180/323 (895,000 to 900,000 records)


                                                                                

Batch 180 complete - 5,000 records written

Processing batch 181/323 (900,000 to 905,000 records)


                                                                                

Batch 181 complete - 5,000 records written

Processing batch 182/323 (905,000 to 910,000 records)


                                                                                

Batch 182 complete - 5,000 records written

Processing batch 183/323 (910,000 to 915,000 records)


                                                                                

Batch 183 complete - 5,000 records written

Processing batch 184/323 (915,000 to 920,000 records)


                                                                                

Batch 184 complete - 5,000 records written

Processing batch 185/323 (920,000 to 925,000 records)


                                                                                

Batch 185 complete - 5,000 records written

Processing batch 186/323 (925,000 to 930,000 records)


                                                                                

Batch 186 complete - 5,000 records written

Processing batch 187/323 (930,000 to 935,000 records)


                                                                                

Batch 187 complete - 5,000 records written

Processing batch 188/323 (935,000 to 940,000 records)


                                                                                

Batch 188 complete - 5,000 records written

Processing batch 189/323 (940,000 to 945,000 records)


                                                                                

Batch 189 complete - 5,000 records written

Processing batch 190/323 (945,000 to 950,000 records)


                                                                                

Batch 190 complete - 5,000 records written

Processing batch 191/323 (950,000 to 955,000 records)


                                                                                

Batch 191 complete - 5,000 records written

Processing batch 192/323 (955,000 to 960,000 records)


                                                                                

Batch 192 complete - 5,000 records written

Processing batch 193/323 (960,000 to 965,000 records)


                                                                                

Batch 193 complete - 5,000 records written

Processing batch 194/323 (965,000 to 970,000 records)


                                                                                

Batch 194 complete - 5,000 records written

Processing batch 195/323 (970,000 to 975,000 records)


                                                                                

Batch 195 complete - 5,000 records written

Processing batch 196/323 (975,000 to 980,000 records)


                                                                                

Batch 196 complete - 5,000 records written

Processing batch 197/323 (980,000 to 985,000 records)


                                                                                

Batch 197 complete - 5,000 records written

Processing batch 198/323 (985,000 to 990,000 records)


                                                                                

Batch 198 complete - 5,000 records written

Processing batch 199/323 (990,000 to 995,000 records)


                                                                                

Batch 199 complete - 5,000 records written

Processing batch 200/323 (995,000 to 1,000,000 records)


                                                                                

Batch 200 complete - 5,000 records written

Processing batch 201/323 (1,000,000 to 1,005,000 records)


                                                                                

Batch 201 complete - 5,000 records written

Processing batch 202/323 (1,005,000 to 1,010,000 records)


                                                                                

Batch 202 complete - 5,000 records written

Processing batch 203/323 (1,010,000 to 1,015,000 records)


                                                                                

Batch 203 complete - 5,000 records written

Processing batch 204/323 (1,015,000 to 1,020,000 records)


                                                                                

Batch 204 complete - 5,000 records written

Processing batch 205/323 (1,020,000 to 1,025,000 records)


                                                                                

Batch 205 complete - 5,000 records written

Processing batch 206/323 (1,025,000 to 1,030,000 records)


                                                                                

Batch 206 complete - 5,000 records written

Processing batch 207/323 (1,030,000 to 1,035,000 records)


                                                                                

Batch 207 complete - 5,000 records written

Processing batch 208/323 (1,035,000 to 1,040,000 records)


                                                                                

Batch 208 complete - 5,000 records written

Processing batch 209/323 (1,040,000 to 1,045,000 records)


                                                                                

Batch 209 complete - 5,000 records written

Processing batch 210/323 (1,045,000 to 1,050,000 records)


                                                                                

Batch 210 complete - 5,000 records written

Processing batch 211/323 (1,050,000 to 1,055,000 records)


                                                                                

Batch 211 complete - 5,000 records written

Processing batch 212/323 (1,055,000 to 1,060,000 records)


                                                                                

Batch 212 complete - 5,000 records written

Processing batch 213/323 (1,060,000 to 1,065,000 records)


                                                                                

Batch 213 complete - 5,000 records written

Processing batch 214/323 (1,065,000 to 1,070,000 records)


                                                                                

Batch 214 complete - 5,000 records written

Processing batch 215/323 (1,070,000 to 1,075,000 records)


                                                                                

Batch 215 complete - 5,000 records written

Processing batch 216/323 (1,075,000 to 1,080,000 records)


                                                                                

Batch 216 complete - 5,000 records written

Processing batch 217/323 (1,080,000 to 1,085,000 records)


                                                                                

Batch 217 complete - 5,000 records written

Processing batch 218/323 (1,085,000 to 1,090,000 records)


                                                                                

Batch 218 complete - 5,000 records written

Processing batch 219/323 (1,090,000 to 1,095,000 records)


                                                                                

Batch 219 complete - 5,000 records written

Processing batch 220/323 (1,095,000 to 1,100,000 records)


                                                                                

Batch 220 complete - 5,000 records written

Processing batch 221/323 (1,100,000 to 1,105,000 records)


                                                                                

Batch 221 complete - 5,000 records written

Processing batch 222/323 (1,105,000 to 1,110,000 records)


                                                                                

Batch 222 complete - 5,000 records written

Processing batch 223/323 (1,110,000 to 1,115,000 records)


                                                                                

Batch 223 complete - 5,000 records written

Processing batch 224/323 (1,115,000 to 1,120,000 records)


                                                                                

Batch 224 complete - 5,000 records written

Processing batch 225/323 (1,120,000 to 1,125,000 records)


                                                                                

Batch 225 complete - 5,000 records written

Processing batch 226/323 (1,125,000 to 1,130,000 records)


                                                                                

Batch 226 complete - 5,000 records written

Processing batch 227/323 (1,130,000 to 1,135,000 records)


                                                                                

Batch 227 complete - 5,000 records written

Processing batch 228/323 (1,135,000 to 1,140,000 records)


                                                                                

Batch 228 complete - 5,000 records written

Processing batch 229/323 (1,140,000 to 1,145,000 records)


                                                                                

Batch 229 complete - 5,000 records written

Processing batch 230/323 (1,145,000 to 1,150,000 records)


                                                                                

Batch 230 complete - 5,000 records written

Processing batch 231/323 (1,150,000 to 1,155,000 records)


                                                                                

Batch 231 complete - 5,000 records written

Processing batch 232/323 (1,155,000 to 1,160,000 records)


                                                                                

Batch 232 complete - 5,000 records written

Processing batch 233/323 (1,160,000 to 1,165,000 records)


                                                                                

Batch 233 complete - 5,000 records written

Processing batch 234/323 (1,165,000 to 1,170,000 records)


                                                                                

Batch 234 complete - 5,000 records written

Processing batch 235/323 (1,170,000 to 1,175,000 records)


                                                                                

Batch 235 complete - 5,000 records written

Processing batch 236/323 (1,175,000 to 1,180,000 records)


                                                                                

Batch 236 complete - 5,000 records written

Processing batch 237/323 (1,180,000 to 1,185,000 records)


                                                                                

Batch 237 complete - 5,000 records written

Processing batch 238/323 (1,185,000 to 1,190,000 records)


                                                                                

Batch 238 complete - 5,000 records written

Processing batch 239/323 (1,190,000 to 1,195,000 records)


                                                                                

Batch 239 complete - 5,000 records written

Processing batch 240/323 (1,195,000 to 1,200,000 records)


                                                                                

Batch 240 complete - 5,000 records written

Processing batch 241/323 (1,200,000 to 1,205,000 records)


                                                                                

Batch 241 complete - 5,000 records written

Processing batch 242/323 (1,205,000 to 1,210,000 records)


                                                                                

Batch 242 complete - 5,000 records written

Processing batch 243/323 (1,210,000 to 1,215,000 records)


                                                                                

Batch 243 complete - 5,000 records written

Processing batch 244/323 (1,215,000 to 1,220,000 records)


                                                                                

Batch 244 complete - 5,000 records written

Processing batch 245/323 (1,220,000 to 1,225,000 records)


                                                                                

Batch 245 complete - 5,000 records written

Processing batch 246/323 (1,225,000 to 1,230,000 records)


                                                                                

Batch 246 complete - 5,000 records written

Processing batch 247/323 (1,230,000 to 1,235,000 records)


                                                                                

Batch 247 complete - 5,000 records written

Processing batch 248/323 (1,235,000 to 1,240,000 records)


                                                                                

Batch 248 complete - 5,000 records written

Processing batch 249/323 (1,240,000 to 1,245,000 records)


                                                                                

Batch 249 complete - 5,000 records written

Processing batch 250/323 (1,245,000 to 1,250,000 records)


                                                                                

Batch 250 complete - 5,000 records written

Processing batch 251/323 (1,250,000 to 1,255,000 records)


                                                                                

Batch 251 complete - 5,000 records written

Processing batch 252/323 (1,255,000 to 1,260,000 records)


                                                                                

Batch 252 complete - 5,000 records written

Processing batch 253/323 (1,260,000 to 1,265,000 records)


                                                                                

Batch 253 complete - 5,000 records written

Processing batch 254/323 (1,265,000 to 1,270,000 records)


                                                                                

Batch 254 complete - 5,000 records written

Processing batch 255/323 (1,270,000 to 1,275,000 records)


                                                                                

Batch 255 complete - 5,000 records written

Processing batch 256/323 (1,275,000 to 1,280,000 records)


                                                                                

Batch 256 complete - 5,000 records written

Processing batch 257/323 (1,280,000 to 1,285,000 records)


                                                                                

Batch 257 complete - 5,000 records written

Processing batch 258/323 (1,285,000 to 1,290,000 records)


                                                                                

Batch 258 complete - 5,000 records written

Processing batch 259/323 (1,290,000 to 1,295,000 records)


                                                                                

Batch 259 complete - 5,000 records written

Processing batch 260/323 (1,295,000 to 1,300,000 records)


                                                                                

Batch 260 complete - 5,000 records written

Processing batch 261/323 (1,300,000 to 1,305,000 records)


                                                                                

Batch 261 complete - 5,000 records written

Processing batch 262/323 (1,305,000 to 1,310,000 records)


                                                                                

Batch 262 complete - 5,000 records written

Processing batch 263/323 (1,310,000 to 1,315,000 records)


                                                                                

Batch 263 complete - 5,000 records written

Processing batch 264/323 (1,315,000 to 1,320,000 records)


                                                                                

Batch 264 complete - 5,000 records written

Processing batch 265/323 (1,320,000 to 1,325,000 records)


                                                                                

Batch 265 complete - 5,000 records written

Processing batch 266/323 (1,325,000 to 1,330,000 records)


                                                                                

Batch 266 complete - 5,000 records written

Processing batch 267/323 (1,330,000 to 1,335,000 records)


                                                                                

Batch 267 complete - 5,000 records written

Processing batch 268/323 (1,335,000 to 1,340,000 records)


                                                                                

Batch 268 complete - 5,000 records written

Processing batch 269/323 (1,340,000 to 1,345,000 records)


                                                                                

Batch 269 complete - 5,000 records written

Processing batch 270/323 (1,345,000 to 1,350,000 records)


                                                                                

Batch 270 complete - 5,000 records written

Processing batch 271/323 (1,350,000 to 1,355,000 records)


                                                                                

Batch 271 complete - 5,000 records written

Processing batch 272/323 (1,355,000 to 1,360,000 records)


                                                                                

Batch 272 complete - 5,000 records written

Processing batch 273/323 (1,360,000 to 1,365,000 records)


                                                                                

Batch 273 complete - 5,000 records written

Processing batch 274/323 (1,365,000 to 1,370,000 records)


                                                                                

Batch 274 complete - 5,000 records written

Processing batch 275/323 (1,370,000 to 1,375,000 records)


                                                                                

Batch 275 complete - 5,000 records written

Processing batch 276/323 (1,375,000 to 1,380,000 records)


                                                                                

Batch 276 complete - 5,000 records written

Processing batch 277/323 (1,380,000 to 1,385,000 records)


                                                                                

Batch 277 complete - 5,000 records written

Processing batch 278/323 (1,385,000 to 1,390,000 records)


                                                                                

Batch 278 complete - 5,000 records written

Processing batch 279/323 (1,390,000 to 1,395,000 records)


                                                                                

Batch 279 complete - 5,000 records written

Processing batch 280/323 (1,395,000 to 1,400,000 records)


                                                                                

Batch 280 complete - 5,000 records written

Processing batch 281/323 (1,400,000 to 1,405,000 records)


                                                                                

Batch 281 complete - 5,000 records written

Processing batch 282/323 (1,405,000 to 1,410,000 records)


                                                                                

Batch 282 complete - 5,000 records written

Processing batch 283/323 (1,410,000 to 1,415,000 records)


                                                                                

Batch 283 complete - 5,000 records written

Processing batch 284/323 (1,415,000 to 1,420,000 records)


                                                                                

Batch 284 complete - 5,000 records written

Processing batch 285/323 (1,420,000 to 1,425,000 records)


                                                                                

Batch 285 complete - 5,000 records written

Processing batch 286/323 (1,425,000 to 1,430,000 records)


                                                                                

Batch 286 complete - 5,000 records written

Processing batch 287/323 (1,430,000 to 1,435,000 records)


                                                                                

Batch 287 complete - 5,000 records written

Processing batch 288/323 (1,435,000 to 1,440,000 records)


                                                                                

Batch 288 complete - 5,000 records written

Processing batch 289/323 (1,440,000 to 1,445,000 records)


                                                                                

Batch 289 complete - 5,000 records written

Processing batch 290/323 (1,445,000 to 1,450,000 records)


                                                                                

Batch 290 complete - 5,000 records written

Processing batch 291/323 (1,450,000 to 1,455,000 records)


                                                                                

Batch 291 complete - 5,000 records written

Processing batch 292/323 (1,455,000 to 1,460,000 records)


                                                                                

Batch 292 complete - 5,000 records written

Processing batch 293/323 (1,460,000 to 1,465,000 records)


                                                                                

Batch 293 complete - 5,000 records written

Processing batch 294/323 (1,465,000 to 1,470,000 records)


                                                                                

Batch 294 complete - 5,000 records written

Processing batch 295/323 (1,470,000 to 1,475,000 records)


                                                                                

Batch 295 complete - 5,000 records written

Processing batch 296/323 (1,475,000 to 1,480,000 records)


                                                                                

Batch 296 complete - 5,000 records written

Processing batch 297/323 (1,480,000 to 1,485,000 records)


                                                                                

Batch 297 complete - 5,000 records written

Processing batch 298/323 (1,485,000 to 1,490,000 records)


                                                                                

Batch 298 complete - 5,000 records written

Processing batch 299/323 (1,490,000 to 1,495,000 records)


                                                                                

Batch 299 complete - 5,000 records written

Processing batch 300/323 (1,495,000 to 1,500,000 records)


                                                                                

Batch 300 complete - 5,000 records written

Processing batch 301/323 (1,500,000 to 1,505,000 records)


                                                                                

Batch 301 complete - 5,000 records written

Processing batch 302/323 (1,505,000 to 1,510,000 records)


                                                                                

Batch 302 complete - 5,000 records written

Processing batch 303/323 (1,510,000 to 1,515,000 records)


                                                                                

Batch 303 complete - 5,000 records written

Processing batch 304/323 (1,515,000 to 1,520,000 records)


                                                                                

Batch 304 complete - 5,000 records written

Processing batch 305/323 (1,520,000 to 1,525,000 records)


                                                                                

Batch 305 complete - 5,000 records written

Processing batch 306/323 (1,525,000 to 1,530,000 records)


                                                                                

Batch 306 complete - 5,000 records written

Processing batch 307/323 (1,530,000 to 1,535,000 records)


                                                                                

Batch 307 complete - 5,000 records written

Processing batch 308/323 (1,535,000 to 1,540,000 records)


                                                                                

Batch 308 complete - 5,000 records written

Processing batch 309/323 (1,540,000 to 1,545,000 records)


                                                                                

Batch 309 complete - 5,000 records written

Processing batch 310/323 (1,545,000 to 1,550,000 records)


                                                                                

Batch 310 complete - 5,000 records written

Processing batch 311/323 (1,550,000 to 1,555,000 records)


                                                                                

Batch 311 complete - 5,000 records written

Processing batch 312/323 (1,555,000 to 1,560,000 records)


                                                                                

Batch 312 complete - 5,000 records written

Processing batch 313/323 (1,560,000 to 1,565,000 records)


                                                                                

Batch 313 complete - 5,000 records written

Processing batch 314/323 (1,565,000 to 1,570,000 records)


                                                                                

Batch 314 complete - 5,000 records written

Processing batch 315/323 (1,570,000 to 1,575,000 records)


                                                                                

Batch 315 complete - 5,000 records written

Processing batch 316/323 (1,575,000 to 1,580,000 records)


                                                                                

Batch 316 complete - 5,000 records written

Processing batch 317/323 (1,580,000 to 1,585,000 records)


                                                                                

Batch 317 complete - 5,000 records written

Processing batch 318/323 (1,585,000 to 1,590,000 records)


                                                                                

Batch 318 complete - 5,000 records written

Processing batch 319/323 (1,590,000 to 1,595,000 records)


                                                                                

Batch 319 complete - 5,000 records written

Processing batch 320/323 (1,595,000 to 1,600,000 records)


                                                                                

Batch 320 complete - 5,000 records written

Processing batch 321/323 (1,600,000 to 1,605,000 records)


                                                                                

Batch 321 complete - 5,000 records written

Processing batch 322/323 (1,605,000 to 1,610,000 records)


                                                                                

Batch 322 complete - 5,000 records written

Processing batch 323/323 (1,610,000 to 1,610,355 records)


                                                                                

Batch 323 complete - 355 records written

CSV export completed at 08:52:57
Data saved to: cleaned_job_data_using_spark.csv
Total lines in output file: 1,610,356 (including header)


NameError: name 'stats_header' is not defined

In [None]:
""" # Save statistics to a separate CSV
stats_header = "company,avg_salary"
stats_csv_rdd = sc.parallelize([stats_header]) \
    .union(sc.parallelize(stats).map(lambda x: f"{x[0]},{x[1]}"))
stats_csv_rdd.coalesce(1).saveAsTextFile("salary_stats.csv")

print(f"\nResults saved to:")
print(f"- Job data: {output_path}")
print(f"- Salary statistics: salary_stats.csv") """

: 

In [None]:
+

SyntaxError: invalid syntax (15193383.py, line 1)