In [1]:
# imports
import os
import sys
import time

# pyspark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, explode, split, trim, lit, udf, broadcast, 
    when, lower, regexp_extract
)
from pyspark.sql.types import StringType

print('Imports loaded')

Imports loaded


In [2]:
# setup paths
cwd = os.getcwd()

# detect if we're in notebooks/ or project root
if 'notebooks' in cwd:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(cwd))  # TWO levels up
else:
    PROJECT_ROOT = cwd

RAW_DATA_DIR = os.path.join(PROJECT_ROOT, 'ingest_job_postings', 'raw_data')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'ingest_job_postings', 'output')
RAW_JOBS_DIR = os.path.join(OUTPUT_DIR, 'raw_job_postings')

print(f'Project root: {PROJECT_ROOT}')
print(f'Raw data: {RAW_DATA_DIR}')
print(f'Output: {OUTPUT_DIR}')

# check data size
import subprocess
try:
    result = subprocess.run(
        ['du', '-sh', RAW_JOBS_DIR], 
        capture_output=True, 
        text=True
    )
    data_size = result.stdout.split()[0]
    print(f'\nData size: {data_size}')
except:
    print('\nCould not determine data size')

Project root: /home/developer/project
Raw data: /home/developer/project/raw_data/job_postings
Output: /home/developer/project/output

Data size: 149M


In [3]:
# create spark session
print('Creating Spark session')

spark = SparkSession.builder \
    .appName('JobPostingPipeline') \
    .config('spark.driver.memory', '8g') \
    .config('spark.sql.shuffle.partitions', '16') \
    .config('spark.sql.adaptive.enabled', 'true') \
    .getOrCreate()

spark.sparkContext.setLogLevel('WARN')

print(f'Spark version: {spark.version}')
print('Spark session created')

Creating Spark session


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/27 01:25:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 4.1.1
Spark session created


In [4]:
# load job postings from parquet
print(f'Loading data from {RAW_JOBS_DIR}')

jobs_df = spark.read.parquet(RAW_JOBS_DIR)
total_count = jobs_df.count()

print(f'\nLoaded {total_count:,} job postings')
print('\nRecords by source:')
jobs_df.groupBy('source').count().show()

Loading data from /home/developer/project/output/raw_job_postings

Loaded 1,348,711 job postings

Records by source:
+---------+-------+
|   source|  count|
+---------+-------+
|   indeed|    100|
|glassdoor|    157|
| linkedin|1348454|
+---------+-------+



In [5]:
# load ground-truth skills
skills_path = os.path.join(RAW_DATA_DIR, 'job_skills.csv')
print(f'Loading ground-truth skills from {skills_path}')

skills_df = spark.read.csv(skills_path, header=True, inferSchema=True)
skills_count = skills_df.count()

print(f'\nLoaded {skills_count:,} skill records')
print('\nSample:')
skills_df.show(3, truncate=80)

Loading ground-truth skills from /home/developer/project/raw_data/job_postings/job_skills.csv


                                                                                


Loaded 1,296,381 skill records

Sample:
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|                                                                        job_link|                                                                      job_skills|
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|https://www.linkedin.com/jobs/view/housekeeper-i-pt-at-jacksonville-state-uni...|Building Custodial Services, Cleaning, Janitorial Services, Materials Handlin...|
|https://www.linkedin.com/jobs/view/assistant-general-manager-huntington-4131-...|Customer service, Restaurant management, Food safety, Training, Supervision, ...|
|https://www.linkedin.com/jobs/view/school-based-behavior-analyst-at-ccres-edu...|Applied Behavior Analysis (ABA), Data analysis, Behaviora

In [6]:
# separate linkedin from other sources
linkedin_jobs = jobs_df.filter(col('source') == 'linkedin')
other_jobs = jobs_df.filter(col('source') != 'linkedin')

print(f'LinkedIn jobs: {linkedin_jobs.count():,}')
print(f'Other jobs (Indeed/Glassdoor): {other_jobs.count():,}')

LinkedIn jobs: 1,348,454
Other jobs (Indeed/Glassdoor): 257


In [7]:
# join linkedin with skills
print('Joining LinkedIn jobs with skills')
start_time = time.time()

linkedin_with_skills = linkedin_jobs.join(
    broadcast(skills_df),
    linkedin_jobs['job_link'] == skills_df['job_link'],
    'left'
).drop(skills_df['job_link']).withColumnRenamed('job_skills', 'skills')

elapsed = time.time() - start_time
print(f'Join completed in {elapsed:.1f}s')

with_skills = linkedin_with_skills.filter(col('skills').isNotNull()).count()
linkedin_count = linkedin_with_skills.count()
pct = 100 * with_skills / linkedin_count if linkedin_count > 0 else 0
print(f'LinkedIn jobs with skills: {with_skills:,} / {linkedin_count:,} ({pct:.1f}%)')

Joining LinkedIn jobs with skills
Join completed in 0.0s


                                                                                

LinkedIn jobs with skills: 1,294,374 / 1,348,454 (96.0%)


In [8]:
# add empty skills to other jobs
other_jobs_with_skills = other_jobs.withColumn('skills', lit(None).cast('string'))

# combine all
all_jobs = linkedin_with_skills.union(other_jobs_with_skills)
print(f'\nTotal jobs: {all_jobs.count():,}')


Total jobs: 1,348,711


In [9]:
# extract seniority from job titles using regex
# this gives us 5 levels instead of just 2 from job_level column
# order matters: check intern/junior/senior BEFORE manager (Sr. Manager = senior)

print('Extracting seniority from job titles')

all_jobs = all_jobs.withColumn(
    'seniority',
    when(col('job_title').rlike(r'(?i)\b(intern|internship)\b'), 'intern')
    .when(col('job_title').rlike(r'(?i)\b(jr\.?|junior|entry|graduate|trainee)\b'), 'junior')
    .when(col('job_title').rlike(r'(?i)\b(sr\.?|senior|ii|iii|iv)\b'), 'senior')
    .when(col('job_title').rlike(r'(?i)\b(principal|staff|head of|director|vp|chief|lead)\b'), 'lead/principal')
    .when(col('job_title').rlike(r'(?i)\bassociate\b'), 'junior')
    .otherwise('mid')
)

# register as temp view for SQL queries
all_jobs.createOrReplaceTempView('jobs')

print('\nSeniority distribution (extracted from job titles):')
all_jobs.groupBy('seniority').count().orderBy(col('count').desc()).show()

Extracting seniority from job titles

Seniority distribution (extracted from job titles):


[Stage 32:====>                                                   (2 + 13) / 24]

+--------------+-------+
|     seniority|  count|
+--------------+-------+
|           mid|1081113|
|        senior| 150007|
|lead/principal|  84749|
|        junior|  31258|
|        intern|   1584|
+--------------+-------+



                                                                                

In [10]:
# sample with skills
print('\nSample LinkedIn jobs WITH skills:')
all_jobs.filter(
    (col('source') == 'linkedin') & (col('skills').isNotNull())
).select(
    'job_title', 'company', 'skills'
).show(5, truncate=70)


Sample LinkedIn jobs WITH skills:


                                                                                

+-----------------------------------------+-----------------------+----------------------------------------------------------------------+
|                                job_title|                company|                                                                skills|
+-----------------------------------------+-----------------------+----------------------------------------------------------------------+
|            Community Nurse (RMN -Band 6)|                   Hays|Community Mental Health Nurse, Trusted Assessment Model, Clinical d...|
|Software Lead Engineer in Cudahy, WI, USA|         Energy Jobline|EV/HEV, Micro Hybrid systems, MATLAB, Simulink, Auto code generatio...|
|                 Nursing Clinical Adjunct|State of South Carolina|Nursing, Clinical Management, Student Evaluations, Student Attendan...|
|                  Virtualization Engineer|                   Epic|VMware Engineer, VMware vSphere, vSphere PowerCLI, GoLang, Python, ...|
|                  LEAD SAL

In [11]:
# build skill dictionary from all skills
print('Building skill dictionary')
start_time = time.time()

skill_counts = all_jobs \
    .filter(col('skills').isNotNull()) \
    .select(explode(split(col('skills'), ',')).alias('skill')) \
    .select(trim(col('skill')).alias('skill')) \
    .filter(col('skill') != '') \
    .groupBy('skill') \
    .agg(count('*').alias('count')) \
    .orderBy(col('count').desc())

skill_counts.cache()
unique_skills = skill_counts.count()
elapsed = time.time() - start_time

print(f'\nFound {unique_skills:,} unique skills in {elapsed:.1f}s')
print('\nTop 25 skills:')
skill_counts.show(25, truncate=False)

Building skill dictionary





Found 3,298,453 unique skills in 6.6s

Top 25 skills:
+----------------------+------+
|skill                 |count |
+----------------------+------+
|Communication         |368202|
|Teamwork              |226205|
|Leadership            |184292|
|Customer service      |166158|
|Communication skills  |116169|
|Customer Service      |110400|
|Problem Solving       |102020|
|Sales                 |92718 |
|Problemsolving        |92489 |
|Nursing               |87419 |
|Collaboration         |86774 |
|Training              |83178 |
|Project Management    |81080 |
|Communication Skills  |78700 |
|Attention to detail   |75448 |
|Microsoft Office Suite|73351 |
|Time management       |72460 |
|Time Management       |69752 |
|Scheduling            |64081 |
|Microsoft Office      |60260 |
|Multitasking          |59216 |
|Adaptability          |58687 |
|Patient Care          |58369 |
|Attention to Detail   |57658 |
|Flexibility           |56530 |
+----------------------+------+
only showing top 

                                                                                

In [12]:
# inline embedding string builder function
# this creates natural language descriptions for embedding models

def build_embedding_string(title, company, location, skills, seniority, job_type):
    # build a natural language string from job posting fields
    # template:
    # "Role of {title} at {company} in {location}. Required skills: {skills}.
    # Experience level: {seniority}. Work type: {job_type}."
    parts = []
    
    # title and company
    if title and company and location:
        parts.append(f'Role of {title} at {company} in {location}.')
    elif title and company:
        parts.append(f'Role of {title} at {company}.')
    elif title:
        parts.append(f'Role of {title}.')
    
    # skills - take first 15 to keep string manageable
    if skills:
        skill_list = [s.strip() for s in skills.split(',')][:15]
        if skill_list:
            parts.append(f'Required skills: {", ".join(skill_list)}.')
    
    # seniority with mapped experience levels
    seniority_map = {
        'intern': 'Internship level',
        'junior': 'Junior level, 1-2 years experience',
        'mid': 'Mid-level, 3-5 years experience',
        'senior': 'Senior level, 5+ years experience',
        'lead/principal': 'Principal level, 8+ years experience'
    }
    if seniority and seniority in seniority_map:
        parts.append(f'Experience level: {seniority_map[seniority]}.')
    
    # job type
    if job_type and job_type not in ['nan', 'None', '']:
        parts.append(f'Work type: {job_type}.')
    
    return ' '.join(parts) if parts else ''

# register as spark UDF
build_embedding_udf = udf(build_embedding_string, StringType())
print('Embedding UDF defined (with seniority mapping)')

Embedding UDF defined (with seniority mapping)


In [13]:
# generate embedding strings for all jobs
print('Building embedding strings')
start_time = time.time()

jobs_with_embeddings = all_jobs.withColumn(
    'embedding_text',
    build_embedding_udf(
        col('job_title'), 
        col('company'), 
        col('job_location'),
        col('skills'), 
        col('seniority'), 
        col('job_type')
    )
)

final_count = jobs_with_embeddings.count()
elapsed = time.time() - start_time

print(f'\nGenerated embeddings for {final_count:,} jobs in {elapsed:.1f}s')

Building embedding strings


                                                                                


Generated embeddings for 1,348,711 jobs in 1.5s


In [14]:
# show sample embedding strings
print('\nSample embedding strings:')
jobs_with_embeddings.filter(
    col('skills').isNotNull()
).select(
    'job_title', 'embedding_text'
).show(5, truncate=100)


Sample embedding strings:


                                                                                

+-----------------------------------------+----------------------------------------------------------------------------------------------------+
|                                job_title|                                                                                      embedding_text|
+-----------------------------------------+----------------------------------------------------------------------------------------------------+
|            Community Nurse (RMN -Band 6)|Role of Community Nurse (RMN -Band 6) at Hays in Telford, England, United Kingdom. Required skill...|
|Software Lead Engineer in Cudahy, WI, USA|Role of Software Lead Engineer in Cudahy, WI, USA at Energy Jobline in Cudahy, WI. Required skill...|
|                 Nursing Clinical Adjunct|Role of Nursing Clinical Adjunct at State of South Carolina in Beaufort, SC. Required skills: Nur...|
|                  Virtualization Engineer|Role of Virtualization Engineer at Epic in Lawrence, KS. Required skills: VMware Engine

In [15]:
# define output paths
# LinkedIn-specific output (processed with skills JOIN)
linkedin_path = os.path.join(OUTPUT_DIR, 'processed', 'linkedin')
skill_dict_path = os.path.join(OUTPUT_DIR, 'skill_dictionary')

os.makedirs(linkedin_path, exist_ok=True)
os.makedirs(skill_dict_path, exist_ok=True)

print('Output paths:')
print(f' linkedin: {linkedin_path}')
print(f' skills:  {skill_dict_path}')


Output paths:
 linkedin: /home/developer/project/output/processed/linkedin
 skills:  /home/developer/project/output/skill_dictionary


In [16]:
# save LinkedIn jobs with skills and seniority
output_file = os.path.join(linkedin_path, 'linkedin_jobs_with_skills')
print(f'Saving LinkedIn jobs to {output_file}')
jobs_with_embeddings.write.mode('overwrite').parquet(output_file)
print('Saved')

# save skill dictionary
skill_output = os.path.join(skill_dict_path, 'all_skills')
print(f'\nSaving skill dictionary to {skill_output}')
skill_counts.write.mode('overwrite').parquet(skill_output)
print('Saved')


Saving LinkedIn jobs to /home/developer/project/output/processed/linkedin/linkedin_jobs_with_skills


                                                                                

Saved

Saving skill dictionary to /home/developer/project/output/skill_dictionary/all_skills
Saved


                                                                                

In [17]:
# show seniority stats
print('Seniority Levels (extracted from job titles):')
spark.sql("""
    SELECT seniority, COUNT(*) as job_count,
           ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM jobs), 1) as percentage
    FROM jobs
    GROUP BY seniority ORDER BY job_count DESC
""").show(truncate=50)

Seniority Levels (extracted from job titles):
+--------------+---------+----------+
|     seniority|job_count|percentage|
+--------------+---------+----------+
|           mid|  1081113|      80.2|
|        senior|   150007|      11.1|
|lead/principal|    84749|       6.3|
|        junior|    31258|       2.3|
|        intern|     1584|       0.1|
+--------------+---------+----------+



In [18]:
# show job level stats
print('Job Levels:')
spark.sql("""
    SELECT job_level, COUNT(*) as job_count
    FROM jobs WHERE job_level IS NOT NULL AND job_level != '' AND job_level != 'nan'
    GROUP BY job_level ORDER BY job_count DESC
""").show(truncate=50)

Job Levels:
+----------+---------+
| job_level|job_count|
+----------+---------+
|Mid senior|  1204445|
| Associate|   144009|
+----------+---------+



In [19]:
# summary
print('SPARK PROCESSING COMPLETE')
print(f'\nTotal jobs: {total_count:,}')
print(f'Jobs with skills: {with_skills:,}')
print(f'Unique skills: {unique_skills:,}')
print(f'\nSeniority extracted from job titles (5 levels):')
print(f' - lead/principal, senior, mid, junior, intern')
print(f'\nOutputs:')
print(f' - {linkedin_path}/linkedin_jobs_with_skills/')
print(f' - {skill_dict_path}/all_skills/')


SPARK PROCESSING COMPLETE

Total jobs: 1,348,711
Jobs with skills: 1,294,374
Unique skills: 3,298,453

Seniority extracted from job titles (5 levels):
 - lead/principal, senior, mid, junior, intern

Outputs:
 - /home/developer/project/output/processed/linkedin/linkedin_jobs_with_skills/
 - /home/developer/project/output/skill_dictionary/all_skills/


In [20]:
# verify output
print('\nVerifying output')
verify_df = spark.read.parquet(os.path.join(linkedin_path, 'linkedin_jobs_with_skills'))
print(f'Verified: {verify_df.count():,} records')
print('\nSample with skills and seniority:')
verify_df.filter(
    col('skills').isNotNull()
).select(
    'job_title', 'seniority', 'skills'
).show(5, truncate=50)



Verifying output
Verified: 1,348,711 records

Sample with skills and seniority:
+--------------------------------------------------+---------+--------------------------------------------------+
|                                         job_title|seniority|                                            skills|
+--------------------------------------------------+---------+--------------------------------------------------+
|          Customer Service Operations Team Manager|      mid|Leadership, Supervision, Team building, Communi...|
|Regional Planner IV - Senior Trails and Greenwa...|   senior|Data Analysis, Budgeting, Policy Guidance, Best...|
|                   Traffic Control/Foreman Flagger|      mid|Traffic Control, Foreman Flaggers, Google Appli...|
|        Licensed Nursing Home Administrator - LNHA|      mid|Nursing Home Administrator, Financial reporting...|
|                               Recruitment Advisor|      mid|Recruitment, Candidate Attraction, Job Descript...|
+------

In [21]:
# stop spark
spark.stop()
print('\nSpark session stopped')


Spark session stopped
