In [0]:
# Load the raw job-postings parquet file into a Spark DataFrame.
df_raw = spark.read.parquet("/Volumes/workspace/default/raw_job_postings/0000.parquet")

# Print the column names and types so the available fields are visible up front.
df_raw.printSchema()

In [0]:
# Row count of the raw postings, before any column selection or filtering.
df_raw.count()

In [0]:
from pyspark.sql.functions import col

# Narrow the raw frame to the identifier plus the three free-text columns
# that the rest of the notebook works with.
df_core = df_raw.select("job_id", "title", "description", "skills_desc")

In [0]:
from pyspark.sql.functions import sum as spark_sum

# Count missing values per column. isNull() yields a boolean that is cast to
# 0/1 and summed, so each output cell is the number of null rows for that
# column. Every result column is named "<column>_null_count" so the output
# cannot be mistaken for a plain row count.
df_core.select(
    *[
        spark_sum(col(name).isNull().cast("int")).alias(f"{name}_null_count")
        for name in ("job_id", "title", "description", "skills_desc")
    ]
).show()

In [0]:
# Drop postings that lack any of the required fields. skills_desc is
# deliberately not part of the requirement and may still be null.
df_non_null = (
    df_core
    .where(col("job_id").isNotNull())
    .where(col("title").isNotNull())
    .where(col("description").isNotNull())
)

In [0]:
# Row counts before and after removing rows with missing required fields.
print(f"Before: {df_core.count()}")
print(f"After: {df_non_null.count()}")

In [0]:
# Number of distinct (title, description) pairs that appear on more than one
# posting: group, keep only groups with count > 1, then count those groups.
duplicate_groups = (
    df_non_null.groupBy("title", "description")
    .count()
    .where(col("count") > 1)
    .count()
)

# Bare expression so the notebook renders the value.
duplicate_groups

In [0]:
# Collapse postings that share the same title and description into one row.
# NOTE(review): nothing here controls which row of a duplicate group survives,
# so the retained job_id / skills_desc may differ between runs — confirm that
# is acceptable for downstream use.
df_dedup = df_non_null.dropDuplicates(["title", "description"])

In [0]:
# Row count after duplicate (title, description) postings were collapsed.
print("After Dedup count:", df_dedup.count())

In [0]:
# Preview the first five deduplicated rows.
df_dedup.show(5)

In [0]:
# Role Filter for Data Engineer
from pyspark.sql.functions import lower

# Keep postings whose lowercased title contains any of the role phrases below.
# The phrases are joined into a single regex alternation; adjacent string
# literals are concatenated by Python into one pattern.
df_de = df_dedup.where(
    lower(col("title")).rlike(
        "data engineer"
        "|analytics engineer"
        "|big data engineer"
        "|data platform engineer"
    )
)

In [0]:
# Number of postings that passed the role-title filter.
df_de.count()

In [0]:
# Spot-check up to 20 distinct titles to see what the role filter let through.
df_de.select("title").distinct().show(20)

In [0]:
# Render five raw descriptions to see the text that will be cleaned next.
display(df_de.select("description").limit(5))

In [0]:
# Re-project onto the four working columns (the same set chosen for df_core).
df_de = df_de.select("job_id", "title", "description", "skills_desc")


In [0]:
from pyspark.sql.functions import regexp_replace

# Build a normalised "clean_text" column from the raw description:
#   1. lowercase everything;
#   2. replace URLs (http... / www...) with a space;
#   3. replace every character outside the whitelist with a space. The
#      whitelist keeps letters, digits, space and the symbols + . # /
#      "/" must be kept: otherwise "ci/cd" is rewritten to "ci cd" and the
#      "ci/cd" skill pattern can never match the cleaned text;
#   4. collapse runs of whitespace into a single space.
df_text = (
    df_de
    .withColumn("clean_text", lower(col("description")))
    .withColumn("clean_text", regexp_replace("clean_text", r"http\S+|www\S+", " "))
    .withColumn("clean_text", regexp_replace("clean_text", r"[^a-z0-9+\.\#/ ]", " "))
    .withColumn("clean_text", regexp_replace("clean_text", r"\s+", " "))
)

In [0]:
# Render the cleaned text column for a visual spot check of the normalisation.
df_text.select("clean_text").display()

In [0]:
from pyspark.sql.functions import length

# Summary statistics for the length of the cleaned text of each posting.
df_text.select(length(col("clean_text")).alias("text_length")).summary().show()


In [0]:
# Frame used for skill extraction: identifiers, the cleaned text, and the
# original description kept alongside for reference. skills_desc is dropped.
df_nlp = df_text.select("job_id", "title", "clean_text", "description")

In [0]:
# Lowercase skill vocabulary searched for in the cleaned posting text.
# Built by concatenating one list per category; order is preserved.
data_engineer_skills = (
    # languages
    ["python", "sql", "java", "scala"]
    # big data
    + ["spark", "pyspark", "hadoop", "hive", "kafka"]
    # cloud
    + ["aws", "azure", "gcp", "s3", "redshift", "bigquery"]
    # orchestration / etl
    + ["airflow", "dbt", "etl", "elt"]
    # databases
    + ["postgres", "mysql", "snowflake", "databricks"]
    # formats / tools
    + ["parquet", "delta", "delta lake"]
    # misc
    + ["ci/cd", "git", "docker", "kubernetes"]
)

In [0]:
import re

# Map each skill to a compiled regex that matches it as a whole word/phrase:
# the skill is escaped literally and wrapped in \b word boundaries, so for
# example "java" does not match inside "javascript".
skill_patterns = dict(
    (skill, re.compile(r"\b" + re.escape(skill) + r"\b"))
    for skill in data_engineer_skills
)


In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def extract_skills(text):
    """Return the skills whose pattern occurs somewhere in ``text``.

    Walks ``skill_patterns`` in its own iteration order and collects every
    skill whose compiled regex finds a match. A falsy ``text`` (``None`` or
    an empty string) produces an empty list.
    """
    if not text:
        return []
    return [skill for skill, pattern in skill_patterns.items() if pattern.search(text)]


# Spark UDF wrapper around extract_skills, declared to return array<string>.
extract_skills_udf = udf(extract_skills, ArrayType(StringType()))


In [0]:
# Attach, for every posting, the list of skills found in its cleaned text.
df_skills = df_nlp.withColumn("extracted_skills", extract_skills_udf(col("clean_text")))


In [0]:
# Preview five postings with their extracted skills, without truncating the lists.
df_skills.select("title", "extracted_skills").show(5, truncate=False)


In [0]:
from pyspark.sql.functions import explode, round

# Explode the per-posting skill lists into one row per (posting, skill), then
# count rows per skill and list the most frequent skills first.
skill_counts = (
    df_skills.select(explode("extracted_skills").alias("skill"))
    .groupBy("skill")
    .count()
    .orderBy(col("count").desc())
)


In [0]:
# Show the 20 most frequent skills with their raw counts.
skill_counts.show(20, truncate=False)


In [0]:
# Denominator for the percentages: number of postings in df_skills.
total_jobs = df_skills.count()

# Add each skill's share of postings as a percentage rounded to one decimal.
# `round` here is the Spark column function imported from
# pyspark.sql.functions, not Python's builtin.
skill_stats = skill_counts.select(
    "*",
    round((col("count") / total_jobs) * 100, 1).alias("percentage"),
)

skill_stats.show(20, truncate=False)


In [0]:
# Top 15 skills ranked by the share of postings that mention them.
skill_stats.orderBy("percentage", ascending=False).show(15, truncate=False)
