In [0]:
df_silver = spark.table("resume_silver")
df_silver.select("resume_file", "extracted_skills").show(truncate=False)


+-----------------------+-------------------------------------------------------------------------------+
|resume_file            |extracted_skills                                                               |
+-----------------------+-------------------------------------------------------------------------------+
|Resume_Akash_Verma.pdf |[python, sql, pyspark, databricks, spark, etl]                                 |
|Resume_Ananya_Gupta.pdf|[python, sql, machine learning, nlp, spacy]                                    |
|Resume_Karthik_R.pdf   |[python, sql]                                                                  |
|Resume_Rohan_Sharma.pdf|[python, sql, machine learning, nlp, pyspark, databricks, spark, data analysis]|
|Resume_Sneha_Iyer.pdf  |[python, sql, machine learning, pandas, data analysis]                         |
+-----------------------+-------------------------------------------------------------------------------+



In [0]:
jd_path = "/Volumes/resume/data/volume/jd.txt"

with open(jd_path, "r") as f:
    jd_text = f.read().lower()

jd_text


'\ufeffwe are looking for an ai/ml intern with strong fundamentals in python and machine learning.\nthe candidate should have experience with sql, data analysis, and nlp techniques.\nhands-on exposure to pyspark and databricks is a plus.\nfamiliarity with etl pipelines, data preprocessing, and basic deep learning concepts is desirable.'

In [0]:
skills_list = [
    "python",
    "sql",
    "machine learning",
    "deep learning",
    "nlp",
    "pyspark",
    "databricks",
    "spark",
    "pandas",
    "numpy",
    "spacy",
    "etl",
    "data analysis"
]

jd_skills = [skill for skill in skills_list if skill in jd_text]
jd_skills


['python',
 'sql',
 'machine learning',
 'deep learning',
 'nlp',
 'pyspark',
 'databricks',
 'spark',
 'etl',
 'data analysis']

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

def score_resume(resume_skills):
    matched = list(set(resume_skills) & set(jd_skills))
    missing = list(set(jd_skills) - set(resume_skills))

    score = int((len(matched) / len(jd_skills)) * 100) if jd_skills else 0

    return score, ", ".join(matched), ", ".join(missing)

score_schema = StructType([
    StructField("score", IntegerType()),
    StructField("matched_skills", StringType()),
    StructField("missing_skills", StringType())
])

score_udf = udf(score_resume, score_schema)


In [0]:
df_gold = df_silver.withColumn(
    "score_struct",
    score_udf("extracted_skills")
)

df_gold = df_gold \
    .withColumn("score", df_gold.score_struct.score) \
    .withColumn("matched_skills", df_gold.score_struct.matched_skills) \
    .withColumn("missing_skills", df_gold.score_struct.missing_skills) \
    .drop("score_struct")

df_gold.select(
    "resume_file",
    "score",
    "matched_skills",
    "missing_skills"
).show(truncate=False)


+-----------------------+-----+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------+
|resume_file            |score|matched_skills                                                               |missing_skills                                                                      |
+-----------------------+-----+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------+
|Resume_Akash_Verma.pdf |60   |sql, pyspark, databricks, etl, python, spark                                 |machine learning, data analysis, deep learning, nlp                                 |
|Resume_Ananya_Gupta.pdf|40   |machine learning, sql, python, nlp                                           |data analysis, deep learning, pyspark, databricks, etl, spark                       |
|Resume_Karthik_R.pdf   |

In [0]:
df_ranked = df_gold.orderBy(df_gold.score.desc())
df_ranked.show(truncate=False)


+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------+-----------------------------

In [0]:
df_ranked.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("resume_gold")


In [0]:
df_ranked \
    .select("resume_file", "score", "matched_skills", "missing_skills") \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", True) \
    .csv("/Volumes/resume/data/volume/output/candidate_ranking")
