In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split
from consts import JOBS_PATH, QUESTIONS_PATH, open_csv_file

def _explode_comma_separated(frame, source_column, item_column):
    """Return ``frame`` with one row per comma-separated item of ``source_column``.

    All existing columns are kept and each item is written to a new column
    named ``item_column``. Items are used exactly as split on ","; no
    whitespace trimming is applied.
    """
    return frame.withColumn(item_column, explode(split(source_column, ",")))


# Create (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("JobQuestionMatching")
    .getOrCreate()
)

# Read the three CSV inputs through the project helper `open_csv_file`
# (imported from `consts`; presumably returns a Spark DataFrame - confirm there).
job_postings_df = open_csv_file(spark, JOBS_PATH, "all_jobpostings_with_skills.csv")
# Replace null `skills` values with an empty string before the column is split below
job_postings_df = job_postings_df.fillna({'skills': ''})
code_questions_df = open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")
open_questions_df = open_csv_file(spark, QUESTIONS_PATH, "all_open_questions_with_topics.csv")

# Row counts of the raw inputs (each .count() triggers a Spark job)
input_row_counts = [
    frame.count()
    for frame in (job_postings_df, code_questions_df, open_questions_df)
]
print("num_rows:", *input_row_counts)

# One row per skill (job postings) and one row per topic (questions)
job_postings_exploded = _explode_comma_separated(job_postings_df, "skills", "skill")
code_questions_exploded = _explode_comma_separated(code_questions_df, "topics", "topic")
open_questions_exploded = _explode_comma_separated(open_questions_df, "topics", "topic")

# Pair every exploded job-posting row with every exploded question row
cartesian_code = job_postings_exploded.crossJoin(code_questions_exploded)
cartesian_open = job_postings_exploded.crossJoin(open_questions_exploded)

# Row counts of the two Cartesian products (forces both cross joins to be evaluated)
cartesian_row_counts = [frame.count() for frame in (cartesian_code, cartesian_open)]
print("num_rows:", *cartesian_row_counts)

In [0]:
import os
from consts import PROJECT_PATH

# Preview both Cartesian products, code questions first and then open questions
for preview_frame in (cartesian_code, cartesian_open):
    display(preview_frame)

# Point Spark's checkpoint directory at a folder under the project path, then
# replace `cartesian_code` with its checkpointed version.
# NOTE(review): only `cartesian_code` is checkpointed here; `cartesian_open`
# is left as-is - confirm whether that asymmetry is intentional.
checkpoints_path = os.path.join(PROJECT_PATH, "tmp/spark-checkpoints/")
spark.sparkContext.setCheckpointDir(checkpoints_path)
cartesian_code = cartesian_code.checkpoint()

In [0]:
pip install sentence-transformers

In [0]:
from pyspark.sql.functions import col, lit, udf, array
from pyspark.sql.types import ArrayType, FloatType, DoubleType
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sentence-embedding model once at notebook level; the UDF below
# refers to this global rather than loading a model per call.
model = SentenceTransformer('all-MiniLM-L6-v2')

# UDF to generate embeddings
@udf(ArrayType(FloatType()))
def generate_embedding(text):
    """Encode ``text`` with the global ``model`` and return it as a list of floats.

    Args:
        text: The string to embed (a skill or topic value).

    Returns:
        The embedding converted to a plain Python list, or ``None`` when
        ``text`` is null so that null inputs yield a null embedding instead
        of being passed to ``model.encode``.
    """
    if text is None:
        return None
    return model.encode(text).tolist()

# UDF to calculate cosine similarity
@udf(DoubleType())
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a float.

    Args:
        embedding1: First embedding (sequence of numbers).
        embedding2: Second embedding (sequence of numbers).

    Returns:
        The single pairwise cosine similarity, or ``None`` when either
        embedding is null so that a missing embedding yields a null score
        instead of being passed to ``cosine_similarity``.
    """
    if embedding1 is None or embedding2 is None:
        return None
    # cosine_similarity works on 2-D inputs, so wrap each vector as a one-row
    # matrix and read the only cell of the 1x1 result.
    return float(cosine_similarity([embedding1], [embedding2])[0][0])

# Extract unique skills and topics.
# Topics are taken from BOTH question sets: the resulting similarity table is
# left-joined onto the code-question and the open-question Cartesian products
# later in this notebook, and a topic that is missing here would end up with a
# null similarity in that join.
unique_skills = job_postings_exploded.select("skill").distinct()
unique_topics = (
    code_questions_exploded.select("topic")
    .union(open_questions_exploded.select("topic"))
    .distinct()
)

# Generate embeddings for the distinct skills and topics BEFORE pairing them,
# so the embedding UDF is applied to each distinct value rather than to every
# (skill, topic) pair. The two small embedding tables are cached so that
# repeated actions on `unique_pairs` can reuse them.
skill_embeddings = unique_skills.withColumn(
    "skill_embedding", generate_embedding(col("skill"))
).cache()
topic_embeddings = unique_topics.withColumn(
    "topic_embedding", generate_embedding(col("topic"))
).cache()

# Cartesian product of unique skills and topics; the select keeps the column
# order (skill, topic, skill_embedding, topic_embedding).
unique_pairs = skill_embeddings.crossJoin(topic_embeddings).select(
    "skill", "topic", "skill_embedding", "topic_embedding"
)

# Compute similarity for unique pairs
unique_pairs = unique_pairs.withColumn(
    "similarity", calculate_similarity(col("skill_embedding"), col("topic_embedding"))
)

In [0]:
# Render the skill/topic pairs together with their embeddings and similarity score
display(unique_pairs)

In [0]:
# Collect the pair table to the driver as a pandas DataFrame, then write it
# (without the pandas index) to a CSV in the current working directory.
unique_pairs_pandas = unique_pairs.toPandas()
unique_pairs_pandas.to_csv(path_or_buf="unique_pairs.csv", index=False)

In [0]:
from pyspark.sql.functions import broadcast

# Lookup table reduced to the join keys plus the score
similarity_lookup = unique_pairs.select("skill", "topic", "similarity")
similarity_join_keys = ["skill", "topic"]

# Attach the similarity to every row of each Cartesian product. The left join
# keeps rows whose (skill, topic) pair has no entry in the lookup table.
cartesian_code_with_similarity = cartesian_code.join(
    broadcast(similarity_lookup), on=similarity_join_keys, how="left"
)
cartesian_open_with_similarity = cartesian_open.join(
    broadcast(similarity_lookup), on=similarity_join_keys, how="left"
)

# Average the similarity per job-question pair: group on every column of the
# Cartesian product except the exploded `skill` / `topic` columns.
columns_to_group_by = [
    name for name in cartesian_code.columns if name not in ("skill", "topic")
]
aggregated_code_scores = (
    cartesian_code_with_similarity
    .groupBy(*columns_to_group_by)
    .agg({"similarity": "avg"})
)

# Same aggregation for the open questions; `columns_to_group_by` is reused
# and holds the open-question column list after this cell.
columns_to_group_by = [
    name for name in cartesian_open.columns if name not in ("skill", "topic")
]
aggregated_open_scores = (
    cartesian_open_with_similarity
    .groupBy(*columns_to_group_by)
    .agg({"similarity": "avg"})
)


In [0]:
# Render the averaged similarity per job/question pair: code questions, then open questions
display(aggregated_code_scores)
display(aggregated_open_scores)

In [0]:
import os
from consts import DATA_PATH

# Collect both aggregated score tables to the driver as pandas DataFrames
aggregated_code_scores_pandas = aggregated_code_scores.toPandas()
aggregated_open_scores_pandas = aggregated_open_scores.toPandas()

# Destination CSVs under DATA_PATH (file names kept exactly as they were)
code_scores_csv_path = os.path.join(DATA_PATH, "code_questions_topis_skills_scores.csv")
open_scores_csv_path = os.path.join(DATA_PATH, "open_questions_topis_skills_scores.csv")

# Write each table without the pandas index column
aggregated_code_scores_pandas.to_csv(code_scores_csv_path, index=False)
aggregated_open_scores_pandas.to_csv(open_scores_csv_path, index=False)