In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split
from consts import DATA_PATH, QUESTIONS_PATH, open_csv_file

# Create (or reuse) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("JobQuestionMatching")
    .getOrCreate()
)

# Read the three CSV inputs; a missing `skills` value becomes an empty string
job_postings = open_csv_file(spark, DATA_PATH, "train_jobs_data.csv").fillna({'skills': ''})
code_questions = open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")
open_questions = open_csv_file(spark, QUESTIONS_PATH, "all_open_questions_with_topics.csv")
print("num_rows:", *(frame.count() for frame in (job_postings, code_questions, open_questions)))


def _explode_comma_list(frame, list_col, item_col):
    """Add `item_col` holding one comma-separated entry of `list_col` per row."""
    # NOTE(review): entries are split on "," only and are not trimmed, so
    # "a, b" yields " b" with a leading space - confirm against the data.
    return frame.withColumn(item_col, explode(split(list_col, ",")))


# One row per (posting, skill) and one row per (question, topic)
job_postings_exploded = _explode_comma_list(job_postings, "skills", "skill")
code_questions_exploded = _explode_comma_list(code_questions, "topics", "topic")
open_questions_exploded = _explode_comma_list(open_questions, "topics", "topic")

# Pair every exploded posting row with every exploded question row
cartesian_code, cartesian_open = (
    job_postings_exploded.crossJoin(questions)
    for questions in (code_questions_exploded, open_questions_exploded)
)
print("num_rows:", cartesian_code.count(), cartesian_open.count())

In [0]:
import os
from consts import PROJECT_PATH

display(cartesian_code)
display(cartesian_open)

# DataFrame.checkpoint() is called further down in this notebook and needs a
# checkpoint directory registered on the SparkContext, so set it here.
checkpoints_path = "dbfs:/tmp/spark-checkpoints/"
spark.sparkContext.setCheckpointDir(checkpoints_path)

# Optional: materialise the cartesian products themselves (left disabled).
# cartesian_code = cartesian_code.checkpoint()
# cartesian_open = cartesian_open.checkpoint()

In [0]:
pip install sentence-transformers

In [0]:
from pyspark.sql.functions import col, lit, udf, array
from pyspark.sql.types import ArrayType, FloatType, DoubleType
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Pretrained sentence-transformers model used for all embeddings
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiate the model once at module level so every later use shares it
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# UDF to generate embeddings
@udf(ArrayType(FloatType()))
def generate_embedding(text):
    """Encode `text` with the global sentence-transformer `model`.

    Args:
        text: The string to embed, or None for a null column value.

    Returns:
        The embedding as a plain Python list of floats (to match the declared
        ArrayType(FloatType())), or None when `text` is None.
    """
    # Spark passes null column values as None; return null instead of
    # handing None to the encoder.
    if text is None:
        return None
    return model.encode(text).tolist()

# UDF to calculate cosine similarity
@udf(DoubleType())
def calculate_similarity(embedding1, embedding2):
    """Cosine similarity between two embedding vectors.

    Args:
        embedding1: First vector as a sequence of floats, or None.
        embedding2: Second vector as a sequence of floats, or None.

    Returns:
        The similarity as a Python float, or None when either input is None.
    """
    # A null embedding on either side has no defined similarity; return null
    # rather than passing None to cosine_similarity.
    if embedding1 is None or embedding2 is None:
        return None
    # cosine_similarity works on 2-D inputs, so wrap each vector as one row
    # and read the single cell of the resulting 1x1 matrix.
    return float(cosine_similarity([embedding1], [embedding2])[0][0])

# Distinct skill values across all exploded postings
unique_skills = job_postings_exploded.select("skill").distinct()

# Distinct topic values, computed separately for code and open questions
unique_topics_code, unique_topics_open = (
    questions.select("topic").distinct()
    for questions in (code_questions_exploded, open_questions_exploded)
)

# # Cartesian product of unique skills and topics
# unique_pairs_code = unique_skills.crossJoin(unique_topics_code)
# unique_pairs_open = unique_skills.crossJoin(unique_topics_open)

# # Generate embeddings for skills and topics
# unique_pairs_code = unique_pairs_code.withColumn("skill_embedding", generate_embedding(col("skill")))
# unique_pairs_open = unique_pairs_open.withColumn("skill_embedding", generate_embedding(col("skill")))
# unique_pairs_code = unique_pairs_code.withColumn("topic_embedding", generate_embedding(col("topic")))
# unique_pairs_open = unique_pairs_open.withColumn("topic_embedding", generate_embedding(col("topic")))

# # Compute similarity for unique pairs
# unique_pairs_code = unique_pairs_code.withColumn(
#     "similarity", calculate_similarity(col("skill_embedding"), col("topic_embedding"))
# )
# unique_pairs_open = unique_pairs_open.withColumn(
#     "similarity", calculate_similarity(col("skill_embedding"), col("topic_embedding"))
# )

In [0]:
unique_skills_df = unique_skills.toPandas()
unique_topics_code_df = unique_topics_code.toPandas()
unique_topics_open_df = unique_topics_open.toPandas()


def _embed_texts(frame, text_col):
    """Embed every value of `frame[text_col]` with one batched encode call.

    Adds an `embedding` column to `frame` (one vector per row) and returns the
    stacked embeddings, whose rows follow the frame's row order.
    """
    # A single batched call replaces one model.encode call per string.
    # NOTE(review): batched and per-string encoding may differ by tiny
    # floating-point amounts.
    vectors = model.encode(frame[text_col].tolist())
    frame["embedding"] = list(vectors)
    return vectors


def _pairs_with_similarity(skills_df, skill_vectors, topics_df, topic_vectors):
    """Cross-join skills with topics and attach their cosine similarity.

    The result has the columns skill, embedding_x, topic, embedding_y and
    similarity, one row per (skill, topic) combination.
    """
    # One matrix call gives every skill-vs-topic similarity at once
    # (rows = skills, columns = topics) instead of one sklearn call per pair.
    similarity_matrix = cosine_similarity(skill_vectors, topic_vectors)

    # Carry each side's row position through the merge so the matrix lookup
    # does not depend on the row order produced by the cross join.
    pairs = skills_df.assign(_skill_pos=range(len(skills_df))).merge(
        topics_df.assign(_topic_pos=range(len(topics_df))), how="cross"
    )
    pairs["similarity"] = similarity_matrix[
        pairs["_skill_pos"].to_numpy(), pairs["_topic_pos"].to_numpy()
    ]
    return pairs.drop(columns=["_skill_pos", "_topic_pos"])


skill_vectors = _embed_texts(unique_skills_df, "skill")
topic_code_vectors = _embed_texts(unique_topics_code_df, "topic")
topic_open_vectors = _embed_texts(unique_topics_open_df, "topic")

# Cartesian product of unique skills and topics, with similarity per pair
unique_pairs_code_df = _pairs_with_similarity(
    unique_skills_df, skill_vectors, unique_topics_code_df, topic_code_vectors
)
unique_pairs_open_df = _pairs_with_similarity(
    unique_skills_df, skill_vectors, unique_topics_open_df, topic_open_vectors
)

unique_pairs_code_df.to_csv("unique_pairs_with_similarity_code.csv", index=False)
unique_pairs_open_df.to_csv("unique_pairs_with_similarity_open.csv", index=False)

In [0]:
display(unique_pairs_code_df)
display(unique_pairs_open_df)

# Convert the pandas pair tables back to Spark DataFrames for the later join.
# Only the join keys and the score are carried over: the embedding columns
# hold numpy arrays and are not used by the cells that follow.
pair_columns = ["skill", "topic", "similarity"]
unique_pairs_code = spark.createDataFrame(
    unique_pairs_code_df[pair_columns].astype({"similarity": "float64"})
)
unique_pairs_open = spark.createDataFrame(
    unique_pairs_open_df[pair_columns].astype({"similarity": "float64"})
)

In [0]:
# checkpoint() returns a new, checkpointed DataFrame rather than modifying the
# receiver, so rebind the names; otherwise the checkpointed result is dropped.
unique_pairs_code = unique_pairs_code.checkpoint()
unique_pairs_open = unique_pairs_open.checkpoint()

unique_pairs_code.display()
unique_pairs_open.display()

In [0]:
from pyspark.sql.functions import broadcast

# Join similarity back to Cartesian product.
# The unique-pair tables are sent with a broadcast hint; the left join keeps
# every cartesian row even when no similarity was found for its pair.
cartesian_code_with_similarity = cartesian_code.join(
    broadcast(unique_pairs_code.select("skill", "topic", "similarity")),
    on=["skill", "topic"],
    how="left"
)

cartesian_open_with_similarity = cartesian_open.join(
    broadcast(unique_pairs_open.select("skill", "topic", "similarity")),
    on=["skill", "topic"],
    how="left"
)

# Aggregate similarity scores for each job-question pair: group by every
# column except the exploded skill/topic and average the pair similarities.
# The loop variable is `name` rather than `col` so it does not read like the
# pyspark.sql.functions.col import.
columns_to_group_by = [name for name in cartesian_code.columns if name not in ["skill", "topic"]]
aggregated_code_scores = cartesian_code_with_similarity.groupBy(*columns_to_group_by).agg(
    {"similarity": "avg"}
)

columns_to_group_by = [name for name in cartesian_open.columns if name not in ["skill", "topic"]]
aggregated_open_scores = cartesian_open_with_similarity.groupBy(*columns_to_group_by).agg(
    {"similarity": "avg"}
)

# checkpoint() returns the checkpointed DataFrame; rebind so later cells use
# the materialised result instead of discarding it.
aggregated_code_scores = aggregated_code_scores.checkpoint()
aggregated_open_scores = aggregated_open_scores.checkpoint()

In [0]:
# Render the averaged similarity tables, code questions first, then open ones
for scores in (aggregated_code_scores, aggregated_open_scores):
    scores.display()