### loading all_jobpostings_with_skills into a spark df

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
import os
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("SkillsTopicsScore")
    .getOrCreate()
)
# NOTE(review): presumably set to silence the HuggingFace tokenizers
# parallelism warning when sentence-transformers is used later - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Job postings (with their `skills` column), read through the project helper.
job_skills_spark = open_csv_file(
    spark,
    JOBS_PATH,
    'all_jobpostings_with_skills.csv',
)
# job_skills_spark.display()

### getting all unique skills from all_jobpostings_with_skills

In [0]:
from pyspark.sql import functions as F

# Split the comma-separated 'skills' strings and explode them into one skill per row.
# The pattern is a raw string: in a plain literal "\s" is an invalid escape
# sequence, which newer Python versions warn about. The regex handed to Spark
# is unchanged (a comma followed by optional whitespace).
job_skills_spark_exploded = (
    job_skills_spark
    .select("skills")
    .distinct()
    .withColumn("exploded_skills", F.explode(F.split(F.col("skills"), r",\s*")))
)

# Distinct individual skills across all job postings
unique_skills = job_skills_spark_exploded.select("exploded_skills").distinct()
# unique_skills.display()

### getting all topics of all questions (unique)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
import os
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Coding questions and open questions, each with a `question` and a `topics` column.
problems_spark = open_csv_file(
    spark,
    QUESTIONS_PATH,
    'all_code_questions_with_topics.csv',
)
# problems_spark.display()

open_questions_spark = open_csv_file(
    spark,
    QUESTIONS_PATH,
    'all_open_questions_with_topics.csv',
)
# open_questions_spark.display()

# Stack both question sources into one (question, topics) DataFrame.
# `union` matches columns by position, so both sides select the same order.
questions_topics_spark = (
    problems_spark.select("question", "topics")
    .union(open_questions_spark.select("question", "topics"))
)
# questions_topics_spark.display()


### getting all unique topics from questions_topics_spark

In [0]:
from pyspark.sql import functions as F

# Split the comma-separated 'topics' strings and explode them into one topic per row.
# The pattern is a raw string: in a plain literal "\s" is an invalid escape
# sequence, which newer Python versions warn about. The regex handed to Spark
# is unchanged (a comma followed by optional whitespace).
questions_topics_spark_exploded = (
    questions_topics_spark
    .select("topics")
    .distinct()
    .withColumn("exploded_topics", F.explode(F.split(F.col("topics"), r",\s*")))
)

# Distinct individual topics across all questions
unique_topics = questions_topics_spark_exploded.select("exploded_topics").distinct()
# unique_topics.display()

### Embeddings for each unique topic and unique skill and calculating cosine similarity between each pair of unique topic and unique skill

In [0]:
from pyspark.sql import functions as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: one row per (job posting, skill) and one row per (question, topic).
# All original columns are kept; the split-out value lands in a new column.
job_skills_spark_exploded = job_skills_spark.withColumn(
    "exploded_skills",
    F.explode(F.split("skills", ",\\s*")),
)

questions_topics_spark_exploded = questions_topics_spark.withColumn(
    "exploded_topics",
    F.explode(F.split("topics", ",\\s*")),
)

# Step 2: pull the distinct skills / topics to the driver as plain Python lists.
# Each collected Row has a single field, so it unpacks as a 1-tuple.
unique_skills = [
    skill
    for (skill,) in job_skills_spark_exploded.select("exploded_skills").distinct().collect()
]
unique_topics = [
    topic
    for (topic,) in questions_topics_spark_exploded.select("exploded_topics").distinct().collect()
]

# Step 3: embed every skill and every topic (one vector per list entry)
model = SentenceTransformer('all-MiniLM-L6-v2')
skills_embeddings = model.encode(unique_skills)
topics_embeddings = model.encode(unique_topics)

# Step 4: name -> embedding lookup tables used by the similarity UDF
skills_to_embeddings = dict(zip(unique_skills, skills_embeddings))
topics_to_embeddings = dict(zip(unique_topics, topics_embeddings))

# Step 5: Python function behind the similarity UDF
def calculate_similarity(topic, skill):
    """Return the cosine similarity between a topic's and a skill's embedding.

    The embeddings are looked up in the module-level ``topics_to_embeddings``
    and ``skills_to_embeddings`` dictionaries.

    Args:
        topic: key into ``topics_to_embeddings``.
        skill: key into ``skills_to_embeddings``.

    Returns:
        The similarity as a Python ``float``, or ``None`` when either key has
        no embedding.
    """
    topic_vec = topics_to_embeddings.get(topic)
    skill_vec = skills_to_embeddings.get(skill)
    if topic_vec is None or skill_vec is None:
        return None
    # cosine_similarity works on 2-D inputs, so wrap each vector in a list
    # and read the single cell of the resulting 1x1 matrix.
    pair_scores = cosine_similarity([topic_vec], [skill_vec])
    return float(pair_scores[0][0])

similarity_udf = F.udf(calculate_similarity, "double")

# Step 6: pair every exploded question/topic row with every exploded job/skill
# row and score each pair with the UDF.
cross_joined = questions_topics_spark_exploded.crossJoin(job_skills_spark_exploded)
similarity_scores = cross_joined.withColumn(
    "similarity_score",
    similarity_udf(F.col("exploded_topics"), F.col("exploded_skills")),
)

# Step 7: mean similarity per (topic, job_title)
topic_job_avg = (
    similarity_scores
    .groupBy("exploded_topics", "job_title")
    .agg(F.avg("similarity_score").alias("avg_similarity_per_topic"))
)

# Step 8: mean of the per-topic scores per (question, job_title).
# The distinct (question, topic) pairs map topic scores back onto questions.
question_topic_mapping = (
    questions_topics_spark_exploded
    .select("question", "exploded_topics")
    .distinct()
)

question_job_avg = (
    topic_job_avg
    .join(question_topic_mapping, on="exploded_topics")
    .groupBy("question", "job_title")
    .agg(F.avg("avg_similarity_per_topic").alias("avg_similarity_per_question"))
)
question_job_avg.display()
# Step 9: Show the results
# question_job_avg.show(truncate=False)

  -- Make sure the embedder embeds each skill and each topic individually when it is given a list.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Explode `skills` and `topics` into one row per skill / topic
job_skills_spark_exploded = job_skills_spark.withColumn(
    "exploded_skills", F.explode(F.split(F.col("skills"), ",\\s*"))
)

questions_topics_spark_exploded = questions_topics_spark.withColumn(
    "exploded_topics", F.explode(F.split(F.col("topics"), ",\\s*"))
)

# Step 2: Collect unique skills and topics for embedding
unique_skills = [row["exploded_skills"] for row in job_skills_spark_exploded.select("exploded_skills").distinct().collect()]
unique_topics = [row["exploded_topics"] for row in questions_topics_spark_exploded.select("exploded_topics").distinct().collect()]

# Step 3: Generate embeddings (one vector per list entry)
model = SentenceTransformer('all-MiniLM-L6-v2')
skills_embeddings = model.encode(unique_skills)
topics_embeddings = model.encode(unique_topics)

# Step 4: Compute similarity in Python
# Lookup frames pairing each name with its embedding (kept for inspection).
skills_df = pd.DataFrame({"skill": unique_skills, "embedding": list(skills_embeddings)})
topics_df = pd.DataFrame({"topic": unique_topics, "embedding": list(topics_embeddings)})

# One pairwise call yields the whole (n_topics x n_skills) score matrix, replacing
# a nested Python loop that called cosine_similarity once per pair.
similarity_matrix = cosine_similarity(topics_embeddings, skills_embeddings)

# Flatten to long format. MultiIndex.from_product varies the skill fastest, which
# matches the row-major order of ravel(). The float64 cast keeps the Spark
# column a double, as it was when Python floats were used.
pair_index = pd.MultiIndex.from_product(
    [unique_topics, unique_skills], names=["topic", "skill"]
)
similarity_df = pd.DataFrame(
    {"similarity_score": similarity_matrix.ravel().astype("float64")},
    index=pair_index,
).reset_index()

# Step 5: Create a Spark DataFrame from similarity_df
similarity_spark = spark.createDataFrame(similarity_df)

# Step 6: Aggregate scores
# Join similarity scores with the exploded DataFrames,
# taking an average of the similarity scores over all skills of a job for each topic
job_skills_mapping = job_skills_spark_exploded.select("job_title", "exploded_skills").distinct()

topic_job_avg = similarity_spark.join(
    job_skills_mapping,
    similarity_spark["skill"] == job_skills_mapping["exploded_skills"]
).groupBy("topic", "job_title").agg(
    F.avg("similarity_score").alias("avg_job_similarity_per_topic")
)

# Distinct (question, topic) pairs map the per-topic scores back onto questions
question_topic_mapping = questions_topics_spark_exploded.select("question", "exploded_topics").distinct()

question_job_avg = topic_job_avg.join(
    question_topic_mapping, topic_job_avg["topic"] == question_topic_mapping["exploded_topics"]
).groupBy("question", "job_title").agg(
    F.avg("avg_job_similarity_per_topic").alias("avg_job_similarity_per_question")
)

# Step 7: Display results
# NOTE(review): confirm that this environment's DataFrame.display() accepts `truncate`.
question_job_avg.display(truncate=False)

### trying on a smaller subset of job_skills_spark and questions_topics_spark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Initialize Spark session (getOrCreate reuses an already-running session)
spark = SparkSession.builder.appName("SubsetExample").getOrCreate()

# Create a smaller subset of job_skills_spark
job_skills_data_subset = [
    ("Data Scientist", "Machine Learning, Python"),
    ("Software Engineer", "Python, Java"),
]
job_skills_spark_subset = spark.createDataFrame(job_skills_data_subset, ["job_title", "skills"])

# Create a smaller subset of questions_topics_spark
questions_topics_data_subset = [
    ("What is Python used for?", "Python, Programming"),
    ("How does machine learning work?", "Machine Learning, AI"),
]
questions_topics_spark_subset = spark.createDataFrame(questions_topics_data_subset, ["question", "topics"])

# Explode skills and topics
# NOTE: this rebinds the *_exploded names that earlier cells built from the full data.
job_skills_spark_exploded = job_skills_spark_subset.withColumn(
    "exploded_skills", F.explode(F.split(F.col("skills"), ",\\s*"))
)

questions_topics_spark_exploded = questions_topics_spark_subset.withColumn(
    "exploded_topics", F.explode(F.split(F.col("topics"), ",\\s*"))
)

# Step 2: Collect unique skills and topics for embedding
unique_skills = [row["exploded_skills"] for row in job_skills_spark_exploded.select("exploded_skills").distinct().collect()]
unique_topics = [row["exploded_topics"] for row in questions_topics_spark_exploded.select("exploded_topics").distinct().collect()]

# Step 3: Generate embeddings (one vector per list entry)
# `SentenceTransformer`, `pd` and `cosine_similarity` are imported by earlier cells.
model = SentenceTransformer('all-MiniLM-L6-v2')
skills_embeddings = model.encode(unique_skills)
topics_embeddings = model.encode(unique_topics)

# Step 4: Compute similarity in Python
# Lookup frames pairing each name with its embedding (kept for inspection).
skills_df = pd.DataFrame({"skill": unique_skills, "embedding": list(skills_embeddings)})
topics_df = pd.DataFrame({"topic": unique_topics, "embedding": list(topics_embeddings)})

# One pairwise call yields the whole (n_topics x n_skills) score matrix, replacing
# a nested Python loop that called cosine_similarity once per pair.
similarity_matrix = cosine_similarity(topics_embeddings, skills_embeddings)

# Flatten to long format. MultiIndex.from_product varies the skill fastest, which
# matches the row-major order of ravel(). The float64 cast keeps the Spark
# column a double, as it was when Python floats were used.
pair_index = pd.MultiIndex.from_product(
    [unique_topics, unique_skills], names=["topic", "skill"]
)
similarity_df = pd.DataFrame(
    {"similarity_score": similarity_matrix.ravel().astype("float64")},
    index=pair_index,
).reset_index()

# Step 5: Create a Spark DataFrame from similarity_df
similarity_spark = spark.createDataFrame(similarity_df)

# Step 6: Aggregate scores
# Join similarity scores with the distinct (job_title, skill) pairs, mirroring the
# full-data cell, so a skill repeated for a job cannot be counted twice.
job_skills_mapping = job_skills_spark_exploded.select("job_title", "exploded_skills").distinct()

topic_job_avg = similarity_spark.join(
    job_skills_mapping,
    similarity_spark["skill"] == job_skills_mapping["exploded_skills"]
).groupBy("topic", "job_title").agg(
    F.avg("similarity_score").alias("avg_similarity_per_topic")
)

# Distinct (question, topic) pairs map the per-topic scores back onto questions
question_topic_mapping = questions_topics_spark_exploded.select("question", "exploded_topics").distinct()

question_job_avg = topic_job_avg.join(
    question_topic_mapping, topic_job_avg["topic"] == question_topic_mapping["exploded_topics"]
).groupBy("question", "job_title").agg(
    F.avg("avg_similarity_per_topic").alias("avg_similarity_per_question")
)

# Step 7: Display results
question_job_avg.show(truncate=False)


# # Show the smaller datasets
# print("Job Skills Spark Exploded (Subset):")
# job_skills_spark_exploded.show(truncate=False)

# print("Questions Topics Spark Exploded (Subset):")
# questions_topics_spark_exploded.show(truncate=False)

# draft

In [0]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from pyspark.sql import functions as F

# Distinct skills / topics as plain Python lists.
# They are recomputed from the source DataFrames because the names `unique_skills`
# and `unique_topics` are rebound to Python lists by the embedding cells in this
# notebook, and `.collect()` does not exist on a list.
skills = [
    row["exploded_skills"]
    for row in job_skills_spark
    .select(F.explode(F.split(F.col("skills"), r",\s*")).alias("exploded_skills"))
    .distinct()
    .collect()
]
topics = [
    row["exploded_topics"]
    for row in questions_topics_spark
    .select(F.explode(F.split(F.col("topics"), r",\s*")).alias("exploded_topics"))
    .distinct()
    .collect()
]

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient

# Generate embeddings for topics and skills (one vector per list entry)
skills_embeddings = model.encode(skills)
topics_embeddings = model.encode(topics)

# Calculate compatibility scores: rows follow `topics`, columns follow `skills`
compatibility_scores = cosine_similarity(topics_embeddings, skills_embeddings)

# print(compatibility_scores)

# Display scores (prints one line per topic/skill pair)
for i, topic in enumerate(topics):
    print(f"Topic: {topic}")
    for j, skill in enumerate(skills):
        print(f"  Skill: {skill} -> Score: {compatibility_scores[i][j]:.2f}")

In [0]:

import re
from collections import defaultdict

# Uses `topics` and `compatibility_scores` from the previous draft cell.

# question -> set of its individual topics.
# This cell originally read `question_topics_mapping` without ever defining it;
# it is built here from `questions_topics_spark` using the same comma split that
# the rest of the notebook applies to the `topics` column.
# NOTE(review): confirm this is the mapping the draft intended.
question_topics_mapping = {}
for row in questions_topics_spark.select("question", "topics").collect():
    if row["topics"] is None:
        continue  # a question without topics contributes nothing to score
    question_topics_mapping.setdefault(row["question"], set()).update(
        re.split(r",\s*", row["topics"])
    )

# Map topics to their index in the topics list (= row index in compatibility_scores)
topic_to_index = {topic: idx for idx, topic in enumerate(topics)}

# Initialize a dictionary to store the final scores
question_scores = defaultdict(float)

# Calculate the average score for each question
for question, question_topics in question_topics_mapping.items():
    topic_scores = []
    for topic in question_topics:
        if topic in topic_to_index:  # Ensure the topic exists in the topics list
            topic_idx = topic_to_index[topic]
            # Average the topic's score over all skills
            avg_topic_score = compatibility_scores[topic_idx].mean()
            topic_scores.append(avg_topic_score)
    # Calculate the overall average score for the question
    if topic_scores:  # Avoid division by zero
        question_scores[question] = sum(topic_scores) / len(topic_scores)

# Display the results
for question, avg_score in question_scores.items():
    print(f"Question: {question} -> Average Compatibility Score: {avg_score:.2f}")

In [0]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import Dataset
import torch

# 1. Define Your Data
# Example labeled data: each entry has a question, a skill, and a compatibility label
# data = [
#     {"question": "Sorting and searching algorithms", "skill": "Sorting algorithms", "label": 1.0},
#     {"question": "Graph traversal", "skill": "Sorting algorithms", "label": 0.0},
#     {"question": "Dynamic programming on trees", "skill": "Graph theory", "label": 1.0},
#     {"question": "Binary search optimization", "skill": "Data structures", "label": 0.5},
#     {"question": "Tree traversal", "skill": "Graph theory", "label": 1.0},
# ]


from pyspark.sql import functions as F

# Collect the distinct skills and topics into Python lists.
# They are recomputed from the source DataFrames because the names `unique_skills`
# and `unique_topics` are rebound to Python lists by the embedding cells in this
# notebook, and `.collect()` does not exist on a list.
skills = [
    row["exploded_skills"]
    for row in job_skills_spark
    .select(F.explode(F.split(F.col("skills"), r",\s*")).alias("exploded_skills"))
    .distinct()
    .collect()
]
topics = [
    row["exploded_topics"]
    for row in questions_topics_spark
    .select(F.explode(F.split(F.col("topics"), r",\s*")).alias("exploded_topics"))
    .distinct()
    .collect()
]

# # test data with labeling option
# data = []
# for topic in topics:
#     for skill in skills:
#         # For demonstration purposes, use a dummy label. You should replace this with actual label logic.
#         label = 1.0 if topic in skill else 0.0  # Example: if topic is in skill, label it as 1.0
#         data.append({"question": topic, "skill": skill, "label": label})

# Every (topic, skill) combination, without a label
data = [{"topic": topic, "skill": skill} for topic in topics for skill in skills]

# Convert to Dataset for easy manipulation
dataset = Dataset.from_list(data)

# 2. Convert Dataset to Sentence Transformers InputExample Format
# NOTE(review): no `label` is passed here. InputExample's label appears to default
# to 0 in sentence-transformers (confirm against the installed version); if so,
# training with a similarity loss would push every pair toward 0.
# Real labels are needed before the fine-tuning step is meaningful.
train_examples = [
    InputExample(texts=[row["topic"], row["skill"]])
    for row in data
]

# 3. Load Pre-trained Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and effective for semantic similarity

# 4. Create DataLoader
# Shuffled mini-batches of 16 InputExample pairs taken from `train_examples`.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# 5. Define Loss Function
# NOTE(review): `train_examples` is built without explicit labels in this cell.
# CosineSimilarityLoss presumably needs a target similarity per pair - confirm
# that meaningful labels are supplied before trusting the fine-tuned model.
train_loss = losses.CosineSimilarityLoss(model)

# 6. Fine-Tune the Model
print("Starting fine-tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,  # Number of epochs for training
    warmup_steps=100,  # Warmup steps for learning rate scheduler
    show_progress_bar=True
)

# Save the fine-tuned model
# `model_save_path` is a relative path, so it resolves against the current working directory.
model_save_path = "fine_tuned_question_skill_model"
model.save(model_save_path)
print(f"Model fine-tuned and saved to {model_save_path}")

# 7. Evaluate the Model
# Small hand-written set of (question, skill) pairs with their expected similarity.
test_data = [
    {"question": "Graph traversal", "skill": "Graph theory", "label": 1.0},
    {"question": "Sorting algorithms", "skill": "Graph theory", "label": 0.0},
    {"question": "Dynamic programming", "skill": "Data structures", "label": 0.5},
]

# Separate the text pairs from their labels; both lists keep the order of test_data.
test_examples = []
test_labels = []
for row in test_data:
    test_examples.append((row["question"], row["skill"]))
    test_labels.append(row["label"])

# Embed each side of a pair separately and compare the two vectors.
print("\nEvaluating the fine-tuned model...")
for i, (question, skill) in enumerate(test_examples):
    question_embedding = model.encode(question)
    skill_embedding = model.encode(skill)
    question_tensor = torch.tensor(question_embedding)
    skill_tensor = torch.tensor(skill_embedding)
    # dim=0: both tensors are single 1-D vectors
    similarity_score = torch.nn.functional.cosine_similarity(
        question_tensor, skill_tensor, dim=0
    ).item()
    print(f"Question: '{question}' | Skill: '{skill}' | Predicted Score: {similarity_score:.2f} | True Label: {test_labels[i]}")

# The predicted scores should align more closely with the labels after fine-tuning.


In [0]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Toy inputs: three question descriptions and three skills
questions = [
    "Dynamic programming on trees",
    "Sorting and searching algorithms",
    "Graph traversal",
]
skills = [
    "Graph theory",
    "Sorting algorithms",
    "Data structures",
]

# Pre-trained sentence transformer (lightweight and efficient)
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per question and per skill
questions_embeddings = model.encode(questions)
skills_embeddings = model.encode(skills)

# Pairwise score matrix: rows follow `questions`, columns follow `skills`
compatibility_scores = cosine_similarity(questions_embeddings, skills_embeddings)

# Print every question with its score against each skill
for i, question in enumerate(questions):
    print(f"Question: {question}")
    for j, skill in enumerate(skills):
        print(f"  Skill: {skill} -> Score: {compatibility_scores[i][j]:.2f}")

### getting all unique topics from all_code_questions_with_topics and all_open_questions_with_topics
* NO NEED

In [0]:
from pyspark.sql import functions as F

# Split the comma-separated 'topics' strings of the coding questions and explode
# them into one topic per row. The pattern is a raw string: in a plain literal
# "\s" is an invalid escape sequence, which newer Python versions warn about.
problems_spark_exploded = (
    problems_spark
    .select("topics")
    .distinct()
    .withColumn("exploded_topics", F.explode(F.split(F.col("topics"), r",\s*")))
)

# Distinct topics of the coding questions
unique_topics_problems = problems_spark_exploded.select("exploded_topics").distinct()
unique_topics_problems.display()

# Same split / explode for the open questions
open_questions_spark_exploded = (
    open_questions_spark
    .select("topics")
    .distinct()
    .withColumn("exploded_topics", F.explode(F.split(F.col("topics"), r",\s*")))
)

# Distinct topics of the open questions
unique_topics_open_questions = open_questions_spark_exploded.select("exploded_topics").distinct()
unique_topics_open_questions.display()

# `union` keeps duplicates, so a topic present in both sources would appear twice;
# de-duplicate so that `unique_topics` really holds unique topics.
unique_topics = unique_topics_problems.union(unique_topics_open_questions).distinct()
unique_topics.display()