In [0]:
import logging
import os

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
from scipy.spatial.distance import cosine

from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("InterviewQuestionSelector")
    .getOrCreate()
)
# Set after the session exists; presumably silences the HuggingFace
# tokenizers parallelism warning -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [0]:
# Read the three raw CSV datasets into Spark DataFrames (jobs first, then the
# two question sets), using the project helper open_csv_file.
job_postings, code_questions, open_questions = (
    open_csv_file(spark, base_path, file_name)
    for base_path, file_name in (
        (JOBS_PATH, 'all_jobpostings.csv'),
        (QUESTIONS_PATH, 'all_code_problems_with_solutions.csv'),
        (QUESTIONS_PATH, 'all_open_questions.csv'),
    )
)

# Normalise a text column: force string type and replace nulls with "".
def preprocess_column_spark(df, column):
    """Return `df` with `column` cast to string and its nulls filled with "".

    Args:
        df: DataFrame exposing `withColumn` and `fillna`.
        column: Name of the column to normalise.

    Returns:
        The transformed DataFrame.
    """
    as_string = df.withColumn(column, col(column).cast("string"))
    return as_string.fillna({column: ""})

# Normalise the free-text column of each dataset that later cells read.
job_postings = preprocess_column_spark(df=job_postings, column='job_summary')
code_questions = preprocess_column_spark(df=code_questions, column='topics')
open_questions = preprocess_column_spark(df=open_questions, column='question')

In [0]:
pip install -q -U google-generativeai

## Job postings: filling in missing skills

In [0]:
from pyspark.sql.functions import udf, concat_ws
from pyspark.sql.types import ArrayType, StringType
import ast

# Define a UDF body that safely parses the stringified skills list
def parse_skills(skills_str):
    """Parse a stringified Python list of skills into a list of strings.

    The result is used as a Spark UDF declared to return array<string>, so
    anything that is not a list/tuple literal yields an empty list instead of
    leaking a scalar or dict into the array column.

    Args:
        skills_str: Text such as "['Python', 'SQL']"; may be None or malformed.

    Returns:
        A list of strings. Empty when the input is missing, malformed, or does
        not evaluate to a list/tuple. None elements are dropped and the
        remaining elements are converted with ``str``.
    """
    try:
        parsed = ast.literal_eval(skills_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    return [str(item) for item in parsed if item is not None]

# Expose parse_skills to Spark as a UDF returning array<string>.
parse_skills_udf = udf(parse_skills, ArrayType(StringType()))

# Parse the raw "skills" text into a temporary array column, flatten it to a
# ", "-joined string, then swap that string in as the new "skills" column.
job_postings = (
    job_postings
    .withColumn("skills_list", parse_skills_udf("skills"))
    .withColumn("skills_string", concat_ws(", ", "skills_list"))
    .drop("skills", "skills_list")
    .withColumnRenamed("skills_string", "skills")
)

In [0]:
import pandas as pd

# Collect the job postings to the driver as a pandas DataFrame.
job_postings_pandas = job_postings.toPandas()
# Count the postings whose skills string is empty.
empty_skills_count = len(
    job_postings_pandas.loc[job_postings_pandas['skills'] == '']
)
print(empty_skills_count)

In [0]:
import google.generativeai as genai
import os
import time
from api_keys import API_KEYS

def infer_skills(job_summary):
    """
    Extracts skills from a job description using the Gemini model.

    Uses the module-level ``model`` object, which must be assigned (with a
    ``generate_content`` method) before this function is called.

    Args:
        job_summary: The job description text. Missing values and blank
            strings are skipped without calling the model.

    Returns:
        A comma-separated string of skills extracted from the job description,
        or '' when the summary is missing/blank or the request fails.
    """

    if pd.isna(job_summary) or str(job_summary).strip() == '':
        return ''

    prompt = f"Infer a comma-separated list of skills required for the following job description:\n{job_summary}"

    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as exc:
        # Best effort: keep the row empty so it can be retried later, but make
        # the failure visible instead of swallowing it silently.
        logging.warning("infer_skills: Gemini request failed: %s", exc)
        return ''
    
# Fill missing skills with Gemini, rotating through the configured API keys.
# Every pass over the keys sends at most 15 requests per key, and the whole
# loop is capped at 3600 seconds of wall-clock time.
start_time = time.time()
running_time = 0
api_keys = list(API_KEYS.values())
empty_skills_rows = job_postings_pandas[(job_postings_pandas['skills'] == '') & (job_postings_pandas['job_summary'].str.strip() != '')]

# `api_keys` in the condition prevents spinning for the full budget when no
# keys are configured (the inner loop would never run).
while api_keys and (running_time < 3600) and (empty_skills_rows.shape[0] > 0):
    for api_key in api_keys:
        # Re-check the budget per key so a long sweep cannot overshoot it.
        running_time = time.time() - start_time
        if running_time >= 3600:
            break

        # Configure Gemini API; infer_skills reads the module-level `model`.
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Rows that still have no skills but do have a job summary
        empty_skills_rows = job_postings_pandas[(job_postings_pandas['skills'] == '') & (job_postings_pandas['job_summary'].str.strip() != '')]
        if empty_skills_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "skills"
        indices_to_update = empty_skills_rows.index[:15]

        # Call infer_skills only on the selected rows. Rows whose request
        # fails stay '' and are selected again on a later pass.
        job_postings_pandas.loc[indices_to_update, 'skills'] = (
            job_postings_pandas.loc[indices_to_update, 'job_summary']
                .apply(infer_skills)
        )
    running_time = time.time() - start_time

In [0]:
# Replace missing values with '' so the emptiness checks below are exact.
job_postings_pandas['skills'] = job_postings_pandas['skills'].fillna('')
job_postings_pandas['job_summary'] = job_postings_pandas['job_summary'].fillna('')

# Postings still without skills, and postings with a blank job summary.
empty_skills_count = len(
    job_postings_pandas.loc[job_postings_pandas['skills'] == '']
)
empty_job_summaries = len(
    job_postings_pandas.loc[job_postings_pandas['job_summary'].str.strip() == '']
)
print("empty strings:", empty_skills_count)
print("empty job summaries:", empty_job_summaries)

In [0]:
from consts import JOBS_PATH
import os

# Mirror the enriched pandas frame into a Spark DataFrame.
job_postings_with_skills = spark.createDataFrame(job_postings_pandas)
# Persist the enriched postings (no index column) for the later cells.
job_postings_pandas.to_csv(
    os.path.join(JOBS_PATH, 'all_jobpostings_with_skills.csv'),
    index=False,
)

## Code questions: filling in missing topics

In [0]:
from pyspark.sql.functions import regexp_replace, concat_ws, split, col, expr

# Rewrite "topics" as a comma-separated string: split on ", ", strip single
# quotes from every element, re-join with ", ", then replace the original
# column with the cleaned one.
code_questions = (
    code_questions
    .withColumn("topics_array", split(col("topics"), ", "))
    .withColumn(
        "topics_array_cleaned",
        expr("transform(topics_array, x -> regexp_replace(x, \"'\", \"\"))"),
    )
    .withColumn("topics_formatted", concat_ws(", ", col("topics_array_cleaned")))
    .drop("topics_array", "topics_array_cleaned", "topics")
    .withColumnRenamed("topics_formatted", "topics")
)

# Collect to the driver for the row-by-row Gemini enrichment below.
code_questions_pandas = code_questions.toPandas()

In [0]:
import google.generativeai as genai
import os
import time
from api_keys import API_KEYS

def extract_topics_from_question(question):
    """
    Extracts the skills a question tests using the Gemini model.

    Uses the module-level ``model`` object, which must be assigned (with a
    ``generate_content`` method) before this function is called.

    Args:
        question: The question text. Missing values and blank strings are
            skipped without calling the model; non-string values are
            converted with ``str`` for the blank check.

    Returns:
        The model's response text with surrounding whitespace removed, or ''
        when the question is missing/blank or the request fails.
    """
    # str() keeps non-string cells (e.g. numbers) from raising AttributeError
    # on .strip(), matching the check used by infer_skills.
    if pd.isna(question) or str(question).strip() == '':
        return ''

    prompt = f"Analyze the following question and identify the specific skills being tested or evaluated. Return the skills as a comma-separated list of skills. If the question does not test any skills, return an empty string. Question: {question}"

    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as exc:
        # Best effort: keep the row empty so it can be retried later, but make
        # the failure visible instead of swallowing it silently.
        logging.warning("extract_topics_from_question: Gemini request failed: %s", exc)
        return ''
   

# Fill missing topics of the code questions with Gemini, rotating through the
# configured API keys. Every pass over the keys sends at most 15 requests per
# key, and the whole loop is capped at 900 seconds of wall-clock time.
start_time = time.time()
running_time = 0
api_keys = list(API_KEYS.values())
empty_topics_rows = code_questions_pandas[(code_questions_pandas['topics'] == '') & (code_questions_pandas['question'].str.strip() != '')]

# `api_keys` in the condition prevents spinning for the full budget when no
# keys are configured (the inner loop would never run).
while api_keys and (running_time < 900) and (empty_topics_rows.shape[0] > 0):
    for api_key in api_keys:
        # Re-check the budget per key so a long sweep cannot overshoot it.
        running_time = time.time() - start_time
        if running_time >= 900:
            break

        # Configure Gemini API; extract_topics_from_question reads the
        # module-level `model`.
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Rows that still have no topics but do have a question text
        empty_topics_rows = code_questions_pandas[(code_questions_pandas['topics'] == '') & (code_questions_pandas['question'].str.strip() != '')]
        if empty_topics_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "topics"
        indices_to_update = empty_topics_rows.index[:15]

        # Call extract_topics_from_question only on the selected rows. Rows
        # whose request fails stay '' and are selected again on a later pass.
        code_questions_pandas.loc[indices_to_update, 'topics'] = (
            code_questions_pandas.loc[indices_to_update, 'question']
                .apply(extract_topics_from_question)
        )
    running_time = time.time() - start_time

In [0]:
# Null topics before any filling.
empty_topics_count = len(
    code_questions_pandas.loc[code_questions_pandas['topics'].isna()]
)
print("nulls:", empty_topics_count)

# Replace nulls with '' and count the topics that are still empty.
code_questions_pandas['topics'] = code_questions_pandas['topics'].fillna('')
empty_topics_count = len(
    code_questions_pandas.loc[code_questions_pandas['topics'] == '']
)

# Same treatment for the question text; blank means empty after stripping.
code_questions_pandas['question'] = code_questions_pandas['question'].fillna('')
empty_questions = len(
    code_questions_pandas.loc[code_questions_pandas['question'].str.strip() == '']
)
print("empty strings:", empty_topics_count)
print("empty questions:", empty_questions)

In [0]:
from consts import QUESTIONS_PATH
import os

# Mirror the enriched pandas frame into a Spark DataFrame.
code_questions_with_topics = spark.createDataFrame(code_questions_pandas)
# Persist the enriched code questions (no index column) for the later cells.
code_questions_pandas.to_csv(
    os.path.join(QUESTIONS_PATH, 'all_code_questions_with_topics.csv'),
    index=False,
)

In [0]:
# Show the enriched code questions in the cell output.
# NOTE(review): `.display()` is presumably supplied by the notebook runtime
# (e.g. Databricks) rather than core PySpark -- confirm before running elsewhere.
code_questions_with_topics.display()

## Open questions: filling in missing topics

In [0]:
import pandas as pd

# Collect the open questions to the driver and start every row with an empty
# "topics" value, to be filled in by the Gemini loop below.
open_questions_pandas = open_questions.toPandas().assign(topics='')

In [0]:
import google.generativeai as genai
import os
from api_keys import API_KEYS
   
# Fill the topics of the open questions with Gemini, rotating through the
# configured API keys. Every pass over the keys sends at most 15 requests per
# key, and the whole loop is capped at 900 seconds of wall-clock time.
# NOTE: `time` and extract_topics_from_question come from earlier cells.
start_time = time.time()
running_time = 0
api_keys = list(API_KEYS.values())
empty_topics_rows = open_questions_pandas[(open_questions_pandas['topics'] == '') & (open_questions_pandas['question'].str.strip() != '')]

# `api_keys` in the condition prevents spinning for the full budget when no
# keys are configured (the inner loop would never run).
while api_keys and (running_time < 900) and (empty_topics_rows.shape[0] > 0):
    for api_key in api_keys:
        # Re-check the budget per key so a long sweep cannot overshoot it.
        running_time = time.time() - start_time
        if running_time >= 900:
            break

        # Configure Gemini API; extract_topics_from_question reads the
        # module-level `model`.
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Rows that still have no topics but do have a question text
        empty_topics_rows = open_questions_pandas[(open_questions_pandas['topics'] == '') & (open_questions_pandas['question'].str.strip() != '')]
        if empty_topics_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "topics"
        indices_to_update = empty_topics_rows.index[:15]

        # Call extract_topics_from_question only on the selected rows. Rows
        # whose request fails stay '' and are selected again on a later pass.
        open_questions_pandas.loc[indices_to_update, 'topics'] = (
            open_questions_pandas.loc[indices_to_update, 'question']
                .apply(extract_topics_from_question)
        )
    running_time = time.time() - start_time

In [0]:
from consts import QUESTIONS_PATH
import os

# Mirror the enriched pandas frame into a Spark DataFrame.
open_questions_with_topics = spark.createDataFrame(open_questions_pandas)
# Persist the enriched open questions (no index column) for the later cells.
open_questions_pandas.to_csv(
    os.path.join(QUESTIONS_PATH, 'all_open_questions_with_topics.csv'),
    index=False,
)

In [0]:
# Show the enriched open questions in the cell output.
# NOTE(review): `.display()` is presumably supplied by the notebook runtime
# (e.g. Databricks) rather than core PySpark -- confirm before running elsewhere.
open_questions_with_topics.display()

# Demonstrating the model on 50 random jobs

In [0]:
import pandas as pd
import os
from consts import JOBS_PATH, DATA_PATH

# Load the job postings that were enriched with skills
jobs_data = pd.read_csv(os.path.join(JOBS_PATH, "all_jobpostings_with_skills.csv"))

# Draw a reproducible random sample of up to 50 postings for the demo.
# Capping n at the row count avoids the ValueError pandas raises when asked
# for more rows than exist (sampling is without replacement by default).
jobs_sample = jobs_data.sample(n=min(50, len(jobs_data)), random_state=42)

# Save the sample so the following cells can reload it
jobs_sample.to_csv(os.path.join(JOBS_PATH, "jobs_sample.csv"), index=False)

## Topics & skills embeddings

In [0]:
pip install sentence-transformers

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, lit, udf, array, broadcast
from pyspark.sql.types import ArrayType, FloatType, DoubleType
from sentence_transformers import SentenceTransformer
from calculate_heuristic_score import calculate_score
from consts import DATA_PATH, QUESTIONS_PATH, MID_CALC_PATH, open_csv_file

# Reload the sampled jobs and the topic-enriched question sets into Spark.
jobs_sample = open_csv_file(spark, JOBS_PATH, "jobs_sample.csv")
code_questions = open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")
open_questions = open_csv_file(spark, QUESTIONS_PATH, "all_open_questions_with_topics.csv")

# One output row per topic: split "topics" on "," and explode the pieces.
# NOTE(review): the split is on "," only, so pieces of a ", "-joined string
# keep their leading space -- confirm downstream matching tolerates that.
exploded_topic = explode(split(col("topics"), ","))
code_questions_exploded = code_questions.withColumn("topic", exploded_topic)
open_questions_exploded = open_questions.withColumn("topic", exploded_topic)

# Sentence-embedding model, kept at module level for generate_embedding.
# NOTE: this rebinds the `model` name used earlier for the Gemini client.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Spark UDF producing an array<float> embedding for a text value
@udf(returnType=ArrayType(FloatType()))
def generate_embedding(text):
    """Encode `text` with the module-level `model` and return it as a list."""
    embedding = model.encode(text)
    return embedding.tolist()
# Distinct topics per question set, each with its embedding vector.
unique_topics_code = (
    code_questions_exploded
    .select("topic")
    .distinct()
    .withColumn("topic_embedding", generate_embedding(col("topic")))
)
unique_topics_open = (
    open_questions_exploded
    .select("topic")
    .distinct()
    .withColumn("topic_embedding", generate_embedding(col("topic")))
)

# Collect everything to the driver as pandas frames.
code_questions_exploded_df = code_questions_exploded.toPandas()
open_questions_exploded_df = open_questions_exploded.toPandas()
unique_topics_code_df = unique_topics_code.toPandas()
unique_topics_open_df = unique_topics_open.toPandas()

# Write the intermediate results under MID_CALC_PATH (created if missing).
os.makedirs(MID_CALC_PATH, exist_ok=True)
for pandas_frame, csv_name in (
    (code_questions_exploded_df, "code_questions_exploded.csv"),
    (open_questions_exploded_df, "open_questions_exploded.csv"),
    (unique_topics_code_df, "unique_topics_code.csv"),
    (unique_topics_open_df, "unique_topics_open.csv"),
):
    pandas_frame.to_csv(os.path.join(MID_CALC_PATH, csv_name), index=False)

In [0]:
from pyspark.sql.functions import rand
from calculate_heuristic_score import calculate_score

# Reload the sampled job postings from CSV into a Spark DataFrame.
jobs_sample = open_csv_file(spark, JOBS_PATH, "jobs_sample.csv")
# calculate_score (imported from calculate_heuristic_score) returns a pair,
# unpacked here into the scores for code questions and for open questions.
# Its internals are not visible in this notebook.
code_scores, open_scores = calculate_score(jobs_sample, spark)

In [0]:
# Preview the first 70 topic/skill/similarity rows of each score frame
# (code questions first, then open questions).
for score_frame in (code_scores, open_scores):
    score_frame.select("topics", "skills", "similarity").limit(70).display()

In [0]:
from consts import MID_CALC_PATH

# Export the raw similarity column of the code-question scores.
code_scores_df = code_scores.select("similarity").toPandas()
code_scores_df.to_csv(
    os.path.join(MID_CALC_PATH, "code_questions_similarity.csv"),
    index=False,
)

# Export the raw similarity column of the open-question scores.
open_scores_df = open_scores.select("similarity").toPandas()
open_scores_df.to_csv(
    os.path.join(MID_CALC_PATH, "open_questions_similarity.csv"),
    index=False,
)

In [0]:
from pyspark.sql.functions import col, when, expr

# Define the transformation function
def apply_root_transform(df, col_name):
    """Distribute the scores so they'd be further apart.

    Replaces `col_name` with a sign-preserving square root: x ** 0.5 when
    x >= 0, and -((-x) ** 0.5) otherwise.

    Args:
        df: DataFrame exposing `withColumn`.
        col_name: Name of the numeric column to transform in place.

    Returns:
        The DataFrame with `col_name` overwritten by the transformed values.
    """
    value = col(col_name)
    root_of_non_negative = value ** 0.5
    # Parenthesised explicitly: negate, take the root, negate again.
    root_of_negative = -((-value) ** 0.5)
    return df.withColumn(
        col_name,
        when(value >= 0, root_of_non_negative).otherwise(root_of_negative),
    )

# Root-transform the "similarity" column of both score frames.
code_scores_after_transformation = apply_root_transform(df=code_scores, col_name="similarity")
open_scores_after_transformation = apply_root_transform(df=open_scores, col_name="similarity")

In [0]:
# Optional preview: first 70 topic/skill/similarity rows after the transform
# (code questions first, then open questions).
for transformed_frame in (code_scores_after_transformation, open_scores_after_transformation):
    transformed_frame.select("topics", "skills", "similarity").limit(70).display()

In [0]:
# Export the transformed similarity column of the code-question scores.
code_scores_after_transformation_df = (
    code_scores_after_transformation.select("similarity").toPandas()
)
code_scores_after_transformation_df.to_csv(
    os.path.join(MID_CALC_PATH, "code_questions_transformed_similarity.csv"),
    index=False,
)

# Export the transformed similarity column of the open-question scores.
open_scores_after_transformation_df = (
    open_scores_after_transformation.select("similarity").toPandas()
)
open_scores_after_transformation_df.to_csv(
    os.path.join(MID_CALC_PATH, "open_questions_transformed_similarity.csv"),
    index=False,
)

## Heuristic to match questions to jobs
Questions with the highest heuristic scores are the most likely to appear in the interview.

In [0]:
from pyspark.sql.functions import col, lit, udf, abs, coalesce, when
from pyspark.sql.types import FloatType
from scipy.spatial.distance import cosine

# Numeric codes for the textual difficulty labels
difficulty_map = {"Easy": 0, "Medium": 1, "Hard": 2}

# Replace the "difficulty" label with its numeric code. There is no fallback
# branch, so labels outside the map become null.
difficulty_label = col("difficulty")
difficulty_as_number = (
    when(difficulty_label == "Easy", difficulty_map["Easy"])
    .when(difficulty_label == "Medium", difficulty_map["Medium"])
    .when(difficulty_label == "Hard", difficulty_map["Hard"])
)
code_with_heuristic = code_scores_after_transformation.withColumn(
    "difficulty", difficulty_as_number
).cache()

# difficulty_match = 1 - |difficulty - level| / 2, with nulls replaced by 0.5.
# NOTE(review): assumes "level" is numeric on the same 0-2 scale -- confirm.
difficulty_gap = abs(col("difficulty") - col("level"))
code_with_heuristic = (
    code_with_heuristic
    .withColumn("difficulty_match", 1 - difficulty_gap / 2)
    .withColumn("difficulty_match", coalesce(col("difficulty_match"), lit(0.5)))
    .drop("difficulty")
    .cache()
)

# Open questions get a constant difficulty_match of 0.5.
open_with_heuristic = (
    open_scores_after_transformation
    .withColumn("difficulty_match", lit(0.5))
    .drop("difficulty")
    .cache()
)

# Scale the acceptance of code questions by the column maximum.
max_acceptance = code_with_heuristic.agg({"acceptance": "max"}).collect()[0][0]
code_with_heuristic = (
    code_with_heuristic
    .withColumn("normalized_acceptance", col("acceptance") / max_acceptance)
    .drop("acceptance")
    .cache()
)
# Open questions get a constant normalized_acceptance of 0.5.
open_with_heuristic = open_with_heuristic.withColumn(
    "normalized_acceptance", lit(0.5)
).cache()

# Calculate Heuristic Score
# NOTE: this definition rebinds the name `calculate_score`, which earlier cells
# import from calculate_heuristic_score.
def calculate_score(difficulty, similarity, acceptance):
    """Combine the three features into one weighted heuristic score.

    Args:
        difficulty: Difficulty-match value (weight 0.3).
        similarity: Similarity value (weight 0.5).
        acceptance: Normalized acceptance value (weight 0.2).

    Returns:
        0.3 * difficulty + 0.5 * similarity + 0.2 * acceptance, or None when
        any input is None. Returning None lets a null propagate instead of
        raising TypeError when the function runs as a Spark UDF.
    """
    if difficulty is None or similarity is None or acceptance is None:
        return None
    return 0.3 * difficulty + 0.5 * similarity + 0.2 * acceptance
# Wrap the scoring function as a Spark UDF returning a float column.
calculate_score_udf = udf(calculate_score, FloatType())

# Column means of the CODE-question frame, used to impute its remaining nulls.
# Fix: mean_similarity was previously computed from open_with_heuristic, so
# code questions were imputed with the open-question mean.
mean_difficulty_match = code_with_heuristic.agg({"difficulty_match": "mean"}).collect()[0][0]
mean_similarity = code_with_heuristic.agg({"similarity": "mean"}).collect()[0][0]
mean_acceptance = code_with_heuristic.agg({"normalized_acceptance": "mean"}).collect()[0][0]

# Impute nulls with the means, compute the score, then drop the feature
# columns so only heuristic_score remains alongside the other columns.
code_with_heuristic = code_with_heuristic.withColumn("difficulty_match", coalesce(col("difficulty_match"), lit(mean_difficulty_match))) \
    .withColumn("similarity", coalesce(col("similarity"), lit(mean_similarity))) \
    .withColumn("normalized_acceptance", coalesce(col("normalized_acceptance"), lit(mean_acceptance))) \
    .withColumn("heuristic_score",
    calculate_score_udf(
        col("difficulty_match"),
        col("similarity"),
        col("normalized_acceptance"),
    ),
).drop("difficulty_match", "similarity", "normalized_acceptance").cache()
display(code_with_heuristic.head(70))

# Open questions: only similarity can be null here, because difficulty_match
# and normalized_acceptance are set to the constant 0.5 earlier in this cell.
mean_similarity = open_with_heuristic.agg({"similarity": "mean"}).collect()[0][0]
open_with_heuristic = open_with_heuristic.withColumn("similarity", coalesce(col("similarity"), lit(mean_similarity))) \
    .withColumn("heuristic_score",
    calculate_score_udf(
        col("difficulty_match"),
        col("similarity"),
        col("normalized_acceptance"),
    ),
).drop("difficulty_match", "similarity", "normalized_acceptance").cache()
display(open_with_heuristic.head(70))

In [0]:
# Select Top Questions for Each Job
from pyspark.sql import Window
from pyspark.sql.functions import row_number
import pandas as pd

# Every column of the sampled-jobs CSV identifies a job, so ranking is done
# within the partition formed by all of them, best heuristic score first.
job_cols = pd.read_csv(os.path.join(JOBS_PATH, "jobs_sample.csv"), header=0).columns
window_spec = (
    Window
    .partitionBy(*job_cols)
    .orderBy(col("heuristic_score").desc())
)

# Keep the 20 best-ranked questions per job for each question type.
rank_within_job = row_number().over(window_spec)
is_in_top_20 = col("rank") <= 20

top_code_questions = code_with_heuristic.withColumn("rank", rank_within_job).filter(is_in_top_20)
top_open_questions = open_with_heuristic.withColumn("rank", rank_within_job).filter(is_in_top_20)

In [0]:
from consts import DATA_PATH
# Collect the top-ranked code questions and write them under DATA_PATH.
top_code_questions_df = top_code_questions.toPandas()
top_code_questions_df.to_csv(
    os.path.join(DATA_PATH, "top_code_questions.csv"),
    index=False,
)

# Collect the top-ranked open questions and write them under DATA_PATH.
top_open_questions_df = top_open_questions.toPandas()
top_open_questions_df.to_csv(
    os.path.join(DATA_PATH, "top_open_questions.csv"),
    index=False,
)