In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
import os
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Create (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("InterviewQuestionSelector")
    .getOrCreate()
)
# Turn off tokenizer-level parallelism via the TOKENIZERS_PARALLELISM env var
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [0]:
# Load datasets into Spark DataFrames
# open_csv_file is imported from the project-local `consts` module; each call
# receives the Spark session, a base-path constant and a CSV file name.
# Job postings come from JOBS_PATH, both question sets from QUESTIONS_PATH.
job_postings = open_csv_file(spark, JOBS_PATH, 'all_jobpostings.csv')
code_questions = open_csv_file(spark, QUESTIONS_PATH, 'all_code_problems_with_solutions.csv')
open_questions = open_csv_file(spark, QUESTIONS_PATH, 'all_open_questions.csv')

# Preprocessing helper: coerce a column to string and blank out missing values
def preprocess_column_spark(df, column):
    """Return `df` with `column` cast to string and its nulls replaced by "".

    Args:
        df: Spark DataFrame to clean.
        column: Name of the column to normalise.

    Returns:
        A new DataFrame in which only `column` has been changed.
    """
    as_string = col(column).cast("string")
    return df.withColumn(column, as_string).fillna({column: ""})

# Preprocess columns in the datasets
# Only the text column that is embedded later is normalised for each frame:
# it is cast to string and its nulls are replaced by "" (see
# preprocess_column_spark).
job_postings = preprocess_column_spark(job_postings, 'job_summary')
code_questions = preprocess_column_spark(code_questions, 'topics')
open_questions = preprocess_column_spark(open_questions, 'question')

In [0]:
pip install -q -U google-generativeai

In [0]:
# import os
# import google.generativeai as genai

# # Set the API key securely
# os.environ['GOOGLE_API_KEY'] = '<YOUR_API_KEY>'  # placeholder: never commit a real key

# genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# # List available models
# for m in genai.list_models():
#   if 'generateContent' in m.supported_generation_methods:
#     print(m.name)

# # Create model and generate content
# model = genai.GenerativeModel('gemini-1.5-flash')
# response = model.generate_content("What is the meaning of life?")
# print(response.text)

In [0]:
from pyspark.sql.functions import udf, concat_ws
from pyspark.sql.types import ArrayType, StringType
import ast

# Define a UDF helper to safely parse a stringified list into a list of strings
def parse_skills(skills_str):
    """Parse text such as "['Python', 'SQL']" into a list of strings.

    The function backs a Spark UDF declared as array<string>, so it always
    returns a list of str, whatever the input looks like.

    Args:
        skills_str: Text holding a Python list/tuple literal, or None.

    Returns:
        list[str]: The parsed items converted with str() (None items are
        skipped). An empty list is returned when the input is missing,
        cannot be parsed, or parses to something that is not a list/tuple.
    """
    if not skills_str:
        return []
    try:
        parsed = ast.literal_eval(skills_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval raises these for malformed, non-string or overly
        # deep input; treat all of them as "no skills"
        return []
    # A bare literal such as "42" or "'python'" parses fine but is not a list
    if not isinstance(parsed, (list, tuple)):
        return []
    return [str(item) for item in parsed if item is not None]

# Wrap the parser as a Spark UDF that returns array<string>
parse_skills_udf = udf(parse_skills, ArrayType(StringType()))

# Parse the raw `skills` text into a list, flatten that list into a single
# comma-separated string, then put it back under the original `skills` name
job_postings = (
    job_postings
    .withColumn("skills_list", parse_skills_udf("skills"))
    .withColumn("skills_string", concat_ws(", ", "skills_list"))
    .drop("skills", "skills_list")
    .withColumnRenamed("skills_string", "skills")
)
job_postings.display()

In [0]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import StringType
import google.generativeai as genai
import os

# Configure Gemini API
# The key is never stored in the notebook: supply it through the
# GOOGLE_API_KEY environment variable (e.g. populated from a secret store).
# It is read once here on the driver so the value is serialized together with
# init_genai when Spark ships the function to executors, whose own
# environment is not guaranteed to contain the variable.
# NOTE: any key that was previously committed in this notebook should be
# treated as compromised and rotated.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')


def resolve_api_key(api_key=None, env=None):
    """Return the Gemini API key to use, or fail loudly when none is set.

    Args:
        api_key: Explicit key; takes precedence when truthy.
        env: Mapping consulted for 'GOOGLE_API_KEY' when `api_key` is empty.
            Defaults to os.environ.

    Returns:
        str: The resolved key.

    Raises:
        RuntimeError: If neither `api_key` nor the mapping provides a key.
    """
    source = os.environ if env is None else env
    key = api_key or source.get('GOOGLE_API_KEY')
    if not key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not configured; set it in the environment "
            "instead of hardcoding it in the notebook."
        )
    return key


def init_genai(api_key=None):
    """Initialize the Gemini API client.

    Args:
        api_key: Optional explicit key. When omitted, the key captured on the
            driver (GOOGLE_API_KEY) or the local environment is used.

    Returns:
        A genai.GenerativeModel for 'gemini-1.5-flash'.

    Raises:
        RuntimeError: If no API key can be resolved.
    """
    genai.configure(api_key=resolve_api_key(api_key or GOOGLE_API_KEY))
    return genai.GenerativeModel('gemini-1.5-flash')

def infer_skills_partition(rows):
    """Infer skills with Gemini for every row of one partition.

    Written for ``rdd.mapPartitions``: one model client is created per call
    and reused for all rows it receives.

    Args:
        rows: Iterable of rows exposing ``job_summary`` and ``skills``.

    Returns:
        Iterator of ``(job_summary, skills)`` tuples. ``skills`` is the text
        returned by the model, or the row's existing ``skills`` value when the
        summary is blank or the model call raises.
    """
    print("Starting partition processing...")
    gemini = init_genai()
    inferred = []

    for row in rows:
        print(f"Processing row: {row}")  # Debug input row
        summary_text = row.job_summary or ""

        if summary_text.strip():
            try:
                prompt = f"Extract a comma-separated list of technical, professional, and soft skills required for this job description: {summary_text}"
                response = gemini.generate_content(prompt)
                # Take the text of the first part of the first candidate
                extracted_skills = response.candidates[0].content.parts[0].text.strip()
                print(f"Extracted skills: {extracted_skills}")  # Debug response
                skills = extracted_skills
            except Exception as e:
                print(f"Error processing row: {e}")
                skills = row.skills  # Keep existing skills on error
        else:
            # Nothing to send to the model: keep whatever skills the row has
            skills = row.skills

        inferred.append((row.job_summary, skills))

    return iter(inferred)


# Apply to DataFrame
# Only the first 10 postings are processed here (limit(10)), so each run makes
# at most ten model calls. mapPartitions hands every partition's rows to
# infer_skills_partition, which yields (job_summary, skills) tuples; the
# resulting DataFrame therefore keeps just those two columns.
job_postings_sample = job_postings.limit(10)
job_postings_sample_rdd = job_postings_sample.rdd.mapPartitions(infer_skills_partition)
job_postings_with_skills_sample = spark.createDataFrame(
    job_postings_sample_rdd, schema=['job_summary', 'skills']
)
job_postings_with_skills_sample.show()


In [0]:
# Show the Gemini-enriched sample (the only frame the previous cell defines)
job_postings_with_skills_sample.display()

In [0]:
import os
# Report only whether the key is configured; never echo the secret itself,
# because notebook outputs are saved and shared with the notebook
print("GOOGLE_API_KEY is set" if os.environ.get('GOOGLE_API_KEY') else "GOOGLE_API_KEY is not set")

In [0]:
# Load the SentenceTransformer model
# The 'all-MiniLM-L6-v2' checkpoint is instantiated once and the same object
# is passed to every generate_embeddings call in this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings and merge with the DataFrame
def generate_embeddings(df, column, model):
    """Attach a `<column>_embedding` array column to `df`.

    The distinct non-null values of `column` are collected to the driver,
    encoded with `model`, and joined back onto `df`. Uses the notebook-level
    `spark` session to build the lookup DataFrame.

    Args:
        df: Spark DataFrame containing `column` as a string column.
        column: Name of the text column to embed.
        model: Object exposing ``encode(texts, batch_size=..., show_progress_bar=...)``
            (a SentenceTransformer in this notebook).

    Returns:
        `df` with one extra column ``f"{column}_embedding"`` of type
        array<float>. The row count of `df` is preserved; rows whose `column`
        is null get a null embedding.
    """
    embedding_column = f"{column}_embedding"

    # Encode each distinct value once. Joining back on a list that still
    # contains duplicates (e.g. many "" summaries) would multiply rows, and
    # None values cannot be encoded.
    texts = [
        row[column]
        for row in df.select(column).where(col(column).isNotNull()).distinct().collect()
    ]

    # Generate embeddings using the model (skip the call when there is no text)
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True) if texts else []

    # Build a (text, embedding) lookup table
    embeddings_df = spark.createDataFrame(
        [(text, embedding.tolist()) for text, embedding in zip(texts, embeddings)],
        schema=StructType([
            StructField(column, StringType(), True),
            StructField(embedding_column, ArrayType(FloatType()), True)
        ])
    )

    # Left join so rows without an embedding (null text) are kept, not dropped
    return df.join(embeddings_df, on=column, how="left")

# Generate embeddings for job summaries, code questions, and open questions
# Each call returns the input frame joined with a new `<column>_embedding`
# column, so after this cell the frames carry job_summary_embedding,
# skills_embedding, topics_embedding and question_embedding respectively.
job_postings = generate_embeddings(job_postings, 'job_summary', model)
job_postings = generate_embeddings(job_postings, 'skills', model)
code_questions = generate_embeddings(code_questions, 'topics', model)
open_questions = generate_embeddings(open_questions, 'question', model)

In [0]:
# Render the three DataFrames to inspect the newly added embedding columns
display(job_postings)
display(code_questions)
display(open_questions)

In [0]:
# Combine job embeddings (job_summary and skills) for topic relevance
# Spark cannot apply `+` to two array columns, so the two vectors are summed
# element by element with the SQL higher-order function zip_with. The result
# has the same length as its inputs, which keeps it comparable with the
# question embeddings produced by the same model. A null input array yields a
# null result.
# NOTE(review): element-wise sum is assumed to be the intended "combination"
# (rather than concatenation) - confirm.
from pyspark.sql.functions import expr

job_postings = job_postings.withColumn(
    "jobposting_embedding",
    expr("zip_with(job_summary_embedding, skills_embedding, (a, b) -> a + b)")
)

In [0]:
from pyspark.sql.functions import col, udf
from scipy.spatial.distance import cosine
from pyspark.sql.types import FloatType, ArrayType

# UDF for cosine similarity
def _cosine_similarity(e1, e2):
    """Cosine similarity of two embedding lists as a plain Python float.

    Args:
        e1, e2: Lists of numbers (Spark array columns), possibly None or empty.

    Returns:
        float: 1 - cosine distance; 0.0 when either input is missing/empty or
        the similarity is undefined (NaN).
    """
    import math  # local import keeps the function self-contained on executors

    if not e1 or not e2:
        return 0.0  # must be a float: an int would not match FloatType
    # scipy returns a numpy scalar; convert it so Spark can serialize the value
    similarity = float(1 - cosine(e1, e2))
    return 0.0 if math.isnan(similarity) else similarity


cosine_similarity_udf = udf(_cosine_similarity, FloatType())

# Compute similarity-based weights
# The embedding columns produced by generate_embeddings are named
# `<column>_embedding` (singular), so those are the names referenced here.
job_postings = job_postings.withColumn(
    "attention_weight",
    cosine_similarity_udf(col("job_summary_embedding"), col("skills_embedding"))
)

# UDF for weighted combination using attention weights
def attention_weighted_average(e1, e2, weight):
    """Blend two embeddings element-wise as (1 - weight) * e1 + weight * e2.

    Args:
        e1: First embedding (list of numbers), possibly None or empty.
        e2: Second embedding (list of numbers), possibly None or empty.
        weight: Share given to `e2`; may be None when the upstream weight
            column is null.

    Returns:
        list[float] with one value per zipped pair of elements, or None when
        either embedding is missing/empty or `weight` is None.
    """
    if not e1 or not e2 or weight is None:
        return None  # Handle null cases, including a null weight
    return [
        float((1 - weight) * e1_val + weight * e2_val)
        for e1_val, e2_val in zip(e1, e2)
    ]

# Register the blending function as a UDF returning array<float>
attention_weighted_avg_udf = udf(attention_weighted_average, ArrayType(FloatType()))

# Apply weighted averaging
# Column names follow the `<column>_embedding` (singular) convention used when
# the embeddings were generated.
job_postings = job_postings.withColumn(
    "job_embedding",
    attention_weighted_avg_udf(
        col("job_summary_embedding"),
        col("skills_embedding"),
        col("attention_weight")
    )
)

# Drop the intermediate attention_weight column if no longer needed
job_postings = job_postings.drop("attention_weight")


In [0]:
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import FloatType
from scipy.spatial.distance import cosine

from itertools import chain
from pyspark.sql.functions import abs as sql_abs, create_map

# Map difficulty levels to numeric values
difficulty_map = {"Easy": 0, "Medium": 1, "Hard": 2}

# Change difficulty column to numeric
# A Spark Column has no `.map`; instead the dict is turned into a literal map
# column and indexed by the difficulty label. Labels missing from the dict
# become null.
difficulty_lookup = create_map(
    *[lit(item) for item in chain.from_iterable(difficulty_map.items())]
)
code_questions = code_questions.withColumn(
    "difficulty", difficulty_lookup[col("difficulty")]
)

# Each question will be initially considered for each job posting.
# The cross join runs after the conversion above so the joined frame carries
# the numeric difficulty rather than the original label.
jobs_with_code_questions = job_postings.crossJoin(code_questions)
jobs_with_open_questions = job_postings.crossJoin(open_questions)

# Match question's difficulty to job posting's level
# Python's builtin abs() does not work on a Column; use the Spark SQL function.
# NOTE(review): assumes `level` is numeric on the same 0-2 scale as the mapped
# difficulty - confirm against the job postings schema.
jobs_with_code_questions = jobs_with_code_questions.withColumn(
    "difficulty_match",
    1 - sql_abs(col("difficulty") - col("level")) / 2
)

# Calculate cosine similarity between embeddings
def calculate_similarity(embedding1, embedding2):
    """Cosine similarity of two embeddings as a plain Python float.

    Args:
        embedding1: Sequence of numbers, or None.
        embedding2: Sequence of numbers, or None.

    Returns:
        float: 1 - cosine distance. 0.0 is returned when either embedding is
        None or empty, or when the similarity is undefined (NaN), so that
        undefined scores never outrank real ones when sorting.
    """
    import math  # local import keeps the function self-contained on executors

    if embedding1 is None or embedding2 is None:
        return 0.0
    if len(embedding1) == 0 or len(embedding2) == 0:
        return 0.0
    # scipy returns a numpy scalar, which a FloatType UDF cannot hand back to
    # Spark; convert to a builtin float
    similarity = float(1 - cosine(embedding1, embedding2))
    return 0.0 if math.isnan(similarity) else similarity
# Register the similarity function as a FloatType UDF
similarity_udf = udf(calculate_similarity, FloatType())

# Add topic similarity scores
# col() accepts a single column name, so each embedding is wrapped separately
# and the two Columns are passed as the UDF's two arguments.
jobs_with_code_questions = jobs_with_code_questions.withColumn(
    "emb_similarity",
    similarity_udf(col("jobposting_embedding"), col("topics_embedding"))
)
jobs_with_open_questions = jobs_with_open_questions.withColumn(
    "emb_similarity",
    similarity_udf(col("jobposting_embedding"), col("question_embedding"))
)

# Scale acceptance by the largest acceptance value among the code questions
max_acceptance = code_questions.agg({"acceptance": "max"}).first()[0]
jobs_with_code_questions = jobs_with_code_questions.withColumn(
    "normalized_acceptance",
    col("acceptance") / max_acceptance,
)

# Calculate Heuristic Score
def calculate_score(difficulty, similarity, acceptance):
    """Weighted heuristic score for a (job, question) pair.

    score = 0.3 * difficulty + 0.5 * similarity + 0.2 * acceptance

    Args:
        difficulty: Difficulty-match component, or None.
        similarity: Embedding-similarity component, or None.
        acceptance: Normalised acceptance component, or None.

    Returns:
        float: The weighted sum. A None component (a null cell in Spark)
        contributes 0 instead of raising inside the UDF.
    """
    difficulty, similarity, acceptance = (
        0.0 if value is None else value
        for value in (difficulty, similarity, acceptance)
    )
    return float(0.3 * difficulty + 0.5 * similarity + 0.2 * acceptance)
# Expose the scoring function to Spark as a FloatType UDF
calculate_score_udf = udf(calculate_score, FloatType())

# Score every (job, code question) pair from its three component columns
jobs_with_code_questions = jobs_with_code_questions.withColumn(
    "heuristic_score",
    calculate_score_udf("difficulty_match", "emb_similarity", "normalized_acceptance"),
)
display(jobs_with_code_questions)

# Score every (job, open question) pair. Open questions have neither a
# difficulty_match nor an acceptance column, so both components are supplied
# as the literal 0.0 and only the embedding similarity contributes.
jobs_with_open_questions = jobs_with_open_questions.withColumn(
    "heuristic_score",
    calculate_score_udf(
        lit(0.0),  # No difficulty match for open questions
        col("emb_similarity"),
        lit(0.0)  # No acceptance column in open questions
    ),
)
display(jobs_with_open_questions)

In [0]:
# Select Top Questions for Each Job
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Order each job's candidate questions from highest to lowest heuristic score
window_spec = (
    Window
    .partitionBy("job_id")
    .orderBy(col("heuristic_score").desc())
)

# Keep the ten best-ranked code questions per job
top_code_questions = (
    jobs_with_code_questions
    .withColumn("rank", row_number().over(window_spec))
    .where(col("rank") <= 10)
)

# Keep the ten best-ranked open questions per job
top_open_questions = (
    jobs_with_open_questions
    .withColumn("rank", row_number().over(window_spec))
    .where(col("rank") <= 10)
)

display(top_code_questions)
display(top_open_questions)