In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
from scipy.spatial.distance import cosine
import numpy as np
import os
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Create (or reuse) the Spark session shared by every cell of this notebook
spark = SparkSession.builder.appName("InterviewQuestionSelector").getOrCreate()
# NOTE(review): presumably silences the HuggingFace tokenizers parallelism warning
# once sentence-transformers is used further down — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [0]:
# Load the three raw datasets through the project helper open_csv_file (imported from consts).
# The results are used as Spark DataFrames below (withColumn / fillna / toPandas).
job_postings = open_csv_file(spark, JOBS_PATH, 'all_jobpostings.csv')
code_questions = open_csv_file(spark, QUESTIONS_PATH, 'all_code_problems_with_solutions.csv')
open_questions = open_csv_file(spark, QUESTIONS_PATH, 'all_open_questions.csv')

# Preprocessing function to handle missing values and ensure string type
def preprocess_column_spark(df, column):
    """Return ``df`` with ``column`` cast to string and its nulls replaced by "".

    Args:
        df: Spark DataFrame that contains ``column``.
        column: Name of the column to normalise.

    Returns:
        A new DataFrame; every other column is left untouched.
    """
    as_string = col(column).cast("string")
    return df.withColumn(column, as_string).fillna({column: ""})

# Normalise (string type, no nulls) the one text column of each dataset that
# the fill-in cells below filter on or send to the model
job_postings = preprocess_column_spark(job_postings, 'job_summary')
code_questions = preprocess_column_spark(code_questions, 'topics')
open_questions = preprocess_column_spark(open_questions, 'question')

In [0]:
pip install -q -U google-generativeai

## Job postings: filling in missing skills

In [0]:
from pyspark.sql.functions import udf, concat_ws
from pyspark.sql.types import ArrayType, StringType
import ast

# Define a UDF to safely parse the string to a list
def parse_skills(skills_str):
    """Parse a stringified Python list of skills into a list of strings.

    The result is always a list of ``str`` so that it fits the
    ``ArrayType(StringType())`` return type declared for the UDF.

    Args:
        skills_str: Text such as "['Python', 'SQL']"; may be None or blank.

    Returns:
        The parsed skills as a list of strings. An empty list is returned
        when the input is missing or blank, is not a valid Python literal,
        or evaluates to something other than a list/tuple (e.g. a bare
        number or a single quoted string).
    """
    if not isinstance(skills_str, str) or not skills_str.strip():
        return []
    try:
        parsed = ast.literal_eval(skills_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed or oversized input
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    # Drop null entries and coerce the rest so every element is a string
    return [str(item) for item in parsed if item is not None]

# Wrap the parser as a Spark UDF that yields an array of strings
parse_skills_udf = udf(parse_skills, ArrayType(StringType()))

# Parse the raw "skills" text into a list, flatten the list into a single
# ", "-separated string, and let that string take over the "skills" name
job_postings = (
    job_postings
    .withColumn("skills_list", parse_skills_udf("skills"))
    .withColumn("skills_string", concat_ws(", ", "skills_list"))
    .drop("skills", "skills_list")
    .withColumnRenamed("skills_string", "skills")
)

In [0]:
import pandas as pd

# Materialise the job postings as a pandas DataFrame
job_postings_pandas = job_postings.toPandas()

# How many postings have no skills at all?
has_no_skills = job_postings_pandas['skills'] == ''
empty_skills_count = int(has_no_skills.sum())
print(empty_skills_count)

9551


In [0]:
import google.generativeai as genai
import os
import time
from api_keys import API_KEYS

def infer_skills(job_summary):
    """
    Extracts skills from a job description using the Gemini model.

    Uses the module-level ``model`` object, which the fill-in loop below
    re-creates for every API key before calling this function.

    Args:
        job_summary: The job description text.

    Returns:
        A comma-separated string of skills extracted from the job description,
        or '' when the summary is missing/blank or the request fails.
    """
    import logging  # local import so this cell does not depend on another cell's imports

    if pd.isna(job_summary) or str(job_summary).strip() == '':
        return ''

    prompt = f"Infer a comma-separated list of skills required for the following job description:\n{job_summary}"

    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # Deliberately best-effort: '' leaves the row empty, so the fill-in loop
        # selects it again on a later pass. Log the cause instead of hiding it,
        # otherwise a bad key or exhausted quota is indistinguishable from progress.
        logging.getLogger(__name__).warning(
            "infer_skills: request failed (%s: %s)", type(e).__name__, e
        )
        return ''
    
# Wall-clock bookkeeping for the fill-in loop below
start_time = time.time()
running_time = 0
# Rows that still need skills: empty 'skills' but a non-blank 'job_summary'
empty_skills_rows = job_postings_pandas[(job_postings_pandas['skills'] == '') & (job_postings_pandas['job_summary'].str.strip() != '')]

# Keep cycling through the API keys until 3600 s have elapsed or nothing is left to fill.
# NOTE(review): running_time is refreshed only after a full pass over all keys,
# so the time budget can be overshot by up to one pass.
while (running_time < 3600) and (empty_skills_rows.shape[0] > 0):
    for api_key in API_KEYS.values():
        # Configure Gemini API
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
        model = genai.GenerativeModel('gemini-1.5-flash')  # global read by infer_skills

        # Filter the rows where the "skills" column is empty
        empty_skills_rows = job_postings_pandas[(job_postings_pandas['skills'] == '') & (job_postings_pandas['job_summary'].str.strip() != '')]
        if empty_skills_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "skills"
        # NOTE(review): the batch of 15 per key is presumably sized to a per-key rate limit — confirm
        indices_to_update = empty_skills_rows.index[:15]

        # Run infer_skills (a plain pandas apply, not a Spark UDF) on the selected rows only.
        # Rows for which it returns '' stay empty and are selected again on a later pass.
        job_postings_pandas.loc[indices_to_update, 'skills'] = (
            job_postings_pandas.loc[indices_to_update, 'job_summary']
                .apply(infer_skills)
        )
    running_time = time.time() - start_time

In [0]:
# Replace missing values with '' in both text columns before counting
job_postings_pandas['skills'] = job_postings_pandas['skills'].fillna('')
job_postings_pandas['job_summary'] = job_postings_pandas['job_summary'].fillna('')

# Count postings that still have no skills, and postings with a blank summary
skills_still_empty = job_postings_pandas['skills'] == ''
summary_is_blank = job_postings_pandas['job_summary'].str.strip() == ''
empty_skills_count = int(skills_still_empty.sum())
empty_job_summaries = int(summary_is_blank.sum())

print("empty strings:", empty_skills_count)
print("empty job summaries:", empty_job_summaries)

empty strings: 631
empty job summaries: 631


In [0]:
from consts import JOBS_PATH
import os

# Persist the enriched postings as CSV under JOBS_PATH (read back by the sampling cell)
job_postings_pandas.to_csv(os.path.join(JOBS_PATH, 'all_jobpostings_with_skills.csv'), index=False)

# Also expose the enriched postings as a Spark DataFrame
job_postings_with_skills = spark.createDataFrame(job_postings_pandas)

## Code questions: filling in missing topics

In [0]:
from pyspark.sql.functions import regexp_replace, concat_ws, split, col, expr

# Normalise "topics" into a plain comma-separated string: break it on ", ",
# strip the single quotes from every piece, then join the pieces with ", ".
code_questions = (
    code_questions
    .withColumn("topics_array", split(col("topics"), ", "))
    .withColumn("topics_array_cleaned", expr("transform(topics_array, x -> regexp_replace(x, \"'\", \"\"))"))
    .withColumn("topics_formatted", concat_ws(", ", col("topics_array_cleaned")))
    .drop("topics_array", "topics_array_cleaned", "topics")
    .withColumnRenamed("topics_formatted", "topics")
)

# Materialise the result as a pandas DataFrame for the fill-in loop
code_questions_pandas = code_questions.toPandas()

In [0]:
import google.generativeai as genai
import os
import time
from api_keys import API_KEYS

def extract_topics_from_question(question):
    """Ask the Gemini model which skills a question tests.

    Uses the module-level ``model`` object, which the fill-in loops re-create
    for every API key before calling this function.

    Args:
        question: The question text. Missing values and non-string cells
            are tolerated.

    Returns:
        The stripped response text (the prompt asks for a comma-separated
        list of skills), or '' when the question is missing/blank or the
        request fails.
    """
    import logging  # local import so this cell does not depend on another cell's imports

    # str() mirrors infer_skills: a non-string cell (e.g. a number) must not raise here
    if pd.isna(question) or str(question).strip() == '':
        return ''

    prompt = f"Analyze the following question and identify the specific skills being tested or evaluated. Return the skills as a comma-separated list of skills. If the question does not test any skills, return an empty string. Question: {question}"

    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # Deliberately best-effort: '' leaves the row empty, so the fill-in loop
        # selects it again on a later pass. Log the cause instead of hiding it.
        logging.getLogger(__name__).warning(
            "extract_topics_from_question: request failed (%s: %s)", type(e).__name__, e
        )
        return ''
   

# Wall-clock bookkeeping for the fill-in loop below
start_time = time.time()
running_time = 0
# Rows that still need topics: empty 'topics' but a non-blank 'question'
empty_topics_rows = code_questions_pandas[(code_questions_pandas['topics'] == '') & (code_questions_pandas['question'].str.strip() != '')]

# Keep cycling through the API keys until 900 s have elapsed or nothing is left to fill.
# NOTE(review): running_time is refreshed only after a full pass over all keys,
# so the time budget can be overshot by up to one pass.
while (running_time < 900) and (empty_topics_rows.shape[0] > 0):
    for api_key in API_KEYS.values():
        # Configure Gemini API
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
        model = genai.GenerativeModel('gemini-1.5-flash')  # global read by extract_topics_from_question

        # Filter the rows where the "topics" column is empty
        empty_topics_rows = code_questions_pandas[(code_questions_pandas['topics'] == '') & (code_questions_pandas['question'].str.strip() != '')]
        if empty_topics_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "topics"
        # NOTE(review): the batch of 15 per key is presumably sized to a per-key rate limit — confirm
        indices_to_update = empty_topics_rows.index[:15]

        # Run extract_topics_from_question (a plain pandas apply, not a Spark UDF) on the
        # selected rows only; rows for which it returns '' are selected again on a later pass.
        code_questions_pandas.loc[indices_to_update, 'topics'] = (
            code_questions_pandas.loc[indices_to_update, 'question']
                .apply(extract_topics_from_question)
        )
    running_time = time.time() - start_time

In [0]:
# Null topics are reported first, before they are replaced by ''
null_topics_count = int(code_questions_pandas['topics'].isna().sum())
print("nulls:", null_topics_count)

# Replace missing values with '' in both text columns
code_questions_pandas['topics'] = code_questions_pandas['topics'].fillna('')
code_questions_pandas['question'] = code_questions_pandas['question'].fillna('')

# Count questions that still have no topics, and questions with blank text
topics_still_empty = code_questions_pandas['topics'] == ''
question_is_blank = code_questions_pandas['question'].str.strip() == ''
empty_topics_count = int(topics_still_empty.sum())
empty_questions = int(question_is_blank.sum())

print("empty strings:", empty_topics_count)
print("empty questions:", empty_questions)

nulls: 0
empty strings: 0
empty questions: 0


In [0]:
from consts import QUESTIONS_PATH
import os

# Persist the enriched code questions as CSV under QUESTIONS_PATH (reloaded by the embeddings cell)
code_questions_pandas.to_csv(os.path.join(QUESTIONS_PATH, 'all_code_questions_with_topics.csv'), index=False)

# Also expose them as a Spark DataFrame for the preview below
code_questions_with_topics = spark.createDataFrame(code_questions_pandas)

In [None]:
# Preview the first 70 code questions together with their topics
code_questions_with_topics.limit(70).display()

Unnamed: 0,formatted_title,difficulty,question_id,question,similar_questions,no_similar_questions,acceptance,solution_URL,solution,topics
0,longest-substring-without-repeating-characters,Medium,3,"Given a string s, find the length of the longe...","[""'Longest Substring with At Most Two Distinct...",9.0,34.1,https://leetcode.com/problems/longest-substrin...,,"Hash Table, String, Sliding Window"
1,median-of-two-sorted-arrays,Hard,4,Given two sorted arrays nums1 and nums2 of siz...,,,,,,"Algorithm design,Time complexity analysis,Bina..."
2,longest-palindromic-substring,Medium,5,"Given a string s, return the longest palindrom...","[""'Shortest Palindrome'"", ""'Palindrome Permuta...",6.0,33.2,https://leetcode.com/problems/longest-palindro...,package com.fishercoder.solutions.firstthousan...,"String, Dynamic Programming"
3,reverse-integer,Medium,7,"Given a signed 32-bit integer x , return x wit...",,,,,,"Mathematical skills, Programming skills, Algor..."
4,reverse-integer,Medium,7,"Given a signed 32-bit integer x, return x with...","[""'String to Integer (atoi)'"", ""'Reverse Bits'...",4.0,27.9,https://leetcode.com/problems/reverse-integer/...,,Math
...,...,...,...,...,...,...,...,...,...,...
65,maximum-depth-of-binary-tree,Easy,104,"Given the root of a binary tree, return its ma...","[""'Balanced Binary Tree'"", ""'Minimum Depth of ...",6.0,74.6,https://leetcode.com/problems/maximum-depth-of...,package com.fishercoder.solutions.firstthousan...,"Tree, Depth-First Search, Breadth-First Search..."
66,construct-binary-tree-from-preorder-and-inorde...,Medium,105,Given two integer arrays preorder and inorder ...,,,,,package com.fishercoder.solutions.firstthousan...,"Tree construction, recursion, array manipulati..."
67,convert-sorted-array-to-binary-search-tree,Easy,108,Given an integer array nums where the elements...,"[""'Convert Sorted List to Binary Search Tree'""]",1.0,70.8,https://leetcode.com/problems/convert-sorted-a...,,"Array, Divide and Conquer, Tree, Binary Search..."
68,path-sum-ii,Medium,113,Given the root of a binary tree and an integer...,,,,,package com.fishercoder.solutions.firstthousan...,"Tree traversal,Depth-First Search,Recursion,Pr..."


## Open questions: filling in missing topics

In [0]:
import pandas as pd

# Materialise the open questions as pandas and start every row with an empty
# "topics" value, so the fill-in loop below treats all of them as pending
open_questions_pandas = open_questions.toPandas().assign(topics='')

In [0]:
import google.generativeai as genai
import os
from api_keys import API_KEYS
   
# NOTE: this cell uses `time` and extract_topics_from_question, both of which come
# from the code-questions cells above (this cell does not import/define them itself).
# Wall-clock bookkeeping for the fill-in loop below
start_time = time.time()
running_time = 0
# Rows that still need topics: empty 'topics' but a non-blank 'question'
empty_topics_rows = open_questions_pandas[(open_questions_pandas['topics'] == '') & (open_questions_pandas['question'].str.strip() != '')]

# Keep cycling through the API keys until 900 s have elapsed or nothing is left to fill.
# NOTE(review): running_time is refreshed only after a full pass over all keys,
# so the time budget can be overshot by up to one pass.
while (running_time < 900) and (empty_topics_rows.shape[0] > 0):
    for api_key in API_KEYS.values():
        # Configure Gemini API
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
        model = genai.GenerativeModel('gemini-1.5-flash')  # global read by extract_topics_from_question

        # Re-select the rows that are still pending
        empty_topics_rows = open_questions_pandas[(open_questions_pandas['topics'] == '') & (open_questions_pandas['question'].str.strip() != '')]
        if empty_topics_rows.shape[0] == 0:
            break

        # Get the indices of the first 15 rows with empty "topics"
        # NOTE(review): the batch of 15 per key is presumably sized to a per-key rate limit — confirm
        indices_to_update = empty_topics_rows.index[:15]

        # Run extract_topics_from_question (a plain pandas apply, not a Spark UDF) on the
        # selected rows only; rows for which it returns '' are selected again on a later pass.
        open_questions_pandas.loc[indices_to_update, 'topics'] = (
            open_questions_pandas.loc[indices_to_update, 'question']
                .apply(extract_topics_from_question)
        )
    running_time = time.time() - start_time

In [0]:
from consts import QUESTIONS_PATH
import os

# Persist the enriched open questions as CSV under QUESTIONS_PATH (reloaded by the embeddings cell)
open_questions_pandas.to_csv(os.path.join(QUESTIONS_PATH, 'all_open_questions_with_topics.csv'), index=False)

# Also expose them as a Spark DataFrame for the preview below
open_questions_with_topics = spark.createDataFrame(open_questions_pandas)

0


In [5]:

# Preview the first 70 open questions together with their topics
open_questions_with_topics.limit(70).display()

Unnamed: 0,question_id,question,category,topics
0,112,What are your strengths?,General,"Self-awareness, communication, self-promotion"
1,113,What are your weaknesses?,General,"Self-awareness, honesty, self-reflection, comm..."
2,114,Why are you interested in working for [insert ...,General,"Research skills, Communication skills, Persuas..."
3,115,Where do you see yourself in five years? Ten y...,General,"Career planning, foresight, self-awareness, lo..."
4,116,Why do you want to leave your current company?,General,"Communication skills, self-awareness, critical..."
...,...,...,...,...
65,16,What makes CNNs translation invariant,Data Science,Understanding of Convolutional Neural Networks...
66,17,How is fastText different from wordvec,Data Science,"Knowledge of word embeddings, understanding of..."
67,18,Explain Generative Adversarial Network,Data Science,"Knowledge of GANs, Explanation skills, Under..."
68,19,What is backward and forward propagation,Data Science,"Understanding of neural networks,Knowledge of ..."


# Demonstrating the model on 50 random jobs

In [0]:
import pandas as pd
import os
from consts import JOBS_PATH, DATA_PATH

# Load the job postings that now carry inferred skills
jobs_data = pd.read_csv(os.path.join(JOBS_PATH, "all_jobpostings_with_skills.csv"))

# Draw a random sample of 50 postings; random_state=42 keeps the draw reproducible.
# (This is a single sample for the demo, not a train/test split.)
jobs_sample = jobs_data.sample(n=50, random_state=42)

# Save the sample to its own file so later cells can reload it
jobs_sample.to_csv(os.path.join(JOBS_PATH, "jobs_sample.csv"), index=False)

## Topics & skills embeddings

In [0]:
pip install sentence-transformers

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, lit, udf, array, broadcast
from pyspark.sql.types import ArrayType, FloatType, DoubleType
from sentence_transformers import SentenceTransformer
from calculate_heuristic_score import calculate_score
from consts import DATA_PATH, QUESTIONS_PATH, MID_CALC_PATH, open_csv_file

# Reload the job sample and the two question sets enriched with topics
jobs_sample = open_csv_file(spark, JOBS_PATH, "jobs_sample.csv")
code_questions = open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")
open_questions = open_csv_file(spark, QUESTIONS_PATH, "all_open_questions_with_topics.csv")

# One output row per (question, topic): split "topics" on "," and explode the pieces.
# NOTE(review): the pieces are not trimmed, so " String" and "String" count as
# different topics — confirm this is what the scoring module expects.
topic_per_row = explode(split(col("topics"), ","))
code_questions_exploded = code_questions.withColumn("topic", topic_per_row)
open_questions_exploded = open_questions.withColumn("topic", topic_per_row)

# Sentence-embedding model, loaded once at module level and used by the UDF below.
# Note: this rebinds the global name `model` that the Gemini cells used earlier.
model = SentenceTransformer('all-MiniLM-L6-v2')


@udf(ArrayType(FloatType()))
def generate_embedding(text):
    """Encode ``text`` with the module-level model and return the vector as a list."""
    embedding = model.encode(text)
    return embedding.tolist()
# Embed every distinct topic once per question set
unique_topics_code = (
    code_questions_exploded.select("topic").distinct()
    .withColumn("topic_embedding", generate_embedding(col("topic")))
)
unique_topics_open = (
    open_questions_exploded.select("topic").distinct()
    .withColumn("topic_embedding", generate_embedding(col("topic")))
)

# Materialise the exploded questions and the topic embeddings as pandas
code_questions_exploded_df = code_questions_exploded.toPandas()
open_questions_exploded_df = open_questions_exploded.toPandas()
unique_topics_code_df = unique_topics_code.toPandas()
unique_topics_open_df = unique_topics_open.toPandas()

# Write all four intermediate tables as CSV under MID_CALC_PATH
os.makedirs(MID_CALC_PATH, exist_ok=True)
for frame, filename in (
    (code_questions_exploded_df, "code_questions_exploded.csv"),
    (open_questions_exploded_df, "open_questions_exploded.csv"),
    (unique_topics_code_df, "unique_topics_code.csv"),
    (unique_topics_open_df, "unique_topics_open.csv"),
):
    frame.to_csv(os.path.join(MID_CALC_PATH, filename), index=False)

In [0]:
from pyspark.sql.functions import rand
from calculate_heuristic_score import calculate_score

# Reload the sampled jobs and score them with the project's calculate_score helper,
# which returns one result for code questions and one for open questions.
# NOTE(review): presumably calculate_score reads the intermediate CSVs written
# under MID_CALC_PATH by the previous cell — confirm in calculate_heuristic_score.
jobs_sample = open_csv_file(spark, JOBS_PATH, "jobs_sample.csv")
code_scores, open_scores = calculate_score(jobs_sample, spark)

In [0]:
# Preview the first 70 (topics, skills, similarity) rows: code questions first, then open questions
for scores in (code_scores, open_scores):
    scores.select("topics", "skills", "similarity").limit(70).display()

topics,skills,similarity
"Array, Two Pointers, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.1524822603212669
"Array, Matrix, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.1581306618715947
"String, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.1694836907554417
"String manipulation, substring identification, iterative processing, algorithm design","Licensed Occupational Therapist, Physical Therapy Assistant license (preferred), Occupational Therapy Assistant certification (preferred)",0.0391619024643053
"Array, Depth-First Search, Breadth-First Search, Union Find, Matrix","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.1178168364650299
"Tree, Depth-First Search, Binary Tree","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.1546903834628161
"Array, Hash Table, Tree, Depth-First Search, Breadth-First Search, Binary Tree","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.1365442384855561
"Array, Dynamic Programming","Evaluation, Implementation, Execution, On-premise support, Field work",0.2504414746537804
"Array, Math, Greedy, Sorting","Evaluation, Implementation, Execution, On-premise support, Field work",0.2248905116692185
"Hash Table, Binary Search, Design, Sorting, Ordered Set","Statistical methods, Causal inference, Data analysis, Data visualization, Data interpretation, SQL, Python, R, Project management, Data storytelling, Communication, Stakeholder management, Teamwork, Mentorship, Product data analysis, Revenue data analysis",0.1905523349065333


topics,skills,similarity
"Conflict resolution, customer service, communication, empathy, problem-solving","Troubleshooting, Customer Support, Network Engineering, WAN Operations, TCP/IP, OSI Model, Linux/UNIX, VMware, Network Security, Windows OS, Communication (written & verbal - English & Japanese), Data Analysis, Problem-solving, KCS Methodology, Siebel (CRM), Public/Private Cloud Technologies, Scripting (e.g., Python, JavaScript), Containerization (e.g., Docker), Container Orchestration (e.g., Kubernetes), Automation (e.g., Ansible), Technical Documentation, Multitasking, Prioritization, Teamwork, ISO Quality Management Systems.",0.2223421096801758
"Understanding of convolutional neural networks, critical thinking, problem-solving","Statistical analysis, Data modeling, RFM analysis, Predictive modeling, Marketing mix modeling, Attribution modeling, A/B testing, Multivariate testing, Regression analysis, Cluster analysis, CHAID, Factor analysis, Principal component analysis, Time series analysis, Survival analysis, Experimental design, SQL, R, Python, Data visualization, Presentation skills, Communication skills, Project management, Collaboration, Data quality assurance (QA/QC), Database marketing",0.2268657122667019
"Problem-solving, critical thinking, understanding of machine learning concepts","C/C++, Python, Linux-based development, VLSI CAD algorithm development, data structures, algorithms, software engineering principles, strong verbal and written communication skills, strong teamwork skills, problem-solving skills, analysis skills, programming skills, debugging skills, troubleshooting skills, statistical analysis, machine learning, deep learning, routing algorithm knowledge, quality and software processes, Unix/Linux platform experience",0.2834614831954241
"Deep learning,Gradient descent optimization,Debugging,Problem-solving","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.1466910544344607
"Mathematical reasoning,Computational complexity analysis,Algorithm understanding","Software development, Data structures, Algorithms, C, C++, Java, JavaScript, Python, C#, Go, Web application development, Mobile application development, Unix/Linux environments, Distributed systems, Parallel systems, Machine learning, Information retrieval, Natural language processing, Networking, Large software system development, Security software development, UI development, AJAX, Embedded systems, Mobile app development (Android/iOS), Developer tools, Automated test system development, Cloud-based computing, Problem-solving, Leadership, Communication (written and verbal English)",0.2089879236394359
"Understanding of deep learning concepts,Knowledge of gradient descent,Problem-solving","financial analysis, accounting, Microsoft Excel, data analysis, financial reporting, project management, account reconciliation, communication, organization, time management, flexibility, team collaboration, influencing, KPI tracking, cash flow management",0.253946195046107
"Deep learning,Gradient descent optimization,Debugging,Problem-solving","financial analysis, accounting, Microsoft Excel, data analysis, financial reporting, project management, account reconciliation, communication, organization, time management, flexibility, team collaboration, influencing, KPI tracking, cash flow management",0.253946195046107
"Problem-solving, critical thinking, understanding of machine learning concepts","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.1466910544344607
"Understanding of neural networks,Understanding of activation functions,Problem-solving,Critical thinking","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.1466910544344607
"Conflict resolution, customer service, communication, empathy, problem-solving","SQL, RDBMS, Non-RDBMS, MySQL, PostgreSQL, MongoDB, Python, Jupyter Notebook, Inferential Statistics, Probability, ETL, Data Pipeline, Automated Reporting, Data Analysis, Statistical Modeling, Machine Learning, Git, Hive, Spark, Presto, Diagnostic Analytics, Forecasting, Big Data",0.1990645072706367


In [0]:
from consts import MID_CALC_PATH

# Pull only the similarity column of each score table into pandas
code_scores_df = code_scores.select("similarity").toPandas()
open_scores_df = open_scores.select("similarity").toPandas()

# Write both similarity columns as CSV under MID_CALC_PATH
code_scores_df.to_csv(os.path.join(MID_CALC_PATH, "code_questions_similarity.csv"), index=False)
open_scores_df.to_csv(os.path.join(MID_CALC_PATH, "open_questions_similarity.csv"), index=False)

In [0]:
from pyspark.sql.functions import col, when, expr

""" Distributing the scores, so they'd be further apart """

# Define the transformation function
def apply_root_transform(df, col_name):
    """Replace ``col_name`` with its sign-preserving square root.

    Values x with x >= 0 become x ** 0.5; any other value becomes
    -((-x) ** 0.5), so the sign of the input is kept.

    Args:
        df: Spark DataFrame that contains ``col_name``.
        col_name: Name of the numeric column to transform in place.

    Returns:
        A DataFrame in which ``col_name`` holds the transformed values.
    """
    value = col(col_name)
    root_of_non_negative = value ** 0.5
    # Mirror negative inputs: take the root of the magnitude, then restore the sign
    mirrored_root_of_negative = -((-value) ** 0.5)
    transformed = when(value >= 0, root_of_non_negative).otherwise(mirrored_root_of_negative)
    return df.withColumn(col_name, transformed)

# Apply the sign-preserving square-root transform to the "similarity" column of
# both score tables; the untransformed code_scores / open_scores stay available.
code_scores_after_transformation = apply_root_transform(code_scores, "similarity")
open_scores_after_transformation = apply_root_transform(open_scores, "similarity")

In [0]:
# Optional preview: first 70 rows of each transformed score table (code, then open)
for transformed_scores in (code_scores_after_transformation, open_scores_after_transformation):
    transformed_scores.select("topics", "skills", "similarity").limit(70).display()

topics,skills,similarity
"Array, Two Pointers, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.390489769803598
"Array, Matrix, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.3976564621272923
"String, Simulation","Linux System Administration, DevOps, Cloud Engineering, Automation, Bash Scripting, SSH, Git, Jenkins, Consul, Vault, Networking, TCP/IP, DNS, HTTP, TLS, Web Security, Troubleshooting, Monitoring (Nagios, Cacti, Grafana, ELK), HAProxy, Kubernetes, Infrastructure as Code (IaC), Configuration Management, CI/CD, Containerization, Virtualization, MySQL, Backup and Restore, Documentation, Communication, Problem-solving, Software Updates, High Availability (HA), Security, CentOS, Ubuntu, Python/other interpreted languages (Preferred), Email Server Administration (Preferred)",0.4116839695147745
"String manipulation, substring identification, iterative processing, algorithm design","Licensed Occupational Therapist, Physical Therapy Assistant license (preferred), Occupational Therapy Assistant certification (preferred)",0.1978936645380679
"Array, Depth-First Search, Breadth-First Search, Union Find, Matrix","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.3432445723751942
"Tree, Depth-First Search, Binary Tree","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.3933069837452879
"Array, Hash Table, Tree, Depth-First Search, Breadth-First Search, Binary Tree","Agile methodologies, Scrum, Software Engineering, Computer Science, Electrical Engineering, REST API, JSON/XML, Open Source Technologies (NiFi, Kafka, Elastic Stack, Solr), CI/CD, Ansible, Jenkins, Git, Java, Docker, Kubernetes, JIRA, Confluence, Linux, Windows, Model Based Systems Engineering (MBSE), SecDevOps, User Interface Development, User Experience, Web Application Development, System Integration, Software Design, Problem Solving, Continuous Integration",0.3695189284536803
"Array, Dynamic Programming","Evaluation, Implementation, Execution, On-premise support, Field work",0.5004412799258074
"Array, Math, Greedy, Sorting","Evaluation, Implementation, Execution, On-premise support, Field work",0.4742262241475249
"Hash Table, Binary Search, Design, Sorting, Ordered Set","Statistical methods, Causal inference, Data analysis, Data visualization, Data interpretation, SQL, Python, R, Project management, Data storytelling, Communication, Stakeholder management, Teamwork, Mentorship, Product data analysis, Revenue data analysis",0.4365230061595074


topics,skills,similarity
"Conflict resolution, customer service, communication, empathy, problem-solving","Troubleshooting, Customer Support, Network Engineering, WAN Operations, TCP/IP, OSI Model, Linux/UNIX, VMware, Network Security, Windows OS, Communication (written & verbal - English & Japanese), Data Analysis, Problem-solving, KCS Methodology, Siebel (CRM), Public/Private Cloud Technologies, Scripting (e.g., Python, JavaScript), Containerization (e.g., Docker), Container Orchestration (e.g., Kubernetes), Automation (e.g., Ansible), Technical Documentation, Multitasking, Prioritization, Teamwork, ISO Quality Management Systems.",0.4715316634969234
"Understanding of convolutional neural networks, critical thinking, problem-solving","Statistical analysis, Data modeling, RFM analysis, Predictive modeling, Marketing mix modeling, Attribution modeling, A/B testing, Multivariate testing, Regression analysis, Cluster analysis, CHAID, Factor analysis, Principal component analysis, Time series analysis, Survival analysis, Experimental design, SQL, R, Python, Data visualization, Presentation skills, Communication skills, Project management, Collaboration, Data quality assurance (QA/QC), Database marketing",0.4763042223901672
"Problem-solving, critical thinking, understanding of machine learning concepts","C/C++, Python, Linux-based development, VLSI CAD algorithm development, data structures, algorithms, software engineering principles, strong verbal and written communication skills, strong teamwork skills, problem-solving skills, analysis skills, programming skills, debugging skills, troubleshooting skills, statistical analysis, machine learning, deep learning, routing algorithm knowledge, quality and software processes, Unix/Linux platform experience",0.5324110096489592
"Deep learning,Gradient descent optimization,Debugging,Problem-solving","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.3830026820199315
"Mathematical reasoning,Computational complexity analysis,Algorithm understanding","Software development, Data structures, Algorithms, C, C++, Java, JavaScript, Python, C#, Go, Web application development, Mobile application development, Unix/Linux environments, Distributed systems, Parallel systems, Machine learning, Information retrieval, Natural language processing, Networking, Large software system development, Security software development, UI development, AJAX, Embedded systems, Mobile app development (Android/iOS), Developer tools, Automated test system development, Cloud-based computing, Problem-solving, Leadership, Communication (written and verbal English)",0.4571519699612328
"Understanding of deep learning concepts,Knowledge of gradient descent,Problem-solving","financial analysis, accounting, Microsoft Excel, data analysis, financial reporting, project management, account reconciliation, communication, organization, time management, flexibility, team collaboration, influencing, KPI tracking, cash flow management",0.5039307442953912
"Deep learning,Gradient descent optimization,Debugging,Problem-solving","financial analysis, accounting, Microsoft Excel, data analysis, financial reporting, project management, account reconciliation, communication, organization, time management, flexibility, team collaboration, influencing, KPI tracking, cash flow management",0.5039307442953912
"Problem-solving, critical thinking, understanding of machine learning concepts","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.3830026820199315
"Understanding of neural networks,Understanding of activation functions,Problem-solving,Critical thinking","Linux/UNIX administration, Docker, docker-compose, containerized development, Ansible, Packer, Terraform, OpenShift, Kubernetes, Bash, Python, Node.js, Application clustering, load balancing, VMware vSphere API, Gitlab CI/CD, networking fundamentals, command line tools, HTTP, SSL, LDAP, SQL, HTML, XML, PostgreSQL, Keepalived, CI/CD, Blue/Green Deployments, Consul, Atomic Host, distributed computing, data systems, immutable infrastructure, serverless computing",0.3830026820199315
"Conflict resolution, customer service, communication, empathy, problem-solving","SQL, RDBMS, Non-RDBMS, MySQL, PostgreSQL, MongoDB, Python, Jupyter Notebook, Inferential Statistics, Probability, ETL, Data Pipeline, Automated Reporting, Data Analysis, Statistical Modeling, Machine Learning, Git, Hive, Spark, Presto, Diagnostic Analytics, Forecasting, Big Data",0.4461664569088949


In [0]:
# Collect just the "similarity" column of each transformed score frame to pandas
# and write it as a CSV file under MID_CALC_PATH (no index column).
code_scores_after_transformation_df = (
    code_scores_after_transformation
    .select("similarity")
    .toPandas()
)
code_similarity_csv = os.path.join(MID_CALC_PATH, "code_questions_transformed_similarity.csv")
code_scores_after_transformation_df.to_csv(code_similarity_csv, index=False)

open_scores_after_transformation_df = (
    open_scores_after_transformation
    .select("similarity")
    .toPandas()
)
open_similarity_csv = os.path.join(MID_CALC_PATH, "open_questions_transformed_similarity.csv")
open_scores_after_transformation_df.to_csv(open_similarity_csv, index=False)

## Heuristic to match questions to jobs
Questions with the highest heuristic scores are the most likely to appear in the interview.

In [0]:
from pyspark.sql.functions import col, lit, udf, abs, coalesce, when
from pyspark.sql.types import FloatType
from scipy.spatial.distance import cosine

# Numeric encoding of the question difficulty labels.
difficulty_map = {"Easy": 0, "Medium": 1, "Hard": 2}

# Build the label -> number CASE expression from the map, one branch per label
# in map order. There is no otherwise(), so a label missing from the map
# yields null.
difficulty_case = None
for difficulty_label, difficulty_value in difficulty_map.items():
    matches_label = col("difficulty") == difficulty_label
    if difficulty_case is None:
        difficulty_case = when(matches_label, difficulty_value)
    else:
        difficulty_case = difficulty_case.when(matches_label, difficulty_value)

# Replace the textual difficulty column with its numeric encoding.
code_with_heuristic = code_scores_after_transformation.withColumn(
    "difficulty", difficulty_case
).cache()

# Difficulty match between a question and a job posting: 1 when "difficulty"
# equals "level", reduced by half of their absolute difference otherwise.
# A null result is replaced by 0.5.
raw_difficulty_match = 1 - abs(col("difficulty") - col("level")) / 2
code_with_heuristic = (
    code_with_heuristic
    .withColumn("difficulty_match", raw_difficulty_match)
    .withColumn("difficulty_match", coalesce(col("difficulty_match"), lit(0.5)))
    .drop("difficulty")
    .cache()
)

# Open questions get a constant difficulty match of 0.5.
open_with_heuristic = (
    open_scores_after_transformation
    .withColumn("difficulty_match", lit(0.5))
    .drop("difficulty")
    .cache()
)

# Scale the acceptance of code questions by the largest acceptance in the frame;
# open questions get a constant 0.5 instead.
max_acceptance = code_with_heuristic.agg({"acceptance": "max"}).collect()[0][0]
code_with_heuristic = (
    code_with_heuristic
    .withColumn("normalized_acceptance", col("acceptance") / max_acceptance)
    .drop("acceptance")
    .cache()
)
open_with_heuristic = open_with_heuristic.withColumn(
    "normalized_acceptance", lit(0.5)
).cache()

# Calculate Heuristic Score
def calculate_score(difficulty, similarity, acceptance):
    """Combine the three per-question signals into one weighted score.

    Args:
        difficulty: Difficulty component, weighted 0.3.
        similarity: Similarity component, weighted 0.5.
        acceptance: Acceptance component, weighted 0.2.

    Returns:
        The weighted sum ``0.3 * difficulty + 0.5 * similarity + 0.2 * acceptance``.
    """
    difficulty_part = 0.3 * difficulty
    similarity_part = 0.5 * similarity
    acceptance_part = 0.2 * acceptance
    return difficulty_part + similarity_part + acceptance_part
# Spark UDF wrapper around calculate_score; the result column is a 32-bit float.
calculate_score_udf = udf(calculate_score, FloatType())

# Fallback values used to fill nulls in the code-question inputs before scoring.
# All three means are taken from the code-question frame itself, so that a code
# question with a missing similarity is imputed with the code-question mean and
# not with the mean of the open questions.
mean_difficulty_match = code_with_heuristic.agg({"difficulty_match": "mean"}).collect()[0][0]
mean_code_similarity = code_with_heuristic.agg({"similarity": "mean"}).collect()[0][0]
mean_acceptance = code_with_heuristic.agg({"normalized_acceptance": "mean"}).collect()[0][0]

# Fill nulls, compute the heuristic score, then drop the score inputs so only
# "heuristic_score" remains next to the original columns.
code_with_heuristic = (
    code_with_heuristic
    .withColumn("difficulty_match", coalesce(col("difficulty_match"), lit(mean_difficulty_match)))
    .withColumn("similarity", coalesce(col("similarity"), lit(mean_code_similarity)))
    .withColumn("normalized_acceptance", coalesce(col("normalized_acceptance"), lit(mean_acceptance)))
    .withColumn(
        "heuristic_score",
        calculate_score_udf(
            col("difficulty_match"),
            col("similarity"),
            col("normalized_acceptance"),
        ),
    )
    .drop("difficulty_match", "similarity", "normalized_acceptance")
    .cache()
)

# Open questions: only similarity is imputed here, using the mean similarity
# of the open-question frame.
mean_similarity = open_with_heuristic.agg({"similarity": "mean"}).collect()[0][0]
open_with_heuristic = (
    open_with_heuristic
    .withColumn("similarity", coalesce(col("similarity"), lit(mean_similarity)))
    .withColumn(
        "heuristic_score",
        calculate_score_udf(
            col("difficulty_match"),
            col("similarity"),
            col("normalized_acceptance"),
        ),
    )
    .drop("difficulty_match", "similarity", "normalized_acceptance")
    .cache()
)

In [0]:
# Select Top Questions for Each Job
from pyspark.sql import Window
from pyspark.sql.functions import row_number
import pandas as pd

# Only the header row is needed to learn the job-posting column names, so
# nrows=0 makes pandas parse the header and skip the data rows.
job_cols = pd.read_csv(
    os.path.join(JOBS_PATH, "jobs_sample.csv"), header=0, nrows=0
).columns

# One partition per distinct combination of job-posting column values, ordered
# by descending heuristic score.
# NOTE(review): the ordering has no tie-breaker, so rows with equal
# heuristic_score may be ranked in any order between runs.
window_spec = Window.partitionBy(*job_cols).orderBy(col("heuristic_score").desc())

# Number of best-scoring questions kept per job posting.
TOP_QUESTIONS_PER_JOB = 20

top_code_questions = code_with_heuristic.withColumn(
    "rank", row_number().over(window_spec)
).filter(col("rank") <= TOP_QUESTIONS_PER_JOB)

top_open_questions = open_with_heuristic.withColumn(
    "rank", row_number().over(window_spec)
).filter(col("rank") <= TOP_QUESTIONS_PER_JOB)

In [0]:
from consts import DATA_PATH
# Collect the ranked code questions to pandas and write them to DATA_PATH as CSV
# (no index column).
top_code_questions_df = top_code_questions.toPandas()
top_code_questions_csv = os.path.join(DATA_PATH, "top_code_questions.csv")
top_code_questions_df.to_csv(top_code_questions_csv, index=False)

# Same export for the ranked open questions.
top_open_questions_df = top_open_questions.toPandas()
top_open_questions_csv = os.path.join(DATA_PATH, "top_open_questions.csv")
top_open_questions_df.to_csv(top_open_questions_csv, index=False)