#### Visualization

In [0]:
from pyspark.sql import SparkSession
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Initialize SparkSession
spark = (
    SparkSession.builder
    .appName("Job and Interview Analysis")
    .getOrCreate()
)

# Load each source table through the project helper and print its schema
# straight away so column names/types can be checked before plotting.
job_skills_spark = open_csv_file(
    spark, JOBS_PATH, 'all_jobpostings_with_skills.csv'
)
job_skills_spark.printSchema()

code_questions_spark = open_csv_file(
    spark, QUESTIONS_PATH, 'all_code_questions_with_topics.csv'
)
code_questions_spark.printSchema()

open_questions_spark = open_csv_file(
    spark, QUESTIONS_PATH, 'all_open_questions_with_topics.csv'
)
open_questions_spark.printSchema()


In [0]:
from pyspark.sql.functions import col, explode, split, count, avg
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import networkx as nx
import seaborn as sns

# Alias the DataFrames loaded in the previous cell (nothing is re-read here)
job_data, code_questions_data, open_questions_data = (
    job_skills_spark,
    code_questions_spark,
    open_questions_spark,
)

# Helper Functions
def preprocess_skills(data):
    """Return one row per distinct skill with its frequency, most frequent first.

    The ``skills`` column is split on ", " and exploded so that every skill
    occupies its own row before the rows are grouped and counted.
    """
    skill_tokens = split(col("skills"), ", ")
    exploded = data.withColumn("skill", explode(skill_tokens))
    tally = exploded.groupBy("skill").count()
    return tally.orderBy(col("count").desc())

# 1. Job Count by Field
def job_count_by_field(data):
    """Show a bar chart of the number of rows per ``field``, largest first."""
    per_field = data.groupBy("field").count()
    ranked = per_field.orderBy(col("count").desc())
    frame = ranked.toPandas()

    frame.plot(kind="bar", x="field", y="count", color="skyblue")
    plt.title("Job Count by Field")
    plt.xlabel("Field")
    plt.ylabel("Count")
    plt.show()

# 2. Most In-Demand Skills
def most_in_demand_skills(data):
    """Show a word cloud in which each skill is weighted by its frequency."""
    skill_frame = preprocess_skills(data).toPandas()
    # Map skill -> count; this is the input generate_from_frequencies expects.
    frequencies = dict(zip(skill_frame["skill"], skill_frame["count"]))
    cloud = WordCloud(background_color="white").generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Most In-Demand Skills")
    plt.show()

# 3. Skill Distribution by Field
def skill_distribution_by_field(data):
    """Show a stacked bar chart of skill counts, one bar per ``field``."""
    skill_tokens = split(col("skills"), ", ")
    counts = (
        data.withColumn("skill", explode(skill_tokens))
        .groupBy("field", "skill")
        .count()
        .toPandas()
    )
    # Wide layout: one row per field, one column per skill, 0 where absent.
    matrix = counts.pivot(index="field", columns="skill", values="count").fillna(0)

    matrix.plot(kind="bar", stacked=True, figsize=(12, 6))
    plt.title("Skill Distribution by Field")
    plt.xlabel("Field")
    plt.ylabel("Count")
    plt.show()

# 4. Skills Comparison Across Levels
def skills_comparison_across_levels(data):
    """Show a heatmap of skill counts with job levels as rows and skills as columns."""
    skill_tokens = split(col("skills"), ", ")
    per_level = (
        data.withColumn("skill", explode(skill_tokens))
        .groupBy("level", "skill")
        .count()
        .toPandas()
    )
    # Level x skill matrix; combinations that never occur become 0.
    heat_matrix = per_level.pivot(index="level", columns="skill", values="count").fillna(0)

    sns.heatmap(heat_matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title("Skills Comparison Across Levels")
    plt.xlabel("Skills")
    plt.ylabel("Job Level")
    plt.show()

# 5. Co-occurrence of Skills
def cooccurrence_of_skills(data):
    """Draw an undirected graph linking skills that occur together in a row.

    The ``skills`` column is split on ", "; every unordered pair of distinct
    skills found in the same row becomes one edge of the graph.

    Rows whose ``skills`` value is null are skipped (``split`` yields null for
    them, which cannot be iterated). Each pair is emitted once in a canonical
    (sorted) order and de-duplicated on the cluster, so only the distinct
    edges are collected to the driver.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import combinations

    def unordered_pairs(row):
        if row.skills is None:
            return []
        # sorted(set(...)) drops repeated skills within a row and gives every
        # pair a single canonical orientation so distinct() can merge them.
        return combinations(sorted(set(row.skills)), 2)

    skill_lists = data.withColumn("skills", split(col("skills"), ", ")).select("skills")
    edges = skill_lists.rdd.flatMap(unordered_pairs).distinct().collect()

    G = nx.Graph()
    G.add_edges_from(edges)
    plt.figure(figsize=(10, 10))
    nx.draw(G, with_labels=True, node_color="skyblue", node_size=2000, font_size=10, edge_color="gray")
    plt.title("Co-occurrence of Skills")
    plt.show()

# 6. Questions by Difficulty
def questions_by_difficulty(data):
    """Show a pie chart of the share of rows per ``difficulty`` value."""
    tally = data.groupBy("difficulty").count()
    frame = tally.orderBy(col("count").desc()).toPandas()

    frame.plot(
        kind="pie",
        y="count",
        labels=frame["difficulty"],
        autopct="%1.1f%%",
    )
    plt.title("Questions by Difficulty")
    plt.ylabel("")  # suppress the default axis label on the pie
    plt.show()

# 7. Most Common Topics
def most_common_topics(data):
    """Show a bar chart of topic frequencies, most frequent first.

    The ``topics`` column is split on ", " and exploded before counting.
    """
    topic_tokens = split(col("topics"), ", ")
    per_topic = data.withColumn("topic", explode(topic_tokens)).groupBy("topic").count()
    frame = per_topic.orderBy(col("count").desc()).toPandas()

    frame.plot(kind="bar", x="topic", y="count", color="orange")
    plt.title("Most Common Topics")
    plt.xlabel("Topics")
    plt.ylabel("Count")
    plt.show()

# 8. Acceptance Rate Analysis
def acceptance_rate_analysis(data):
    """Show a scatter plot of the ``acceptance`` column against ``difficulty``."""
    points = data.select("difficulty", "acceptance").toPandas()
    x_values, y_values = points["difficulty"], points["acceptance"]

    plt.scatter(x_values, y_values, color="purple", alpha=0.7)
    plt.title("Acceptance Rate Analysis")
    plt.xlabel("Difficulty")
    plt.ylabel("Acceptance Rate")
    plt.show()



In [0]:
# Call Functions to Generate Visualizations
# All three charts in this cell are drawn from the coding-questions DataFrame.
questions_by_difficulty(code_questions_data)  # pie chart: rows per difficulty
most_common_topics(code_questions_data)  # bar chart: topic frequencies
acceptance_rate_analysis(code_questions_data)  # scatter: acceptance vs. difficulty

In [0]:
# Bar chart of row counts per `field` in the job-postings DataFrame.
job_count_by_field(job_data)


In [0]:
# Word cloud of skills from the job postings, weighted by frequency.
most_in_demand_skills(job_data)


In [0]:
# Stacked bar chart: per-field counts of each skill.
skill_distribution_by_field(job_data)


In [0]:
# Heatmap of skill counts per job level.
skills_comparison_across_levels(job_data)


In [0]:
# Graph whose edges connect skills listed together in the same job-posting row.
cooccurrence_of_skills(job_data)

## Simulation

---
The simulation - #TAKE_1

In [0]:
from api_keys import API_KEYS

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, split
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from pyspark.ml.feature import Word2VecModel, HashingTF, IDF
import google.generativeai as genai
import os
import PyPDF2

class InterviewSimulationApp:
    """First-pass interview simulator built on Spark DataFrames and Gemini.

    Gemini calls and PDF parsing run on the driver. A Spark UDF that closes
    over ``self`` would have to pickle the instance, including its
    SparkSession, which Spark rejects.
    """

    def __init__(self, gemini_api_key):
        """Create the Spark session, configure Gemini and load the datasets.

        Args:
            gemini_api_key: API key passed to ``genai.configure``.
        """
        # Initialize Spark Session
        self.spark = SparkSession.builder \
            .appName("InterviewSimulationApp") \
            .getOrCreate()

        # Configure Gemini
        genai.configure(api_key=gemini_api_key)
        # NOTE(review): confirm 'gemini-flash' is a model id the installed
        # SDK accepts; published ids usually carry a version component.
        self.model = genai.GenerativeModel('gemini-flash')

        # Load datasets using Spark DataFrame (paths are relative to the
        # working directory of the Spark application)
        self.job_data = self.spark.read.csv('job_data.csv', header=True)
        self.interview_questions = self.spark.read.csv('interview_questions.csv', header=True)
        self.coding_questions = self.spark.read.json('coding_questions.json')

    def extract_pdf_text(self, pdf_path):
        """Extract the text of one PDF into a single-row DataFrame.

        The file is parsed on the driver with PyPDF2. Reading the PDF with
        ``spark.read.text`` would yield its raw lines, which are not paths
        that ``open()`` can use.

        Args:
            pdf_path: Path of the PDF file, readable from the driver.

        Returns:
            DataFrame with string columns ``value`` (the path) and
            ``extracted_text`` (page texts joined with single spaces).
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # A page may produce no text (e.g. scanned images); use '' so
            # the join never sees None.
            page_texts = [page.extract_text() or '' for page in reader.pages]

        schema = StructType([
            StructField('value', StringType(), True),
            StructField('extracted_text', StringType(), True),
        ])
        return self.spark.createDataFrame([(pdf_path, ' '.join(page_texts))], schema)

    def match_job_questions(self, cv_text, job_title):
        """Ask Gemini for interview questions and return them as a DataFrame.

        Args:
            cv_text: CV content embedded verbatim in the prompt.
            job_title: Job title embedded verbatim in the prompt.

        Returns:
            DataFrame with one string column ``question``: one row per
            non-blank line of the model response.
        """
        # Generate questions using Gemini
        questions_prompt = f"""Generate interview questions for {job_title}
        considering CV context: {cv_text}"""

        questions = self.model.generate_content(questions_prompt).text

        rows = [(q.strip(),) for q in questions.split('\n') if q.strip()]
        # Explicit schema: Spark cannot infer column types from an empty
        # list, which happens when the response has no non-blank lines.
        schema = StructType([StructField('question', StringType(), True)])
        return self.spark.createDataFrame(rows, schema)

    def generate_coding_challenges(self, experience_level):
        """Select up to three coding questions for an experience level.

        Args:
            experience_level: ``'junior'`` or ``'senior'``. Any other value
                maps to an empty difficulty list, so no rows match.

        Returns:
            DataFrame of at most three rows from the coding-questions data.
        """
        difficulty_map = {
            'junior': ['easy', 'medium'],
            'senior': ['medium', 'hard']
        }

        challenges = self.coding_questions.filter(
            col('difficulty').isin(difficulty_map.get(experience_level, []))
        )

        return challenges.limit(3)

    def simulate_interview(self, cv_text, job_title, experience_level='junior'):
        """Assemble the questions and coding challenges for one interview.

        Args:
            cv_text: CV content forwarded to ``match_job_questions``.
            job_title: Job title forwarded to ``match_job_questions``.
            experience_level: Level forwarded to
                ``generate_coding_challenges``; defaults to ``'junior'``.

        Returns:
            Dict with DataFrames under ``'behavioral_questions'`` and
            ``'coding_challenges'``.
        """
        # Generate questions
        questions = self.match_job_questions(cv_text, job_title)

        # Select coding challenges
        coding_challenges = self.generate_coding_challenges(experience_level)

        return {
            'behavioral_questions': questions,
            'coding_challenges': coding_challenges
        }

    def analyze_performance(self, questions_df, answers_df):
        """Attach Gemini-generated feedback to each question/answer pair.

        The two inputs are joined on ``question``. The joined rows are
        collected and Gemini is called once per row on the driver, so the
        result is computed eagerly.

        Args:
            questions_df: DataFrame with a ``question`` column.
            answers_df: DataFrame with ``question`` and ``answer`` columns.

        Returns:
            DataFrame with the joined columns plus a string ``feedback``
            column.
        """
        # Join questions and answers
        performance_df = questions_df.join(answers_df, 'question')

        def generate_feedback(questions, answers):
            feedback_prompt = f"""Analyze interview performance:
            Questions: {questions}
            Answers: {answers}
            Provide STAR method feedback"""

            return self.model.generate_content(feedback_prompt).text

        # Driver-side loop instead of a UDF: the helper closes over self,
        # and self holds the SparkSession, which cannot be pickled.
        rows_with_feedback = [
            tuple(row) + (generate_feedback(row['question'], row['answer']),)
            for row in performance_df.collect()
        ]
        schema = StructType(
            performance_df.schema.fields
            + [StructField('feedback', StringType(), True)]
        )
        return self.spark.createDataFrame(rows_with_feedback, schema)

def main():
    """Build the interview simulation app and print the usage hint.

    Raises:
        RuntimeError: If the ``API_KEY`` environment variable is unset or
            empty.
    """
    # os.getenv takes the variable *name* as a string; the bare identifier
    # API_KEY is not defined anywhere in this notebook.
    # NOTE(review): "API_KEY" is an assumed variable name. The notebook also
    # imports API_KEYS from api_keys; confirm which source is intended.
    api_key = os.getenv("API_KEY")
    if not api_key:
        raise RuntimeError(
            "Environment variable API_KEY is not set; cannot configure Gemini"
        )

    app = InterviewSimulationApp(
        gemini_api_key=api_key
    )

    print("Upload CV or paste text for interview simulation")


if __name__ == '__main__':
    main()

---
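The refined implementation further below (the second `InterviewSimulationApp` class) is organized as follows:

1. **Initialization (`__init__`)**: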
   - Sets up Spark Session with configurable parameters
   - Configures Gemini AI with API key
   - Downloads NLTK resources
   - Loads datasets (job data, interview questions, coding challenges)

2. **Dataset Loading (`_load_dataset`)**:
   - Supports loading CSV and JSON files
   - Handles different file types
   - Includes error handling for dataset loading

3. **PDF Text Extraction (`extract_pdf_text`)**:
   - Uses PyPDF2 to extract text from PDFs
   - Converts PDF content to Spark DataFrame
   - Includes safe extraction with error handling

4. **Text Preprocessing (`preprocess_text`)**:
   - Tokenizes text
   - Removes stopwords
   - Cleans and normalizes text
   - Prepares text for further analysis

5. **Semantic Job Matching (`semantic_job_matching`)**:
   - Uses Gemini to understand CV context
   - Filters interview questions based on job title
   - Provides contextually relevant questions

6. **Coding Challenge Generation (`generate_coding_challenges`)**:
   - Selects coding challenges based on experience level
   - Supports different difficulty levels (junior, mid-level, senior)
   - Limits number of challenges

7. **Performance Analysis (`interview_performance_analysis`)**:
   - Combines questions and answers
   - Generates feedback using STAR methodology
   - Provides structured performance assessment


---
## Technical Enhancements
1. **Machine Learning Improvements**
   - Implement advanced embedding techniques
   - Use transfer learning for question generation
   - Develop custom ML models for skill matching

2. **Distributed Computing Optimizations**
   - Implement dynamic resource allocation
   - Add caching mechanisms
   - Optimize Spark configurations

3. **AI Integration Enhancements**
   - Multi-model approach (Gemini + Local Models)
   - Real-time feedback generation
   - Contextual understanding improvements

4. **Data Management**
   - Create robust data versioning
   - Implement data quality checks
   - Add automated dataset refresh mechanisms

5. **Scalability Features**
   - Microservices architecture
   - Containerization (Docker)
   - Kubernetes orchestration

## User Experience Improvements
1. Interactive CLI/Web Interface
2. Personalized Learning Paths
3. Comprehensive Skill Gap Analysis
4. Multi-language Support

## Advanced Features
1. Continuous Learning Model
2. Industry-Specific Question Banks
3. Mock Interview Recordings
4. Adaptive Difficulty Scaling

In [0]:
import os
import logging
from typing import Dict, List

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, udf, explode, split, regexp_replace
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from pyspark.ml.feature import Word2VecModel, HashingTF, IDF
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors

import google.generativeai as genai
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [0]:
class InterviewSimulationApp:
    """Interview simulator combining Spark DataFrames, NLTK and Gemini.

    Functions shipped to executors as UDFs must not reference ``self``: the
    instance holds the SparkSession, which cannot be pickled. Work that needs
    ``self.model`` or ``self.logger`` therefore runs on the driver.
    """

    def __init__(self, gemini_api_key: str, config: Dict = None):
        """Set up logging, Spark, NLTK resources, Gemini and the datasets.

        Args:
            gemini_api_key: API key passed to ``genai.configure``.
            config: Optional settings; recognised keys are
                ``'shuffle_partitions'`` (default 200) and
                ``'executor_memory'`` (default ``'4g'``).
        """
        # Logging setup
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Spark Session with enhanced configurations
        spark_config = config or {}
        self.spark = (SparkSession.builder
            .appName("InterviewSimulationApp")
            .config("spark.sql.shuffle.partitions", spark_config.get('shuffle_partitions', 200))
            .config("spark.executor.memory", spark_config.get('executor_memory', '4g'))
            .getOrCreate())

        # Natural Language Processing Setup
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

        # Gemini AI Configuration
        genai.configure(api_key=gemini_api_key)
        # NOTE(review): confirm 'gemini-flash' is a model id the installed
        # SDK accepts; published ids usually carry a version component.
        self.model = genai.GenerativeModel('gemini-flash')

        # Load Datasets - TODO:
        self.job_data = self._load_dataset('job_data.csv')
        self.interview_questions = self._load_dataset('interview_questions.csv')
        self.coding_questions = self._load_dataset('coding_questions.json')

    def _load_dataset(self, path: str) -> DataFrame:
        """Load a ``.csv`` or ``.json`` dataset, falling back to an empty frame.

        Args:
            path: File path; the extension selects the reader.

        Returns:
            The loaded DataFrame. On any failure (including an unsupported
            extension) the error is logged and an empty DataFrame with no
            columns is returned, so later column references will fail.
        """
        try:
            if path.endswith('.csv'):
                return self.spark.read.csv(path, header=True, inferSchema=True)
            elif path.endswith('.json'):
                return self.spark.read.json(path)
            else:
                raise ValueError(f"Unsupported file type: {path}")
        except Exception as e:
            self.logger.error(f"Failed to load dataset {path}: {e}")
            return self.spark.createDataFrame([], StructType([]))

    def extract_pdf_text(self, pdf_path: str) -> DataFrame:
        """Extract the text of one PDF into a single-row DataFrame.

        The file is parsed on the driver with PyPDF2. Reading the PDF with
        ``spark.read.text`` would yield its raw lines, which are not paths
        that ``open()`` can use.

        Args:
            pdf_path: Path of the PDF file, readable from the driver.

        Returns:
            DataFrame with string columns ``value`` (the path) and
            ``extracted_text``. If extraction fails, the error is logged and
            ``extracted_text`` is the empty string.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # A page may produce no text (e.g. scanned images); use ''
                # so the join never sees None.
                text = ' '.join([page.extract_text() or '' for page in reader.pages])
        except Exception as e:
            self.logger.error(f"PDF extraction error: {e}")
            text = ""

        schema = StructType([
            StructField("value", StringType(), True),
            StructField("extracted_text", StringType(), True),
        ])
        return self.spark.createDataFrame([(pdf_path, text)], schema)

    def preprocess_text(self, text_df: DataFrame) -> DataFrame:
        """Add a ``processed_tokens`` column derived from ``extracted_text``.

        Text is lower-cased and tokenised with NLTK; only alphanumeric
        tokens that are not English stop words are kept. A null text value
        yields an empty token list.

        Args:
            text_df: DataFrame with a string ``extracted_text`` column.

        Returns:
            ``text_df`` with an extra array-of-strings column
            ``processed_tokens``.
        """
        # Bind the set to a local so the UDF closure captures only it, not
        # self (and with it the unpicklable SparkSession).
        stop_words = self.stop_words

        def tokenize_and_clean(text: str) -> List[str]:
            if text is None:
                return []
            tokens = word_tokenize(text.lower())
            return [token for token in tokens if token.isalnum() and token not in stop_words]

        tokenize_udf = udf(tokenize_and_clean, ArrayType(StringType()))

        return text_df.withColumn("processed_tokens", tokenize_udf(col("extracted_text")))

    def semantic_job_matching(self, cv_text: str, job_title: str) -> DataFrame:
        """Return interview questions whose ``job_category`` contains the title.

        Args:
            cv_text: CV content sent to Gemini together with ``job_title``.
            job_title: Substring matched against ``job_category``.

        Returns:
            The matching rows of the interview-questions DataFrame.
        """
        # Use Gemini for contextual understanding
        # NOTE(review): the model response is not used for filtering or
        # ranking yet, so this request currently only adds latency and API
        # usage until the matching below consumes it.
        job_context = self.model.generate_content(
            f"Extract key skills and experience relevant to {job_title} from: {cv_text}"
        ).text

        # Filter and rank job questions
        matched_questions = self.interview_questions.filter(
            col("job_category").contains(job_title)
        )

        return matched_questions

    # TODO: Add more advanced semantic matching
    def generate_coding_challenges(self, experience_level: str, num_challenges: int = 3) -> DataFrame:
        """Select coding questions whose difficulty suits an experience level.

        Args:
            experience_level: ``'junior'``, ``'mid-level'`` or ``'senior'``;
                any other value falls back to easy and medium.
            num_challenges: Maximum number of rows to return.

        Returns:
            DataFrame of at most ``num_challenges`` rows.
        """
        difficulty_mapping = {
            'junior': ['easy', 'medium'],
            'mid-level': ['medium'],
            'senior': ['medium', 'hard']
        }

        difficulties = difficulty_mapping.get(experience_level, ['easy', 'medium'])

        return self.coding_questions.filter(
            col('difficulty').isin(difficulties)
        ).limit(num_challenges)

    def interview_performance_analysis(self, questions_df: DataFrame, answers_df: DataFrame) -> DataFrame:
        """Attach Gemini-generated STAR feedback to each joined row.

        The inputs are joined on ``question_id``. The joined rows are
        collected and Gemini is called once per row on the driver, so the
        result is computed eagerly.

        Args:
            questions_df: DataFrame with ``question_id`` and ``questions``.
            answers_df: DataFrame with ``question_id`` and ``answers``.

        Returns:
            DataFrame with the joined columns plus a string
            ``performance_feedback`` column.
        """
        def generate_star_feedback(questions: List[str], answers: List[str]) -> str:
            feedback_prompt = f"""Analyze interview performance using STAR method:
            Questions: {questions}
            Answers: {answers}
            
            Provide:
            1. Strengths
            2. Areas of Improvement
            3. Overall Assessment"""

            return self.model.generate_content(feedback_prompt).text

        joined_df = questions_df.join(answers_df, "question_id")

        # Driver-side loop instead of a UDF: the helper closes over self,
        # and self holds the SparkSession, which cannot be pickled.
        rows_with_feedback = [
            tuple(row) + (generate_star_feedback(row["questions"], row["answers"]),)
            for row in joined_df.collect()
        ]
        schema = StructType(
            joined_df.schema.fields
            + [StructField("performance_feedback", StringType(), True)]
        )
        return self.spark.createDataFrame(rows_with_feedback, schema)


In [0]:
def main():
    """Build the interview simulation app and print the welcome messages.

    Raises:
        RuntimeError: If the ``API_KEY`` environment variable is unset or
            empty.
    """
    # os.getenv takes the variable *name* as a string; the bare identifier
    # API_KEY is not defined anywhere in this notebook.
    # NOTE(review): "API_KEY" is an assumed variable name. The notebook also
    # imports API_KEYS from api_keys; confirm which source is intended.
    api_key = os.getenv("API_KEY")
    if not api_key:
        raise RuntimeError(
            "Environment variable API_KEY is not set; cannot configure Gemini"
        )

    # Initialize the application
    app = InterviewSimulationApp(
        gemini_api_key=api_key,
        config={
            'shuffle_partitions': 300,
            'executor_memory': '8g'
        }
    )

    print("Welcome to Advanced Interview Simulation!")
    print("You can upload CV or paste text directly.")

    # TODO: ADD OUR CODE HERE


if __name__ == '__main__':
    main()
