### 📊Visualizations for the unprocessed data

In [0]:
from pyspark.sql import SparkSession
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Start the Spark session (or reuse the one already running) for this notebook
spark = (
    SparkSession.builder
    .appName("Job and Interview Analysis")
    .getOrCreate()
)

# Load the three unprocessed inputs through the project's open_csv_file helper
job_skills_spark = open_csv_file(
    spark, JOBS_PATH, 'all_jobpostings_with_skills.csv'
)
open_questions_spark = open_csv_file(
    spark, QUESTIONS_PATH, 'all_open_questions_with_topics.csv'
)
code_questions_spark = open_csv_file(
    spark, QUESTIONS_PATH, 'all_code_questions_with_topics.csv'
)


#### Top 10 most common job titles for each seniority level

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, udf
from pyspark.sql.types import StringType
import seaborn as sns

# Level codes are the stringified positions ("0", "1", "2") of these labels,
# so the mapping is built from the ordered label list.
seniority_map = {
    str(code): label
    for code, label in enumerate([
        "Internship",
        "Entry level/Associate",
        "Mid-Senior level/Manager and above",
    ])
}

# Function to normalize job titles (remove seniority/level indicators like "Senior", "I", etc.)
def normalize_job_title(title):
    """Return a lowercase job title with seniority and level words removed.

    Args:
        title: Raw job title string, or None.

    Returns:
        The remaining words joined by single spaces, or None when ``title``
        is None (so a null input stays null when used as a Spark UDF).
    """
    if title is None:
        return None

    # Words that only describe rank ("senior", "lead") or grade ("ii", "iv")
    ignored_terms = frozenset([
        'senior', 'junior', 'lead', 'principal', 'assistant', 'associate',
        'i', 'ii', 'iii', 'iv', 'v',
    ])

    # Replace every non-letter with a space instead of deleting it: deleting
    # glued neighbouring words together ("senior/lead" -> "seniorlead",
    # "engineer-ii" -> "engineerii"), which hid the terms we want to strip.
    cleaned = re.sub(r"[^a-zA-Z\s]", " ", title.lower())

    # split() with no argument also collapses the extra whitespace created above
    return " ".join(word for word in cleaned.split() if word not in ignored_terms)

# Expose the Python normalizer to Spark as a string-returning UDF
normalize_job_title_udf = udf(normalize_job_title, StringType())

# Keep only rows that have every column used below, derive the normalized
# title, lowercase the field, and translate the level code into its label.
# Level values other than "0"/"1"/"2" end up with a null seniority_level.
job_skills_cleaned = (
    job_skills_spark
    .dropna(subset=["skills", "job_summary", "company_industry", "field", "job_title"])
    .withColumn("job_title_normalized", normalize_job_title_udf(F.col("job_title")))
    .withColumn("field", F.lower(F.col("field")))
    .withColumn(
        "seniority_level",
        F.when(F.col("level") == "0", seniority_map["0"])
        .when(F.col("level") == "1", seniority_map["1"])
        .when(F.col("level") == "2", seniority_map["2"]),
    )
    .drop("level")
    .cache()
)

# Number of postings per (normalized title, seniority level), pulled into pandas
level_by_title = (
    job_skills_cleaned
    .groupBy("job_title_normalized", "seniority_level")
    .count()
    .toPandas()
)

# Display order of the seniority levels, lowest first
seniority_order = ["Internship", "Entry level/Associate", "Mid-Senior level/Manager and above"]

# Make seniority_level an ordered categorical that follows seniority_order
level_by_title["seniority_level"] = pd.Categorical(
    level_by_title["seniority_level"],
    categories=seniority_order,
    ordered=True,
)

# Sort by frequency first, then keep the first 10 rows of each seniority group:
# the result is the 10 most common titles per level.
top_titles_per_level = (
    level_by_title
    .sort_values(by="count", ascending=False)
    .groupby("seniority_level")
    .head(10)
)

# One stacked panel per seniority level, in seniority_order
fig, axes = plt.subplots(3, 1, figsize=(16, 15))

for ax, level in zip(axes, seniority_order):
    # Rows of the top-10 table that belong to this panel's level
    data = top_titles_per_level[top_titles_per_level["seniority_level"] == level]

    sns.barplot(
        data=data,
        x='job_title_normalized',
        y='count',
        palette="Set2",
        edgecolor=None,
        ax=ax,
    )

    # Titles and axis captions
    ax.set_title(f"Top 10 Most Common {level} Job Titles", fontsize=16)
    ax.set_xlabel("Job Title", fontsize=14)
    ax.set_ylabel("Count", fontsize=14)

    # Slanted tick labels, anchored at their right end so they sit under the bar
    ax.set_xticklabels(
        data['job_title_normalized'],
        rotation=45,
        ha='right',
        rotation_mode='anchor',
    )
    ax.tick_params(axis='x', labelsize=10)

# Extra padding keeps the rotated labels from colliding with the next panel
plt.tight_layout(pad=2.0)
plt.show()



#### Word clouds of the question topics for each category

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, split, explode, trim, collect_list, lit, array_except, array
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# getOrCreate() returns the session that is already running when there is one
spark = SparkSession.builder.appName("WordCloudVisualization").getOrCreate()

# Define stop words, including "skill" and "skills"
stop_words = ["and", "of", "the", "a", "in", "to", "skill", "skills"]

# Process topics column (handle nulls, lowercase, split, explode, trim, and remove stop words)
processed_df = (
    open_questions_spark
    .filter(col("topics").isNotNull())  # Exclude rows with null topics
    .withColumn("topics", lower(col("topics")))  # Convert to lowercase
    .withColumn("topics", split(col("topics"), ",\\s*"))  # Split by commas
    .withColumn("topics", explode(col("topics")))  # Explode into individual sub-topics
    .withColumn("topics", trim(col("topics")))  # Trim whitespace
    .withColumn("topics", split(col("topics"), "\\s+"))  # Split multi-word phrases into words
    .withColumn("topics", explode(col("topics")))  # Explode words into separate rows
    # Each word is wrapped in a one-element array and the stop words are
    # subtracted from it, leaving [] for a stop word and [word] otherwise.
    # The stop-word array is built from one lit() per word because passing a
    # Python list straight to lit() is only accepted by newer PySpark releases.
    .withColumn(
        "topics",
        array_except(array(col("topics")), array(*[lit(word) for word in stop_words])),
    )
)

# Aggregate topics for each category
category_topics_df = (
    processed_df
    .groupBy("category")
    .agg(collect_list("topics").alias("all_topics"))  # List of 0/1-element arrays
)

# Convert PySpark DataFrame to Pandas for visualization
category_topics_pd = category_topics_df.toPandas()

# Generate and visualize word clouds
for index, row in category_topics_pd.iterrows():
    category = row["category"]
    # Flatten the list of small arrays into one space-separated string
    topics = " ".join([str(item) for sublist in row["all_topics"] for item in sublist])

    # WordCloud.generate() raises when there is nothing to draw, so skip
    # categories whose topics consisted only of stop words.
    if not topics.strip():
        print(f"Skipping category {category}: no topic words left after stop-word removal")
        continue

    # Generate a word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="viridis",
        max_words=100,
        contour_color="black",
        contour_width=1
    ).generate(topics)

    # Plot the word cloud using matplotlib
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Category: {category}", fontsize=16)
    plt.show()


#### WordCloud for most in-demand skills


In [0]:
from pyspark.sql.functions import col, explode, split, count, avg
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Helper Functions
def preprocess_skills(data):
    """Split the comma-separated ``skills`` column and count each skill.

    Args:
        data: Spark DataFrame with a string column named ``skills``.

    Returns:
        Spark DataFrame with columns ``skill`` and ``count``, ordered by
        ``count`` descending.
    """
    # Local import: this cell's own import line does not bring in trim
    from pyspark.sql.functions import trim

    # Split on the bare comma and trim afterwards, so "a,b", "a, b" and
    # "a ,  b" all yield the same two skills (splitting on ", " left "a,b"
    # as a single skill and kept padded variants as separate entries).
    skills = data.withColumn("skill", explode(split(col("skills"), ","))) \
                 .withColumn("skill", trim(col("skill"))) \
                 .filter(col("skill") != "") \
                 .groupBy("skill") \
                 .count() \
                 .orderBy(col("count").desc())
    return skills

# Most In-Demand Skills
def most_in_demand_skills(data):
    """Draw a word cloud in which each skill is sized by how often it occurs.

    Args:
        data: Spark DataFrame accepted by ``preprocess_skills``.
    """
    skill_counts = preprocess_skills(data).toPandas()
    # skill -> number of occurrences
    frequencies = dict(zip(skill_counts["skill"], skill_counts["count"]))
    cloud = WordCloud(background_color="white").generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Most In-Demand Skills")
    plt.show()


most_in_demand_skills(job_skills_spark)

#### Distribution of question difficulty levels

In [0]:
from pyspark.sql.functions import col, explode, split, count, avg
import matplotlib.pyplot as plt
import seaborn as sns

# Questions by Difficulty
def questions_by_difficulty(data):
    """Show a pie chart of how many questions fall into each difficulty.

    Args:
        data: Spark DataFrame with a ``difficulty`` column.
    """
    # One row per difficulty, most frequent first
    counts_pd = (
        data.groupBy("difficulty")
        .count()
        .orderBy("count", ascending=False)
        .toPandas()
    )
    counts_pd.plot(
        kind="pie",
        y="count",
        labels=counts_pd["difficulty"],
        colors=sns.color_palette('Set2'),
        autopct="%1.1f%%",
    )
    plt.title("Questions by Difficulty")
    plt.ylabel("")  # Drop the default "count" axis caption
    plt.show()


questions_by_difficulty(code_questions_spark)

#### Difficulty vs. acceptance and difficulty vs. number of topics

In [0]:
from pyspark.sql.functions import col, split, size
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql.functions import when  # local: not part of this cell's import line

# Count the comma-separated topics of each question. With Spark's default
# settings size() of a NULL array is -1, which would pull the average down,
# so rows without topics get NULL instead and are ignored by avg().
code_questions_spark = code_questions_spark.withColumn(
    "num_topics",
    when(col("topics").isNotNull(), size(split(col("topics"), ","))),
)

# Average acceptance and average topic count per difficulty
df_grouped = code_questions_spark.groupBy("difficulty").agg(
    {"acceptance": "avg", "num_topics": "avg"}
).withColumnRenamed("avg(acceptance)", "avg_acceptance").withColumnRenamed("avg(num_topics)", "avg_num_topics")

df_grouped_pd = df_grouped.toPandas()

# groupBy returns rows in no particular order; draw the bars Easy -> Medium ->
# Hard when those labels are present, followed by any other label found.
known_difficulties = ["Easy", "Medium", "Hard"]
present_difficulties = list(df_grouped_pd["difficulty"].dropna().unique())
difficulty_order = (
    [d for d in known_difficulties if d in present_difficulties]
    + [d for d in present_difficulties if d not in known_difficulties]
) or None  # None lets seaborn fall back to its default when there is no data

plt.figure(figsize=(12, 6))
# Plot difficulty vs average acceptance
plt.subplot(1, 2, 1)
sns.barplot(x="difficulty", y="avg_acceptance", data=df_grouped_pd,
            order=difficulty_order, palette=sns.color_palette("Set2"))
plt.title("Difficulty vs Average Acceptance")
plt.xlabel("Difficulty")
plt.ylabel("Average Acceptance")

# Plot difficulty vs average number of topics
plt.subplot(1, 2, 2)
sns.barplot(x="difficulty", y="avg_num_topics", data=df_grouped_pd,
            order=difficulty_order, palette=sns.color_palette("Set2"))
plt.title("Difficulty vs Average Number of Topics")
plt.xlabel("Difficulty")
plt.ylabel("Average Number of Topics")

plt.tight_layout()
plt.show()

### 📊Visualizations for the processed data

In [0]:
from pyspark.sql import SparkSession
from consts import DATA_PATH, QUESTIONS_PATH, open_csv_file

# Start the Spark session (or reuse the one already running)
spark = (
    SparkSession.builder
    .appName("Job and Interview Analysis")
    .getOrCreate()
)

# Top code questions, plus the (question_id, difficulty) pairs from the full
# question list so each top question can be tagged with its difficulty
top_code_spark = open_csv_file(spark, DATA_PATH, 'top_code_questions.csv')
all_questions_spark = (
    open_csv_file(spark, QUESTIONS_PATH, 'all_code_questions_with_topics.csv')
    .select("question_id", "difficulty")
)

# Left join keeps every top question even when no difficulty row matches
top_jointed_code = top_code_spark.join(
    all_questions_spark, on="question_id", how="left"
)
top_open_spark = open_csv_file(spark, DATA_PATH, 'top_open_questions.csv')


#### Code questions analysis

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

pd_top_jointed_code = top_jointed_code.toPandas()
sns.set_theme(style="whitegrid")
colors = sns.color_palette("husl", 8)

fig, axes = plt.subplots(3, 2, figsize=(20, 20))

# Display names applied positionally to the sorted level values.
# NOTE(review): assumes the three level values sort as junior < mid < senior - confirm.
level_names = ['Junior', 'Mid', 'Senior']

# Distribution of Seniority Levels
level_counts = pd_top_jointed_code['level'].value_counts().sort_index()
# pie() requires one label per wedge; fall back to the raw level values when
# the data does not contain exactly the three expected levels.
level_labels = (
    level_names if len(level_counts) == len(level_names)
    else [str(value) for value in level_counts.index]
)
axes[0,0].pie(level_counts.values, labels=level_labels,
              autopct='%1.1f%%', colors=colors[:len(level_counts)],
              wedgeprops=dict(width=0.7))
axes[0,0].set_title('Distribution of Seniority Levels', pad=20)

# Top 10 Required Skills
all_skills = [skill.strip() for skills in pd_top_jointed_code['skills'].dropna()
              for skill in skills.split(',')]
# Ascending sort + last 10 entries = the 10 most frequent values
top_skills = pd.Series(Counter(all_skills)).sort_values(ascending=True)[-10:]
sns.barplot(y=top_skills.index, x=top_skills.values, palette=colors, ax=axes[0,1])
axes[0,1].set_title('Top 10 Required Skills', pad=20)
axes[0,1].set_xlabel('Frequency')

# Question Topics Distribution
all_topics = [topic.strip() for topics in pd_top_jointed_code['topics'].dropna()
              for topic in topics.split(',')]
top_topics = pd.Series(Counter(all_topics)).sort_values(ascending=True)[-10:]
sns.barplot(y=top_topics.index, x=top_topics.values, palette=colors, ax=axes[1,0])
axes[1,0].set_title('Top 10 Question Topics', pad=20)
axes[1,0].set_xlabel('Frequency')

# Top Industries
all_industries = [ind.strip() for industries in pd_top_jointed_code['company_industry'].dropna()
                 for ind in industries.split(',')]
all_industries = [ind for ind in all_industries if ind != "-"]  # "-" entries are dropped
top_industries = pd.Series(Counter(all_industries)).sort_values(ascending=True)[-10:]
sns.barplot(y=top_industries.index, x=top_industries.values, palette=colors, ax=axes[1,1])
axes[1,1].set_title('Top 10 Industries', pad=20)
axes[1,1].set_xlabel('Frequency')

# Difficulty Distribution by Level
difficulty_level = pd.crosstab(pd_top_jointed_code['difficulty'],
                             pd_top_jointed_code['level'])
# Reorder the index; a difficulty that never occurs becomes a row of zeros
# rather than NaN.
difficulty_level = difficulty_level.reindex(['Easy', 'Medium', 'Hard'], fill_value=0)

difficulty_level.plot(kind='bar', stacked=True,
                      color=colors[:len(difficulty_level.columns)], ax=axes[2,0])
axes[2,0].set_title('Question Difficulty Distribution by Seniority Level', pad=20)
axes[2,0].set_xlabel('Difficulty')
axes[2,0].set_ylabel('Count')
axes[2,0].tick_params(axis='x', rotation=0)
# Same positional naming as the pie chart, with the same fallback
legend_labels = (
    level_names if len(difficulty_level.columns) == len(level_names)
    else [str(value) for value in difficulty_level.columns]
)
axes[2,0].legend(legend_labels, bbox_to_anchor=(1, 1))

# Top Companies by Average Score
company_scores = pd_top_jointed_code.groupby('company_name')['heuristic_score'].mean()\
                .sort_values(ascending=True).tail(10)
sns.barplot(y=company_scores.index, x=company_scores.values, palette=colors, ax=axes[2,1])
axes[2,1].set_title('Top 10 Companies by Average Heuristic Score', pad=20)
axes[2,1].set_xlabel('Average Score')

# Add labels on bars for better readability. Each container is labelled exactly
# once; the previous loop re-labelled the first container once per bar, drawing
# the same text repeatedly on top of itself.
for container in axes[2,1].containers:
    axes[2,1].bar_label(container, fmt='%.3f', label_type='center', padding=5)

plt.tight_layout(pad=3.0)
plt.show()


#### Open questions analysis

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import explode, split, count

# Bring the top open questions into pandas for plotting
top_open_pd = top_open_spark.toPandas()

# Shared seaborn look for the figures below
sns.set_theme(style="whitegrid")

# --- Pie chart: share of questions per category ---
category_counts = top_open_pd['category'].value_counts()
colors = sns.color_palette("Blues", len(category_counts))
plt.figure(figsize=(10, 6))
plt.pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    wedgeprops={'edgecolor': 'white'},
)
plt.title('Distribution of Questions by Category', pad=20, fontsize=14)
plt.show()

# --- Ten most frequent fields and topics ---
# Fields are separated by ", " or ", and "; topics by plain commas.
all_fields = top_open_pd['field'].str.split(r', and |, ').explode().str.strip()
top_fields = all_fields.value_counts().head(10)
all_topics = top_open_pd['topics'].str.split(',').explode().str.strip()
top_topics = all_topics.value_counts().head(10)

# Both rankings use the same horizontal bar chart layout
for top_counts, chart_title, axis_caption in (
    (top_fields, 'Top 10 Job Fields', 'Fields'),
    (top_topics, 'Top 10 Question Topics', 'Topics'),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_counts.values, y=top_counts.index, palette='husl')
    plt.title(chart_title, pad=20, fontsize=14)
    plt.xlabel('Count')
    plt.ylabel(axis_caption)
    plt.tight_layout()
    plt.show()
