### Visualizations and EDA

In [0]:
from pyspark.sql import SparkSession
from consts import QUESTIONS_PATH, JOBS_PATH, open_csv_file

# Obtain the Spark session (getOrCreate reuses an already-active session).
spark = (
    SparkSession.builder
    .appName("Job and Interview Analysis")
    .getOrCreate()
)

# Load the three raw CSV datasets through the project helper from `consts`.
# NOTE(review): presumably each call returns a Spark DataFrame - confirm in consts.open_csv_file.
job_skills_spark = open_csv_file(spark, JOBS_PATH, "all_jobpostings_with_skills.csv")
open_questions_spark = open_csv_file(spark, QUESTIONS_PATH, "all_open_questions_with_topics.csv")
code_questions_spark = open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")


#### Most common seniority level for each job field and normalized job title (top 50 combinations)

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, udf
from pyspark.sql.types import StringType
import seaborn as sns

# Seniority labels indexed by their numeric level code; the map below is keyed
# by the code rendered as a string ("0", "1", "2").
_SENIORITY_LABELS = (
    "Internship",
    "Entry level/Associate",
    "Mid-Senior level/Manager and above",
)
seniority_map = {str(code): label for code, label in enumerate(_SENIORITY_LABELS)}

# Function to normalize job titles (remove seniority/level indicators like "Senior", "I", etc.)
def normalize_job_title(title):
    """Return a lowercase job title with seniority and level markers removed.

    Every character that is not an ASCII letter or whitespace is deleted (not
    replaced by a space), the title is lowercased, and whole words that are
    seniority terms ("senior", "lead", ...) or roman-numeral levels ("i".."v")
    are dropped. The remaining words are joined with single spaces.

    Args:
        title: Raw job title, or None.

    Returns:
        The normalized title, or None when ``title`` is None so that missing
        titles stay missing instead of raising inside the Spark UDF.
    """
    # Spark hands SQL NULLs to Python UDFs as None; propagate them.
    if title is None:
        return None

    # Sets give O(1) membership checks for the per-word filter below.
    seniority_terms = {'senior', 'junior', 'lead', 'principal', 'assistant', 'associate'}
    level_terms = {'i', 'ii', 'iii', 'iv', 'v'}  # Handles "I", "II", etc.
    dropped_terms = seniority_terms | level_terms

    # Normalize case and remove special characters except spaces
    cleaned = re.sub(r"[^a-zA-Z\s]", "", str(title).lower())

    # Remove seniority and level terms
    return " ".join(word for word in cleaned.split() if word not in dropped_terms)

# Register the function as a UDF for Spark
normalize_job_title_udf = udf(normalize_job_title, StringType())

# Clean data: drop rows missing any column used below, normalize the title,
# lowercase the field, and translate the `level` code into its label.
# Level codes other than "0"/"1"/"2" become NULL because the `when` chain has
# no `otherwise` branch.
job_skills_cleaned = (
    job_skills_spark
    .dropna(subset=["skills", "job_summary", "company_industry", "field", "job_title"])
    .withColumn("job_title_normalized", normalize_job_title_udf(col("job_title")))
    .withColumn("field", F.lower(col("field")))
    .withColumn(
        "seniority_level",
        when(col("level") == "0", seniority_map["0"])
        .when(col("level") == "1", seniority_map["1"])
        .when(col("level") == "2", seniority_map["2"]),
    )
    .drop("level")
    .cache()
)

# Count postings per (field, normalized title, seniority level); only the
# aggregated counts are brought to the driver.
level_by_title_field = (
    job_skills_cleaned
    .groupBy("field", "job_title_normalized", "seniority_level")
    .count()
    .toPandas()
)

# Define the correct order for seniority levels
seniority_order = ["Internship", "Entry level/Associate", "Mid-Senior level/Manager and above"]

# Convert the seniority_level column into a categorical type with the defined order
level_by_title_field["seniority_level"] = pd.Categorical(
    level_by_title_field["seniority_level"],
    categories=seniority_order,
    ordered=True
)

# Number of most frequent (field, title, seniority) combinations to plot
TOP_N = 50
top_50 = level_by_title_field.sort_values(by="count", ascending=False).head(TOP_N)

# One panel per seniority level. The x axis is NOT shared: every panel has its
# own set of titles, and a shared categorical axis would show only the bottom
# panel's tick labels for all three.
fig, axes = plt.subplots(len(seniority_order), 1, figsize=(16, 15), sharex=False)

for ax, level in zip(axes, seniority_order):
    data = top_50[top_50["seniority_level"] == level]

    ax.set_title(f"{level} Job Titles", fontsize=16)
    ax.set_ylabel("Count", fontsize=14)
    ax.set_xlabel("Job Title", fontsize=14)

    if data.empty:
        # A level may have no combination among the top TOP_N; say so instead
        # of drawing an empty bar chart.
        ax.text(0.5, 0.5, f"No {level} titles in the top {TOP_N}",
                ha="center", va="center", transform=ax.transAxes)
        ax.set_xticks([])
        continue

    # The same normalized title can occur in several fields. Label each bar
    # with both so every row gets its own bar; plotting by title alone would
    # make seaborn average the duplicate rows.
    labels = (data["job_title_normalized"] + " (" + data["field"] + ")").tolist()
    positions = list(range(len(labels)))
    ax.bar(
        positions,
        data["count"].tolist(),
        color=sns.color_palette("Set2", len(labels)),
        edgecolor="none",  # Remove black lines
    )
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90, fontsize=12)

plt.tight_layout()
plt.show()


#### word cloud on the topics for the different categories

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, split, explode, trim, collect_list, lit, array_except, array
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Create a Spark session (getOrCreate returns the already-running session if there is one)
spark = SparkSession.builder.appName("WordCloudVisualization").getOrCreate()

# Define stop words, including "skill" and "skills"
stop_words = ["and", "of", "the", "a", "in", "to", "skill", "skills"]

# Build the stop-word array column from individual literals: lit() only accepts
# a Python list from PySpark 3.4 onwards, whereas array(lit, lit, ...) works on
# every version.
stop_words_col = array(*[lit(word) for word in stop_words])

# Process topics column (handle nulls, lowercase, split, explode, trim, and remove stop words).
# After the final step `topics` is an array holding either the single word or nothing
# (when the word was a stop word).
processed_df = (
    open_questions_spark
    .filter(col("topics").isNotNull())  # Exclude rows with null topics
    .withColumn("topics", lower(col("topics")))  # Convert to lowercase
    .withColumn("topics", split(col("topics"), ",\\s*"))  # Split by commas
    .withColumn("topics", explode(col("topics")))  # Explode into individual sub-topics
    .withColumn("topics", trim(col("topics")))  # Trim whitespace
    .withColumn("topics", split(col("topics"), "\\s+"))  # Split multi-word phrases into words
    .withColumn("topics", explode(col("topics")))  # Explode words into separate rows
    .withColumn("topics", array_except(array(col("topics")), stop_words_col))  # Remove stop words
)

# Aggregate topics for each category
category_topics_df = (
    processed_df
    .groupBy("category")
    .agg(collect_list("topics").alias("all_topics"))  # List of 0- or 1-element arrays
)

# Convert PySpark DataFrame to Pandas for visualization
category_topics_pd = category_topics_df.toPandas()

# Generate and visualize word clouds
for _, row in category_topics_pd.iterrows():
    category = row["category"]
    # Flatten the list of single-word arrays into one space-separated string
    topics = " ".join(str(item) for sublist in row["all_topics"] for item in sublist)

    # WordCloud.generate raises ValueError when there is no word to draw
    if not topics.strip():
        print(f"Skipping category {category!r}: no topic words left after stop-word removal")
        continue

    # Generate a word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="viridis",
        max_words=100,
        contour_color="black",
        contour_width=1
    ).generate(topics)

    # Plot the word cloud using matplotlib
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Category: {category}", fontsize=16)
    plt.show()


#### difficulty vs. acceptance + difficulty vs. num of topics

In [0]:
from pyspark.sql.functions import col, split, size
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql.functions import avg, when

# Step 1: Add a column for the number of topics.
# size() of a NULL array is -1 under Spark's default (non-ANSI) settings, which
# would drag the average down; keep num_topics NULL for rows without topics so
# that avg() ignores them.
num_topics_expr = when(col("topics").isNotNull(), size(split(col("topics"), ",")))
code_questions_spark = code_questions_spark.withColumn("num_topics", num_topics_expr)

# Step 2: Group by difficulty and calculate the average acceptance and average number of topics.
# Explicit aliases avoid depending on Spark's generated "avg(col)" column names.
df_grouped = code_questions_spark.groupBy("difficulty").agg(
    avg("acceptance").alias("avg_acceptance"),
    avg("num_topics").alias("avg_num_topics"),
)

# Step 3: Convert the result to Pandas for plotting (one row per difficulty)
df_grouped_pd = df_grouped.toPandas()

# Step 4: Plot the results side by side
fig, (acceptance_ax, topics_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Plot difficulty vs average acceptance
sns.barplot(x="difficulty", y="avg_acceptance", data=df_grouped_pd,
            palette=sns.color_palette("Set2"), ax=acceptance_ax)
acceptance_ax.set_title("Difficulty vs Average Acceptance")
acceptance_ax.set_xlabel("Difficulty")
acceptance_ax.set_ylabel("Average Acceptance")

# Plot difficulty vs average number of topics
sns.barplot(x="difficulty", y="avg_num_topics", data=df_grouped_pd,
            palette=sns.color_palette("Set2"), ax=topics_ax)
topics_ax.set_title("Difficulty vs Average Number of Topics")
topics_ax.set_xlabel("Difficulty")
topics_ax.set_ylabel("Average Number of Topics")

plt.tight_layout()
plt.show()


#### word cloud for skills or job summary, see the relationship between field and company_industry


In [0]:
import pandas as pd
from pyspark.sql import functions as F
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import plotly.express as px

# Input: the job-postings Spark DataFrame `job_skills_spark`.

# Step 1: keep only rows where all four text columns are present, then
# lowercase each of those columns in place.
text_columns = ["skills", "job_summary", "company_industry", "field"]
job_skills_cleaned = job_skills_spark.dropna(subset=text_columns)
for column_name in text_columns:
    job_skills_cleaned = job_skills_cleaned.withColumn(column_name, F.lower(F.col(column_name)))

# Step 2: one row per comma-separated skill, collected to the driver as a list of strings
skills_df = job_skills_cleaned.select(
    F.explode(F.split(F.col("skills"), ",")).alias("skill")
)
skills_list = [record.skill for record in skills_df.collect()]

# Step 3: build the skills word cloud from the space-joined skill strings
skills_text_for_cloud = " ".join(skills_list)
skills_wordcloud = WordCloud(
    stopwords=ENGLISH_STOP_WORDS,
    background_color="white",
    width=800,
    height=400,
).generate(skills_text_for_cloud)
# Step 4: Job summary preprocessing for the word cloud
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ENGLISH_STOP_WORDS removed.

    Tokens are compared as-is (no lowercasing or punctuation stripping here);
    the surviving tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in ENGLISH_STOP_WORDS, text.split())
    return " ".join(kept_tokens)

# Strip stop words from each job summary on the executors, then collect the cleaned strings
cleaned_summaries_rdd = job_skills_cleaned.rdd.map(
    lambda record: remove_stopwords(record["job_summary"])
)
job_summary_list = cleaned_summaries_rdd.collect()

summary_text_for_cloud = " ".join(job_summary_list)
job_summary_wordcloud = WordCloud(
    stopwords=ENGLISH_STOP_WORDS,
    background_color="white",
    width=800,
    height=400,
).generate(summary_text_for_cloud)

# Step 5: count postings per (field, company_industry) pair; plotted in the next cell
field_industry_counts = (
    job_skills_cleaned
    .groupBy("field", "company_industry")
    .count()
    .toPandas()
)

# Step 6: show both word clouds stacked vertically in one figure
clouds_fig, cloud_axes = plt.subplots(2, 1, figsize=(10, 10))
cloud_panels = [
    (skills_wordcloud, "Word Cloud for Skills"),
    (job_summary_wordcloud, "Word Cloud for Job Summary"),
]
for cloud_ax, (cloud, cloud_title) in zip(cloud_axes, cloud_panels):
    cloud_ax.imshow(cloud, interpolation="bilinear")
    cloud_ax.set_title(cloud_title)
    cloud_ax.axis("off")

plt.tight_layout()
plt.show()


In [0]:

# Step 7: stacked Plotly bar chart of posting counts per field, coloured by company industry.
# Uses `field_industry_counts` and `px` from the previous cell.
axis_labels = {
    "field": "Field",
    "count": "Count",
    "company_industry": "Company Industry",
}
fig = px.bar(
    field_industry_counts,
    x="field",
    y="count",
    color="company_industry",
    labels=axis_labels,
    title="Relationship Between Field and Company Industry",
)
fig.show()

In [0]:
from pyspark.sql.functions import lower, explode, split
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import plotly.graph_objects as go
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('stopwords')
# NOTE: this rebinds `stop_words` (a list in the topics word-cloud cell) to the NLTK English set.
stop_words = set(stopwords.words('english'))

# Process skills
# Convert to lowercase and split by comma: one row per skill
skills_df = job_skills_spark.select(
    explode(split(lower("skills"), ",")).alias("skill")
).dropna()

# Convert to pandas for visualization and join all skills into one string
skills_pd = skills_df.toPandas()
skills_text = " ".join(skills_pd["skill"].str.strip())

# Create word cloud for skills
wordcloud_skills = WordCloud(width=800, height=400,
                           background_color='white',
                           min_font_size=10).generate(skills_text)

# A single figure for the cloud (an extra plt.figure() call before building the
# cloud used to render a stray blank figure).
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud_skills, interpolation='bilinear')
plt.axis('off')
plt.title('Skills Word Cloud')
plt.show()

# Job summaries: lowercase in Spark, drop missing values, then bring to pandas.
# Stop words are removed afterwards on the pandas side.
job_summary_df = (
    job_skills_spark
    .select(lower("job_summary").alias("summary"))
    .dropna()
)
job_summary_pd = job_summary_df.toPandas()

# Text cleaning helper for the job-summary word cloud
def clean_text(text):
    """Lowercase ``text`` and drop tokens contained in the module-level ``stop_words``.

    Non-string input (for example a missing value) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    kept_tokens = [token for token in text.lower().split() if token not in stop_words]
    return " ".join(kept_tokens)

# Clean every summary and concatenate them into one text blob
cleaned_summaries = job_summary_pd["summary"].apply(clean_text)
job_summary_text = " ".join(cleaned_summaries)

# Build and draw the job-summary word cloud on its own figure
summary_fig, summary_ax = plt.subplots(figsize=(12, 8))
wordcloud_summary = WordCloud(
    width=800,
    height=400,
    background_color='white',
    min_font_size=10,
).generate(job_summary_text)

summary_ax.imshow(wordcloud_summary, interpolation='bilinear')
summary_ax.axis('off')
summary_ax.set_title('Job Summary Word Cloud')
plt.show()


In [0]:
from pyspark.sql.functions import col, trim

# Process field and company_industry relationship.
# Spark allows only ONE generator (explode) per select clause, so the two
# columns are exploded in separate steps; the result is every
# (field, industry) combination listed on a posting.
relationship_df = (
    job_skills_spark
    .select(
        lower(col("field")).alias("field"),
        lower(col("company_industry")).alias("industry"),
    )
    .dropna()
    .withColumn("field", explode(split(col("field"), ",")))
    .withColumn("industry", explode(split(col("industry"), ",")))
    # "a, b" splits into "a" and " b": trim so both spellings count as one value
    .withColumn("field", trim(col("field")))
    .withColumn("industry", trim(col("industry")))
    .filter((col("field") != "") & (col("industry") != ""))
)

# Convert to pandas and count relationships
relationship_pd = relationship_df.toPandas()
relationship_counts = relationship_pd.groupby(['field', 'industry']).size().reset_index(name='count')

# Create node lists and mappings.
# Fields and industries get separate index ranges so that a name occurring on
# both sides becomes two nodes; a shared node would create circular links,
# which a Sankey diagram cannot lay out.
field_nodes = sorted(relationship_counts['field'].unique())
industry_nodes = sorted(relationship_counts['industry'].unique())
nodes = field_nodes + industry_nodes
field_to_idx = {name: idx for idx, name in enumerate(field_nodes)}
industry_to_idx = {name: len(field_nodes) + idx for idx, name in enumerate(industry_nodes)}

# Create source, target, and value lists for Sankey diagram
sources = [field_to_idx[field] for field in relationship_counts['field']]
targets = [industry_to_idx[industry] for industry in relationship_counts['industry']]
values = relationship_counts['count'].tolist()

# Create Sankey diagram: fields on the left flow into industries on the right
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        color="blue"
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
)])

fig.update_layout(
    title_text="Relationship between Fields and Company Industries",
    font_size=12,
    height=800
)

fig.show()

In [0]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyspark.sql.functions import lower, explode, split

from pyspark.sql.functions import col, trim

# Process field and company_industry relationship.
# Spark allows only ONE generator (explode) per select clause, so each column
# is exploded in its own step.
relationship_df = (
    job_skills_spark
    .select(
        lower(col("field")).alias("field"),
        lower(col("company_industry")).alias("industry"),
    )
    .dropna()
    .withColumn("field", explode(split(col("field"), ",")))
    .withColumn("industry", explode(split(col("industry"), ",")))
    # Trim so that "a" and " a" (from "x, a") are the same graph node
    .withColumn("field", trim(col("field")))
    .withColumn("industry", trim(col("industry")))
    .filter((col("field") != "") & (col("industry") != ""))
)

# Convert to pandas and count relationships
relationship_pd = relationship_df.toPandas()
relationship_counts = relationship_pd.groupby(['field', 'industry']).size().reset_index(name='count')

# Create a graph
G = nx.Graph()

# Add edges with weights (one undirected edge per field/industry pair)
for _, row in relationship_counts.iterrows():
    G.add_edge(row['field'], row['industry'], weight=row['count'])

if G.number_of_edges() == 0:
    # Nothing to normalize or draw; max() on an empty edge set would raise.
    print("No field/industry relationships to plot.")
else:
    # Get maximum weight for normalization
    max_weight = max(weight for _, _, weight in G.edges(data="weight"))

    # Set up the plot
    plt.figure(figsize=(10, 10))

    # Create layout; a fixed seed makes the picture reproducible across runs
    pos = nx.spring_layout(G, k=1, iterations=50, seed=42)

    # Draw the network
    # Edge width is proportional to the pair count, scaled to a maximum of 5
    edge_weights = [G[u][v]['weight'] / max_weight * 5 for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, alpha=0.2, width=edge_weights)

    # Nodes
    nx.draw_networkx_nodes(G, pos,
                          node_color='lightblue',
                          node_size=2000,
                          alpha=0.6)

    # Labels
    nx.draw_networkx_labels(G, pos,
                           font_size=8,
                           font_weight='bold')

    # Add title and remove axes
    plt.title("Industry-Field Relationships", fontsize=16, pad=20)
    plt.axis('off')

    # Adjust layout
    plt.tight_layout()

    # Show plot
    plt.show()


In [0]:
# Create filtered version: keep only pairs whose count is strictly above the median
# Calculate threshold
threshold = relationship_counts['count'].median()

# Create filtered graph
G_filtered = nx.Graph()

strong_pairs = relationship_counts[relationship_counts['count'] > threshold]
for _, row in strong_pairs.iterrows():
    G_filtered.add_edge(row['field'], row['industry'], weight=row['count'])

if G_filtered.number_of_edges() == 0:
    # Happens when no pair exceeds the median (e.g. all counts are equal);
    # max() on an empty edge set would raise ValueError.
    print(f"No field/industry pair has a count above the median ({threshold}); nothing to plot.")
else:
    # Get maximum weight for filtered graph
    max_weight_filtered = max(weight for _, _, weight in G_filtered.edges(data="weight"))

    # Create filtered visualization
    plt.figure(figsize=(15, 15))

    # Fixed seed keeps the layout reproducible across runs
    pos_filtered = nx.spring_layout(G_filtered, k=1, iterations=50, seed=42)

    # Edge width is proportional to the pair count, scaled to a maximum of 5
    edge_weights_filtered = [
        G_filtered[u][v]['weight'] / max_weight_filtered * 5
        for u, v in G_filtered.edges()
    ]
    nx.draw_networkx_edges(G_filtered, pos_filtered, alpha=0.4, width=edge_weights_filtered)

    nx.draw_networkx_nodes(G_filtered, pos_filtered,
                          node_color='lightblue',
                          node_size=2000,
                          alpha=0.6)

    nx.draw_networkx_labels(G_filtered, pos_filtered,
                           font_size=10,
                           font_weight='bold')

    plt.title("Industry-Field Relationships (Strong Connections Only)", fontsize=16, pad=20)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [0]:
from pyspark.sql.functions import col, explode, split, count, avg
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Alias only: no file is read here; job_skills_spark was loaded in the first cell.
job_data = job_skills_spark

# Helper Functions
def preprocess_skills(data):
    """Count how often each skill occurs.

    The ``skills`` column of ``data`` is split on ", " (comma followed by a
    space), exploded to one row per skill, grouped, and counted. The result has
    the columns ``skill`` and ``count`` and is ordered by ``count`` descending.
    """
    one_skill_per_row = data.withColumn("skill", explode(split(col("skills"), ", ")))
    skill_counts = one_skill_per_row.groupBy("skill").count()
    return skill_counts.orderBy(col("count").desc())

# Most In-Demand Skills
def most_in_demand_skills(data):
    """Draw a word cloud in which each skill is sized by its count from preprocess_skills."""
    skill_counts = preprocess_skills(data).toPandas()
    frequencies = dict(zip(skill_counts["skill"], skill_counts["count"]))
    cloud = WordCloud(background_color="white").generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Most In-Demand Skills")
    plt.show()

# Render the skills word cloud for the job-postings data
most_in_demand_skills(job_data)

In [0]:
from pyspark.sql.functions import col, explode, split, count, avg
import matplotlib.pyplot as plt
import seaborn as sns

# Alias only: no file is read here; code_questions_spark was loaded in the first cell.
code_questions_data = code_questions_spark

# Questions by Difficulty
def questions_by_difficulty(data):
    """Show a pie chart with the share of questions per ``difficulty`` value."""
    counts_desc = data.groupBy("difficulty").count().orderBy(col("count").desc())
    difficulty_counts = counts_desc.toPandas()

    pie_ax = difficulty_counts.plot(
        kind="pie",
        y="count",
        labels=difficulty_counts["difficulty"],
        colors=sns.color_palette('Set2'),
        autopct="%1.1f%%",
    )
    pie_ax.set_title("Questions by Difficulty")
    pie_ax.set_ylabel("")  # hide the default "count" axis label
    plt.show()
# Render the difficulty pie chart for the code questions
questions_by_difficulty(code_questions_data)


### Visualizations for the processed data

In [0]:
from pyspark.sql import SparkSession
from consts import DATA_PATH, QUESTIONS_PATH, open_csv_file

# Obtain the Spark session (getOrCreate reuses an already-active session).
spark = (
    SparkSession.builder
    .appName("Job and Interview Analysis")
    .getOrCreate()
)

# Top code questions, plus the difficulty of every code question for enrichment
top_code_spark = open_csv_file(spark, DATA_PATH, "top_code_questions.csv")
all_questions_spark = (
    open_csv_file(spark, QUESTIONS_PATH, "all_code_questions_with_topics.csv")
    .select("question_id", "difficulty")
)

# A left join keeps every top question even when no difficulty is found for it
top_jointed_code = top_code_spark.join(all_questions_spark, on="question_id", how="left")
display(top_jointed_code.limit(5))

# Top open questions are used as loaded
top_open_spark = open_csv_file(spark, DATA_PATH, "top_open_questions.csv")
display(top_open_spark.limit(5))

Explanation of the code-question plots below 👇
1. Question Difficulty and Compatibility: <br>
* There's a clear relationship between question difficulty and compatibility scores
* We can see how different difficulty levels map to different seniority requirements

2. Seniority Level Distribution: <br>
* The distribution of questions across seniority levels shows the focus of hiring
* We can see which level has the most compatible questions

3. Skills and Topics Analysis: <br>
* The top required skills visualization shows which technical skills are most in demand
* The question topics distribution reveals what technical areas are most commonly tested
* This can help in understanding the alignment between job requirements and interview questions

4. Industry and Company Insights: <br>
* The top industries visualization shows which sectors are most active in technical hiring
* We can see patterns in how different industries approach technical questions
* The company analysis reveals which organizations have the highest compatibility scores

5. Question Similarity Impact: <br>
* The scatter plot shows the relationship between question similarity and compatibility scores
* This helps understand if commonly asked questions are more or less compatible with job requirements

6. Level-based Insights: <br>
* The difficulty distribution by level shows how question complexity varies with seniority
* The average compatibility scores by level reveal how well questions match different career stages

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# Convert the joined Spark DataFrame (top code questions + difficulty) to pandas
df = top_jointed_code.toPandas()

# Set the style for better-looking plots.
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and 3.8
# removed the old name, so pick whichever this installation provides.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)

# Display names for the numeric seniority codes used in the `level` column
SENIORITY_NAMES = {0: 'Junior', 1: 'Mid', 2: 'Senior'}


def seniority_labels(levels):
    """Map level codes (int, float or numeric string) to display names.

    Labels are derived from the codes actually present, so a missing level
    cannot shift or mismatch the labels. Unknown codes are shown as-is.
    """
    labels = []
    for level in levels:
        try:
            labels.append(SENIORITY_NAMES.get(int(float(level)), str(level)))
        except (TypeError, ValueError, OverflowError):
            labels.append(str(level))
    return labels


def top_comma_separated(column, n=10):
    """Count comma-separated values in ``column`` and return the ``n`` most frequent.

    Missing cells are skipped and each value is stripped of surrounding
    whitespace. The result is a Series sorted ascending by frequency.
    """
    values = [value.strip() for cell in column.dropna() for value in cell.split(',')]
    return pd.Series(Counter(values)).sort_values(ascending=True)[-n:]


# 1. Distribution of Question Difficulties vs Average Heuristic Score
fig, ax = plt.subplots(figsize=(10, 6))
difficulty_scores = df.groupby('difficulty')['heuristic_score'].mean().sort_values(ascending=False)
sns.barplot(x=difficulty_scores.index, y=difficulty_scores.values, ax=ax)
ax.set_title('Average Compatibility Score by Question Difficulty')
ax.set_ylabel('Average Heuristic Score')
plt.show()

# 2. Distribution of Seniority Levels
fig, ax = plt.subplots(figsize=(8, 8))
level_counts = df['level'].value_counts().sort_index()
ax.pie(level_counts.values, labels=seniority_labels(level_counts.index), autopct='%1.1f%%')
ax.set_title('Distribution of Seniority Levels')
plt.show()

# 3. Top 10 Required Skills
fig, ax = plt.subplots(figsize=(10, 6))
top_skills = top_comma_separated(df['skills'])
sns.barplot(y=top_skills.index, x=top_skills.values, ax=ax)
ax.set_title('Top 10 Required Skills')
ax.set_xlabel('Frequency')
plt.show()

# 4. Question Topics Distribution
fig, ax = plt.subplots(figsize=(10, 6))
top_topics = top_comma_separated(df['topics'])
sns.barplot(y=top_topics.index, x=top_topics.values, ax=ax)
ax.set_title('Top 10 Question Topics')
ax.set_xlabel('Frequency')
plt.show()

# 5. Correlation between Number of Similar Questions and Heuristic Score
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='no_similar_questions', y='heuristic_score', ax=ax)
ax.set_title('Correlation: Similar Questions vs Compatibility Score')
ax.set_xlabel('Number of Similar Questions')
ax.set_ylabel('Heuristic Score')
plt.show()

# 6. Top Industries
fig, ax = plt.subplots(figsize=(10, 6))
top_industries = top_comma_separated(df['company_industry'])
sns.barplot(y=top_industries.index, x=top_industries.values, ax=ax)
ax.set_title('Top 10 Industries')
ax.set_xlabel('Frequency')
plt.show()

# 7. Difficulty Distribution by Level
# Pass the axes explicitly: DataFrame.plot otherwise opens its own figure and
# leaves the one created here empty.
fig, ax = plt.subplots(figsize=(10, 6))
difficulty_level = pd.crosstab(df['difficulty'], df['level'])
difficulty_level.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Question Difficulty Distribution by Seniority Level')
ax.set_xlabel('Difficulty')
ax.set_ylabel('Count')
ax.legend(seniority_labels(difficulty_level.columns))
plt.show()

# 8. Average Heuristic Score by Level
fig, ax = plt.subplots(figsize=(10, 6))
level_scores = df.groupby('level')['heuristic_score'].mean()
sns.barplot(x=seniority_labels(level_scores.index), y=level_scores.values, ax=ax)
ax.set_title('Average Compatibility Score by Seniority Level')
ax.set_xlabel('Seniority Level')
ax.set_ylabel('Average Heuristic Score')
plt.show()

# Additional analysis for company and field insights
print("\nTop 5 Companies by Average Heuristic Score:")
company_scores = df.groupby('company_name')['heuristic_score'].mean().sort_values(ascending=False).head()
display(company_scores)

print("\nCorrelation between number of similar questions and heuristic score:")
correlation = df['no_similar_questions'].corr(df['heuristic_score'])
display(correlation)

Explanation of the open-question plots below 👇
1. Category Distribution: <br>
* Shows the proportion of questions between data science and general categories
* Helps understand the balance of technical vs. soft skills questions

2. Seniority Level Analysis: <br>
* Reveals how heuristic scores vary across different seniority levels
* Helps identify if certain levels have consistently higher compatibility scores

3. Skills Analysis: <br>
* Identifies the most frequently required skills across job postings
* Useful for understanding which skills are most in demand

4. Industry Insights: <br>
* Shows which industries have the most relevant questions
* Helps identify sectors with specific question patterns

5. Level-Score Correlation: <br>
* Demonstrates any relationship between seniority and question compatibility
* Useful for understanding if question complexity aligns with job level

6. Field Distribution: <br>
* Reveals the most common job fields in the dataset
* Helps understand which areas have the most specialized questions

7. Topic Analysis: <br>
* Shows the most common question topics
* Helps identify patterns in question content

8. Score Distribution: <br>
* Shows the overall distribution of heuristic scores
* Helps understand the general compatibility levels of questions

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import explode, split, count

# Convert the top open questions to pandas for plotting
pdf = top_open_spark.toPandas()

# Set the style for all plots.
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and 3.8
# removed the old name, so pick whichever this installation provides.
open_plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(open_plot_style)

# 1. Distribution of Questions by Category
fig, ax = plt.subplots(figsize=(10, 6))
category_counts = pdf['category'].value_counts()
ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%',
       colors=['lightskyblue', 'deepskyblue'],
       wedgeprops={'edgecolor': 'white'})
ax.set_title('Distribution of Questions by Category', pad=20, fontsize=14)
plt.show()

# 2. Average Heuristic Score by Level
fig, ax = plt.subplots(figsize=(10, 6))
level_scores = pdf.groupby('level')['heuristic_score'].mean().sort_index()

# Derive legend names and colours from the levels actually present, so a
# missing level cannot shift the legend against the bars.
level_names = {0: 'Junior', 1: 'Mid', 2: 'Senior'}
level_codes = pd.to_numeric(pd.Series(level_scores.index), errors='coerce')
level_labels = [
    level_names.get(code, str(raw))
    for code, raw in zip(level_codes, level_scores.index)
]
seniority_colors = ['thistle', 'plum', 'orchid']
bar_colors = [seniority_colors[i % len(seniority_colors)] for i in range(len(level_scores))]

bars = ax.bar(level_scores.index, level_scores.values, color=bar_colors, width=0.4)
ax.set_title('Average Heuristic Score by Seniority Level', pad=20, fontsize=14)
ax.set_xlabel('Seniority Level (0=Junior, 1=Mid, 2=Senior)')
ax.set_ylabel('Average Heuristic Score')
# Add value labels on top of bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}',
            ha='center', va='bottom')
legend_elements = [plt.Rectangle((0,0),1,1, color=color) for color in bar_colors]
ax.legend(legend_elements, level_labels,
          title='Seniority Level', loc='upper right', bbox_to_anchor=(1.15, 1))
ax.grid(True, alpha=0.3)
plt.show()

# 3. Top 10 Required Skills
fig, ax = plt.subplots(figsize=(10, 6))
all_skills = pdf['skills'].str.split(',').explode().str.strip()
top_skills = all_skills.value_counts().head(10)
sns.barplot(x=top_skills.values, y=top_skills.index, palette='RdYlBu', ax=ax)
ax.set_title('Top 10 Required Skills', pad=20, fontsize=14)
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

# 4. Distribution of Questions by Company Industry
fig, ax = plt.subplots(figsize=(10, 6))
# Industries are separated by ", " or ", and "; '-' entries are discarded
all_industries = pdf['company_industry'].str.split(r', and |, ').explode().str.strip()
all_industries = all_industries[all_industries != '-']
top_industries = all_industries.value_counts().head(10)
sns.barplot(x=top_industries.values, y=top_industries.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Company Industries', pad=20, fontsize=14)
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

# 5. Correlation between Level and Heuristic Score
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(pdf['level'], pdf['heuristic_score'], color='#FF6B6B', alpha=0.6, s=100)
ax.set_title('Correlation: Seniority Level vs Heuristic Score', pad=20, fontsize=14)
ax.set_xlabel('Seniority Level')
ax.set_ylabel('Heuristic Score')
ax.grid(True, alpha=0.3)
plt.show()

# 6. Distribution of Questions by Field
fig, ax = plt.subplots(figsize=(10, 6))
all_fields = pdf['field'].str.split(r', and |, ').explode().str.strip()
top_fields = all_fields.value_counts().head(10)
sns.barplot(x=top_fields.values, y=top_fields.index, palette='mako', ax=ax)
ax.set_title('Top 10 Job Fields', pad=20, fontsize=14)
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

# 7. Distribution of Topics
fig, ax = plt.subplots(figsize=(10, 6))
all_topics = pdf['topics'].str.split(',').explode().str.strip()
top_topics = all_topics.value_counts().head(10)
sns.barplot(x=top_topics.values, y=top_topics.index, palette='husl', ax=ax)
ax.set_title('Top 10 Question Topics', pad=20, fontsize=14)
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

# 8. Heuristic Score Distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(pdf['heuristic_score'], bins=20, color='pink', alpha=0.7, ax=ax)
ax.set_title('Distribution of Heuristic Scores', pad=20, fontsize=14)
ax.set_xlabel('Heuristic Score')
ax.set_ylabel('Count')
ax.grid(True, alpha=0.3)
plt.show()