Imports

In [31]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display

# Global plot styling. The seaborn theme is applied first and the default
# figure size second, so the size set here is the one that ends up in rcParams.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))

Load the data from the merged jobs file

In [32]:
# Load Data
# Path is relative to the notebook's working directory.
JOBS_CSV_PATH = '../data/merged/merged_data_jobs.csv'
jobs = pd.read_csv(JOBS_CSV_PATH)

Filter widgets that let the user choose which filters are applied to the charts below.

In [33]:
# Filter Widgets

def _dropdown_options(column):
    """Build the option list for a filter dropdown from one column of ``jobs``.

    Returns ``['All']`` followed by the column's distinct values in sorted
    order. Missing values are dropped and the remaining values are cast to
    ``str`` before sorting, so ``sorted`` never has to compare NaN (a float)
    with strings, which would raise ``TypeError``.
    """
    return ['All'] + sorted(jobs[column].dropna().astype(str).unique())


def _make_dropdown(column, description):
    """Create a 40%-wide dropdown offering 'All' plus the values of ``column``."""
    return widgets.Dropdown(
        options=_dropdown_options(column),
        description=description,
        layout=widgets.Layout(width='40%')
    )


# How many entries the "Top N" charts should show.
top_n_input = widgets.IntText(
    value=10,
    description='Top N:',
    layout=widgets.Layout(width='30%')
)

# One dropdown per categorical filter; every one is built the same way so a
# column containing missing values cannot break option sorting.
industry_dd = _make_dropdown('Industry', 'Industry:')
company_dd = _make_dropdown('Company Name', 'Company:')
country_dd = _make_dropdown('Country', 'Country:')
state_dd = _make_dropdown('State', 'State:')
city_dd = _make_dropdown('City', 'City:')
exp_cat_dd = _make_dropdown('Experience Category', 'Exp. Level:')

# Years-of-experience bound (0-20, default 5).
experience_slider = widgets.IntSlider(
    value=5,
    min=0,
    max=20,
    step=1,
    description='Experience ≤',
    layout=widgets.Layout(width='60%')
)


The helper functions to plot the graphs of different types

In [35]:
def plot_pie(data, labels_col, values_col, title='', colors='pastel', figsize=(7, 7)):
    """
    Draw a pie chart from two columns of a DataFrame and show it.

    Parameters:
    - data: DataFrame holding one row per slice.
    - labels_col: Name of the column that supplies the slice labels.
    - values_col: Name of the column that supplies the slice sizes.
    - title: Text shown above the chart.
    - colors: Passed straight to sns.color_palette (a palette name or a
      list of colors).
    - figsize: (width, height) of the figure.
    """
    fig, ax = plt.subplots(figsize=figsize)
    slice_colors = sns.color_palette(colors)
    ax.pie(
        data[values_col],
        labels=data[labels_col],
        autopct='%1.1f%%',  # percentage with one decimal place on each slice
        colors=slice_colors,
        startangle=140
    )
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def plot_bar(data, x, y, title='', xlabel='', ylabel='', rotation=90, palette='pastel', figsize=(10, 5)):
    """
    Draw a seaborn bar chart from two columns of a DataFrame and show it.

    Parameters:
    - data: DataFrame with the values to plot.
    - x: Name of the column mapped to the x-axis.
    - y: Name of the column mapped to the y-axis.
    - title: Text shown above the chart.
    - xlabel: x-axis label; falls back to the x column name when empty.
    - ylabel: y-axis label; falls back to the y column name when empty.
    - rotation: Angle applied to the x-axis tick labels.
    - palette: Seaborn palette handed to sns.barplot.
    - figsize: (width, height) of the figure.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # NOTE(review): palette is passed without hue; newer seaborn releases may
    # emit a deprecation warning for this - confirm against the installed version.
    sns.barplot(data=data, x=x, y=y, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    plt.setp(ax.get_xticklabels(), rotation=rotation)
    fig.tight_layout()
    plt.show()


The methods to build the different visualizations

In [47]:
# 1. Number of Jobs vs Industry
def jobs_vs_industry(jobs, top_n=10):
    """Bar chart of the top_n most frequent 'Industry' values in jobs.

    Job count is passed as the x column and industry as the y column.
    """
    industry_counts = (
        jobs['Industry']
        .value_counts()
        .head(top_n)
        .rename_axis('Industry')
        .reset_index(name='Job Count')
    )
    plot_bar(
        data=industry_counts,
        x='Job Count',
        y='Industry',
        title=f'Top {top_n} Industries by Number of jobs',
        xlabel='Number of Jobs',
        ylabel='Industry'
    )

# 2. Number of Jobs vs Country
def jobs_vs_country(jobs, top_n=10):
    """Bar chart of the top_n most frequent 'Country' values in jobs."""
    country_counts = (
        jobs['Country']
        .value_counts()
        .head(top_n)
        .rename_axis('Country')
        .reset_index(name='Job Count')
    )
    plot_bar(
        data=country_counts,
        x='Country',
        y='Job Count',
        title='Top Countries by Number of jobs',
        xlabel='Country',
        ylabel='Number of Jobs'
    )

# 3. Number of Jobs vs States (with divisions of cities)
def plot_stacked_city_state(df, top_n=10):
    """Stacked bar chart of job counts per state, split by city.

    Only the top_n states by number of rows are shown; each bar is one state
    and each stacked segment is one city within it.

    Parameters:
    - df: DataFrame with 'State' and 'City' columns, one row per job.
    - top_n: Number of states (by job count) to include.

    Prints a message and returns without plotting when no row has both a
    state and a city, because an empty table cannot be plotted.
    """
    # Top N states with highest job counts
    top_states = df['State'].value_counts().nlargest(top_n).index

    # Rows with a missing State or City would be dropped by groupby anyway;
    # dropping them up front lets the empty case be detected before plotting.
    in_top_states = df[df['State'].isin(top_states)].dropna(subset=['State', 'City'])
    if in_top_states.empty:
        print("No state/city data available for selected filters.")
        return

    # Rows = states, columns = cities, values = job counts (0 where absent).
    city_counts = (
        in_top_states.groupby(['State', 'City'])
        .size()
        .unstack(fill_value=0)
    )

    # Plot
    ax = city_counts.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='tab20')
    ax.set_title("Stacked Bar Chart: City-wise Job Count within Top {} States".format(top_n))
    ax.set_xlabel("State")
    ax.set_ylabel("Number of Jobs")
    # Legend goes outside the axes so it does not cover the bars.
    ax.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

def jobs_vs_city(jobs, top_n_states=5, top_n_cities=10):
    """
    Uses plot_bar to show number of jobs for top N cities in top N states.

    Parameters:
    - jobs: DataFrame with 'State' and 'City' columns, one row per job.
    - top_n_states: Number of states (by job count) to consider.
    - top_n_cities: Number of (city, state) pairs to show.

    Cities are ranked as (city, state) pairs, so two cities that share a name
    but sit in different states are counted separately and at most
    top_n_cities bars are drawn. Prints a message and returns without
    plotting when no row has both a state and a city.
    """
    # Filter top states
    top_states = jobs['State'].value_counts().nlargest(top_n_states).index

    # groupby would discard rows with a missing key anyway; dropping them
    # first makes the "nothing to plot" case explicit.
    in_top_states = jobs[jobs['State'].isin(top_states)].dropna(subset=['State', 'City'])
    if in_top_states.empty:
        print("No state/city data available for selected filters.")
        return

    city_counts = (
        in_top_states.groupby(['State', 'City'])
        .size()
        .reset_index(name='Job Count')
        # nlargest both selects the top pairs and sorts them descending.
        .nlargest(top_n_cities, 'Job Count')
        # Combined label for clarity (e.g., "City (State)"); assign() builds a
        # new frame instead of writing a column onto a filtered slice, and the
        # str casts keep the concatenation valid for non-string columns.
        .assign(
            City_State=lambda pairs: (
                pairs['City'].astype(str) + " (" + pairs['State'].astype(str) + ")"
            )
        )
    )

    # Use helper function
    plot_bar(
        data=city_counts,
        x='City_State',
        y='Job Count',
        title=f"Top {top_n_cities} Cities in Top {top_n_states} States by Job Count",
        xlabel="City (State)",
        ylabel="Number of Jobs",
        rotation=90
    )

def show_industy_percentage(df, selected_industry, industry_col='Industry'):
    """Pie chart of one industry's share of the rows in df versus all others.

    Parameters:
    - df: DataFrame with one row per job.
    - selected_industry: Value of industry_col to single out.
    - industry_col: Name of the column holding the industry.
    """
    matches = df[industry_col] == selected_industry
    selected_count = int(matches.sum())
    other_count = len(df) - selected_count

    pie_data = pd.DataFrame({
        'Category': [selected_industry, 'Others'],
        'Count': [selected_count, other_count]
    })

    # Fixed two-colour scheme: selected industry first, everything else second.
    slice_colors = ['#ff9999', '#66b3ff']
    plot_pie(
        data=pie_data,
        labels_col='Category',
        values_col='Count',
        title=f"Share of '{selected_industry}' Industry",
        colors=slice_colors
    )


The function that applies the user's filter selections and calls the plotting functions

In [51]:
def plot_analytics(top_n, country, state, city, industry, exp_category, exp):
    """Filter the module-level ``jobs`` frame by the selections and draw all charts.

    Parameters:
    - top_n: Number of top entries the ranked charts show; must be >= 1.
    - country, state, city, industry: Selected value, or 'All' to skip that filter.
    - exp_category: Selected 'Experience Category', or 'All' to skip the
      experience filter entirely.
    - exp: Upper bound on 'Experience (In Years)'; only applied together with
      exp_category.

    Prints a message and returns without plotting when top_n is below 1 or
    when the filters leave no rows.
    """
    if top_n < 1:
        print("Top N must be at least 1.")
        return

    df = jobs
    selections = {
        'Country': country,
        'State': state,
        'City': city,
        'Industry': industry,
    }
    for column, choice in selections.items():
        if choice != 'All':
            # The location dropdowns offer str-cast values, so compare as str
            # to keep the match working even if a column is not string-typed.
            df = df[df[column].astype(str) == str(choice)]

    if exp_category != 'All':
        # Each comparison needs its own parentheses: `&` binds tighter than
        # `==` / `<=`, so without them the mask expression raises instead of
        # filtering. The bound is an upper limit to match the slider's
        # 'Experience ≤' label.
        df = df[
            (df['Experience Category'] == exp_category)
            & (df['Experience (In Years)'] <= exp)
        ]

    if df.empty:
        print("No data available for selected filters.")
        return

    # Jobs vs Industry
    if industry == 'All':
        jobs_vs_industry(df, top_n)
    else:
        # df now holds only the selected industry, so its share is computed
        # against the full, unfiltered jobs frame (other filters are therefore
        # not reflected in this pie chart).
        show_industy_percentage(jobs, industry)

    # Jobs vs City
    jobs_vs_city(df, top_n_cities=top_n)
    # Jobs vs Country
    jobs_vs_country(df, top_n)
    # Jobs vs States
    plot_stacked_city_state(df, top_n)

The main cell: run it to display the filter controls and the visualizations.

In [53]:
# INTERACTIVE OUTPUT ------------------
# Control panel layout: each inner list becomes one horizontal row of widgets.
filter_rows = [
    [top_n_input],
    [industry_dd, company_dd],
    [city_dd, country_dd, state_dd],
    [exp_cat_dd, experience_slider],
]
ui = widgets.VBox([widgets.HBox(row) for row in filter_rows])

# Keyword argument of plot_analytics -> widget that supplies its value.
# NOTE(review): company_dd is shown in the panel but is not passed to
# plot_analytics, so changing it has no effect on the charts.
controls = {
    'top_n': top_n_input,
    'country': country_dd,
    'state': state_dd,
    'city': city_dd,
    'industry': industry_dd,
    'exp_category': exp_cat_dd,
    'exp': experience_slider,
}
out = widgets.interactive_output(plot_analytics, controls)

display(ui, out)

VBox(children=(HBox(children=(IntText(value=15, description='Top N:', layout=Layout(width='30%')),)), HBox(chi…

Output()