## Most in-demand skills for the top 3 most popular data roles

In [4]:
# Importing Libraries
import ast
import pandas as pd
import seaborn as sns
from datasets import load_dataset
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go 
from plotly.subplots import make_subplots

# Loading Data
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_skills'] = df['job_skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else(x))

In [5]:
df_Germany = df[df['job_country'] == 'Germany']

In [6]:
df_skills = df_Germany.explode('job_skills')

df_skills[['job_title', 'job_skills']]

Unnamed: 0,job_title,job_skills
2,"Data Engineer/Scientist/Analyst, Mid or Senior...",python
2,"Data Engineer/Scientist/Analyst, Mid or Senior...",sql
2,"Data Engineer/Scientist/Analyst, Mid or Senior...",c#
2,"Data Engineer/Scientist/Analyst, Mid or Senior...",azure
2,"Data Engineer/Scientist/Analyst, Mid or Senior...",airflow
...,...,...
785735,Senior Data Engineer,docker
785737,CRM Data Analyst,sas
785737,CRM Data Analyst,sas
785737,CRM Data Analyst,sql


In [14]:
df_skills_count = df_skills.groupby(['job_skills', 'job_title_short']).size()
df_skills_count
type(df_skills_count)

pandas.core.series.Series

In [16]:
df_skills_count.index.name = 'job_skills'
df_skills_count = df_skills_count.reset_index(name='skill_count')

In [18]:
df_skills_count.sort_values(by='skill_count', ascending=False, inplace=True)
df_skills_count

Unnamed: 0,index,job_skills,job_title_short,skill_count
907,907,python,Data Scientist,4157
906,906,python,Data Engineer,3524
1139,1139,sql,Data Engineer,3145
1138,1138,sql,Data Analyst,2947
905,905,python,Data Analyst,2309
...,...,...,...,...
254,254,django,Business Analyst,1
255,255,django,Cloud Engineer,1
256,256,django,Data Analyst,1
1127,1127,spring,Senior Data Scientist,1


In [20]:
job_titles = df_skills_count['job_title_short'].unique().tolist()
job_titles = sorted(job_titles[:3])
job_titles

['Data Analyst', 'Data Engineer', 'Data Scientist']

In [43]:
fig = make_subplots(
    rows=len(job_titles),
    cols=1,
    subplot_titles=job_titles,
    vertical_spacing=0.1
)

for i, job_title in enumerate(job_titles, 1):
    df_plot = df_skills_count[df_skills_count['job_title_short'] == job_title].head(5)

    fig.add_trace(
        go.Bar(
            y=df_plot['job_skills'],
            x=df_plot['skill_count'],
            orientation='h',
            name=job_title,
            text=df_plot['skill_count'],
            textposition='auto',
            marker_color=f'rgb({i*50}, {i*30}, {150})'
        ),
        row=i, col=1
    )

    fig.update_yaxes(autorange='reversed', row=i, col=1)

    fig.update_yaxes(title_text="", row=i, col=1)

    fig.update_traces(showlegend=False)

fig.update_layout(
    height=300 * len(job_titles),
    title_text = "Top Skills in Job Postings",
    title_font_size=20,
    showlegend=False,
    barmode='group'
)

fig.show()

In [46]:
df_job_title_count = df_Germany['job_title_short'].value_counts().reset_index(name='jobs_total')

In [57]:
df_skills_perc = pd.merge(df_skills_count, df_job_title_count, how='left', on='job_title_short')
df_skills_perc['skill_percent'] = (100 * df_skills_perc['skill_count'] / df_skills_perc['jobs_total']).round(2)
df_skills_perc

Unnamed: 0,index,job_skills,job_title_short,skill_count,jobs_total,skill_percent
0,907,python,Data Scientist,4157,6745,61.63
1,906,python,Data Engineer,3524,6675,52.79
2,1139,sql,Data Engineer,3145,6675,47.12
3,1138,sql,Data Analyst,2947,7131,41.33
4,905,python,Data Analyst,2309,7131,32.38
...,...,...,...,...,...,...
1377,254,django,Business Analyst,1,817,0.12
1378,255,django,Cloud Engineer,1,189,0.53
1379,256,django,Data Analyst,1,7131,0.01
1380,1127,spring,Senior Data Scientist,1,1737,0.06


In [75]:
fig = make_subplots(
    rows=len(job_titles),
    cols=1,
    subplot_titles=job_titles,
    vertical_spacing=0.1,
    shared_xaxes=True
)

for i, job_title in enumerate(job_titles, 1):
    df_plot = df_skills_perc[df_skills_perc['job_title_short'] == job_title].head(5)

    fig.add_trace(
        go.Bar(
            y=df_plot['job_skills'],
            x=df_plot['skill_percent'],
            orientation='h',
            name=job_title,
            text=df_plot['skill_percent'].apply(lambda x: f"{x:.1f}%"),
            textposition='auto',
            marker_color=f'rgb({i*50}, {i*30}, {150})',
            hovertemplate='<b>%{y}</b><br>Percentage: %{x:.1f}%<extra></extra>'
        ),
        row=i, col=1
    )

    fig.update_yaxes(autorange='reversed', row=i, col=1)
    fig.update_yaxes(title_text="", row=i, col=1)
    fig.update_xaxes(
        ticksuffix='%', 
        row=i, col=1
    )

fig.update_layout(
    height=300 * len(job_titles),
    title_text="Likelihood of Skills Requested in Germany Job Postings",
    title_font_size=20,
    showlegend=False,
    barmode='group'
)

fig.show()