In [1]:
import os
import MySQLdb
from dotenv import load_dotenv
import pandas as pd
load_dotenv()
import plotnine

def _open_connection(read_setting):
    """Open a MySQL connection using settings obtained through ``read_setting``.

    Parameters
    ----------
    read_setting : callable
        Called with each of the environment-variable names ``DATABASE_HOST``,
        ``DATABASE_USERNAME``, ``DATABASE_PASSWORD`` and ``DATABASE``; its
        return value is passed straight to ``MySQLdb.connect``.

    Returns
    -------
    The connection object returned by ``MySQLdb.connect`` (autocommit enabled,
    TLS configured with the system CA bundle path).
    """
    return MySQLdb.connect(
        host=read_setting("DATABASE_HOST"),
        user=read_setting("DATABASE_USERNAME"),
        passwd=read_setting("DATABASE_PASSWORD"),
        db=read_setting("DATABASE"),
        autocommit=True,
        ssl={"ca": "/etc/ssl/certs/ca-certificates.crt"},
    )


def database_query(sql_query):
    """Run ``sql_query`` against the configured MySQL database and return all rows.

    Connection settings are read from the environment variables
    ``DATABASE_HOST``, ``DATABASE_USERNAME``, ``DATABASE_PASSWORD`` and
    ``DATABASE``. The first attempt reads them with ``os.getenv`` (missing
    variables become ``None``); if that attempt fails for any reason a second
    attempt reads them with ``os.environ[...]``, which raises ``KeyError`` for
    a missing variable.

    Parameters
    ----------
    sql_query : str
        SQL statement passed unchanged to ``cursor.execute``.

    Returns
    -------
    The value of ``cursor.fetchall()`` on success, or ``None`` if a
    ``MySQLdb.Error`` was raised while creating the cursor, executing the
    statement or fetching the rows (the error is printed, not re-raised).

    Raises
    ------
    Any exception raised by the second connection attempt propagates to the
    caller.
    """
    try:
        connection = _open_connection(os.getenv)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the retry.
        connection = _open_connection(os.environ.__getitem__)

    # Initialised up front so the ``finally`` clause never touches an unbound
    # name when ``connection.cursor()`` itself fails.
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(sql_query)
        return cursor.fetchall()
    except MySQLdb.Error as e:
        print("MySQL Error:", e)
        return None
    finally:
        if cursor is not None:
            cursor.close()
        connection.close()


# Pull every scraped job listing; the select order must match LISTING_COLUMNS.
sql_query = '''

select
    title
    ,department
    ,location
    ,salary
    ,closing_date
    ,uid
    ,scraped_date
from 
    all_time_listings

'''

# Column labels for the result rows, in the same order as the select list above.
LISTING_COLUMNS = ['title', 'department', 'location', 'salary', 'closing_date', 'uid', 'scraped_date']

listing_rows = database_query(sql_query)

# Passing the labels to the constructor (instead of assigning ``df.columns``
# afterwards) keeps the expected schema even when the query yields no rows or
# ``database_query`` returned ``None``; assigning seven labels to a frame with
# zero columns would raise a length-mismatch ValueError.
df = pd.DataFrame(list(listing_rows or []), columns=LISTING_COLUMNS)
df.head()

Unnamed: 0,title,department,location,salary,closing_date,uid,scraped_date
0,Senior Data Analyst - Corporate Performance,Companies House,"Cardiff, Wales, CF14 3UZ",46588,2024-03-05,153376,2024-02-21
1,Kitchen Steward,House of Commons,Westminster,24959,2024-05-19,200324,2024-01-04
2,National Professional Advisor (for People with...,Care Quality Commission,"East Midlands (England), East of England, Lond...",70000,2024-02-26,24711,2024-02-13
3,Commis Chef,House of Commons,Westminster,24959,2024-05-19,255755,2024-01-04
4,Recreation Works Supervisor,Forestry Commission,"Burley Office, Burley, Hampshire BH24 4HS",26534,2024-01-14,257501,2024-01-04


In [30]:
from plotnine import ggplot, scale_fill_gradient, scale_color_continuous, scale_color_gradient, aes, geom_point, geom_col, geom_line, geom_histogram, geom_boxplot, facet_wrap, theme, element_text, element_blank, element_rect, element_line, labs, scale_x_continuous, scale_y_continuous, scale_fill_manual, scale_color_manual, scale_linetype_manual, scale_shape_manual, scale_size_manual, scale_alpha_manual, coord_flip, coord_cartesian, coord_fixed

Plot showing median salaries by department

In [67]:
# --- Derived columns -------------------------------------------------------
# Salaries arrive as strings; strip thousands separators and convert to float.
df['salary_int'] = df['salary'].str.replace(',', '').astype(float)
df['scraped_date_date'] = pd.to_datetime(df['scraped_date'])
# Start timestamp of the weekly period that contains each scrape date.
df['week_commencing'] = df['scraped_date_date'].dt.to_period('W').dt.start_time

# --- Mean salary per week / department / location --------------------------
# ``salary_int`` must be the aggregated value, not a grouping key: grouping by
# it as well leaves nothing for ``mean()`` to average. The final selection
# keeps the column order week_commencing, salary_int, department, location.
daily_salary = (
    df.groupby(['week_commencing', 'department', 'location'], as_index=False)['salary_int']
    .mean()
    [['week_commencing', 'salary_int', 'department', 'location']]
)

# --- Median salary and posting count per department ------------------------
department_salary = (
    df.groupby('department')
    .agg({'salary_int': 'median', 'uid': 'count'})
    .reset_index()
    .sort_values('salary_int', ascending=False)
)
department_salary.columns = ['department', 'salary_int', 'Number of Postings']

# --- Plot ------------------------------------------------------------------
# NOTE(review): departments with fewer than 10 or more than 100 postings fall
# outside ``limits``; depending on the scale's out-of-bounds handling they may
# be drawn in the NA colour instead of the end colours that the '<10' / '>100'
# labels suggest — confirm this is the intended rendering.
plot = (
    ggplot(
        department_salary,
        aes(
            x="reorder(department, salary_int, ascending=False)",
            y='salary_int',
            color='Number of Postings',
            fill='Number of Postings',
        ),
    )
    + geom_col()
    + scale_color_gradient(limits=[10, 100], labels=['<10', '>100'], breaks=[10, 100])
    + scale_fill_gradient(limits=[10, 100], labels=['<10', '>100'], breaks=[10, 100])
    + labs(title=f"Civil Services Departments Median Salary Comparison: {df['scraped_date_date'].dt.date.min()} to {df['scraped_date_date'].dt.date.max()}", x='Department', y='Median Salary (£)')
    + theme(axis_text_x=element_text(angle=90, face='bold'))
    + theme(axis_text_x=element_text(size=6))
    + theme(axis_text_y=element_text(size=6))
    + theme(axis_title=element_text(size=6))
    + theme(plot_title=element_text(size=20, face='bold'))
)

plot.save("department_salary.png", width=20, height=10, units='in', dpi=300)




Unnamed: 0,department,salary_int,Number of Postings
68,HM Prison & Probation Service,25752.0,1383
100,Ministry of Defence,35290.0,854
148,UK Health Security Agency,38724.0,408
75,Home Office,41600.0,356
101,Ministry of Justice,39868.0,314
36,Department for Work and Pensions,40201.0,277
65,HM Courts and Tribunals Service,27223.0,235
28,Department for Business and Trade,39384.0,233
7,Cabinet Office,40850.0,226
35,Department for Transport,40808.0,223


TODO: See whether the number of posts can also be pulled out of the data, to improve the salary averages.
TODO: Compare the highest salaries with behaviours to establish the highest-value behaviours.