In [None]:
import os
import MySQLdb
from dotenv import load_dotenv
import pandas as pd
load_dotenv()

def _connection_settings(lookup):
    """Build the keyword arguments for ``MySQLdb.connect``.

    ``lookup`` is called with each environment-variable name and must return
    its value (for example ``os.getenv`` or ``os.environ.__getitem__``).
    """
    return {
        "host": lookup("DATABASE_HOST"),
        "user": lookup("DATABASE_USERNAME"),
        "passwd": lookup("DATABASE_PASSWORD"),
        "db": lookup("DATABASE"),
        "autocommit": True,
        "ssl": {"ca": "/etc/ssl/certs/ca-certificates.crt"},
    }


def database_query(sql_query, params=None):
    """Run ``sql_query`` on a fresh MySQL connection and return every row.

    Connection settings are read from the DATABASE_HOST, DATABASE_USERNAME,
    DATABASE_PASSWORD and DATABASE environment variables.

    Parameters
    ----------
    sql_query : str
        The SQL statement to execute.
    params : sequence or mapping, optional
        Values bound to placeholders in ``sql_query`` by the driver. Prefer
        this over formatting values into the SQL string.

    Returns
    -------
    The result of ``cursor.fetchall()``, or ``None`` when a ``MySQLdb.Error``
    is raised while creating the cursor, executing, or fetching (the error is
    printed in that case).

    Raises
    ------
    KeyError
        If the first connection attempt fails and a required environment
        variable is missing on the second attempt.
    """
    try:
        connection = MySQLdb.connect(**_connection_settings(os.getenv))
    except Exception:
        # Second attempt reads os.environ directly, so a missing variable
        # surfaces as a KeyError naming it. Catching Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        connection = MySQLdb.connect(**_connection_settings(os.environ.__getitem__))

    # Initialised up front so the cleanup below never touches an unbound name
    # when connection.cursor() itself fails.
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(sql_query, params)
        return cursor.fetchall()
    except MySQLdb.Error as e:
        print("MySQL Error:", e)
        return None
    finally:
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Always release the connection, even if closing the cursor fails.
            connection.close()


# Query pulling the listing fields used throughout the notebook.
sql_query = '''

select
    title
    ,department
    ,location
    ,salary
    ,closing_date
    ,uid
    ,scraped_date
from 
    all_time_listings

'''

# Column labels, in the same order as the select list above.
listing_columns = ['title', 'department', 'location', 'salary', 'closing_date', 'uid', 'scraped_date']

listing_rows = database_query(sql_query)
if listing_rows is None:
    # database_query prints the MySQL error and returns None when the query
    # fails; stop here with a clear message instead of a "Length mismatch"
    # error from relabelling an empty frame.
    raise RuntimeError("Query against all_time_listings failed; see the MySQL error printed above.")

# Passing columns= to the constructor also yields a correctly labelled (empty)
# frame when the table has no rows.
df = pd.DataFrame(listing_rows, columns=listing_columns)
df.head()

In [None]:
from plotnine import ggplot, scale_fill_gradient, scale_color_continuous, scale_color_gradient, aes, geom_point, geom_col, geom_line, geom_histogram, geom_boxplot, facet_wrap, theme, element_text, element_blank, element_rect, element_line, labs, scale_x_continuous, scale_y_continuous, scale_fill_manual, scale_color_manual, scale_linetype_manual, scale_shape_manual, scale_size_manual, scale_alpha_manual, coord_flip, coord_cartesian, coord_fixed

Plot showing median salaries by department

In [None]:
# Remove the commas from the salary strings and convert them to floats.
df['salary_int'] = df['salary'].str.replace(',', '', regex=False).astype(float)

# Parse the scrape timestamp and derive the start of the week it falls in.
df['scraped_date_date'] = pd.to_datetime(df['scraped_date'])
df['week_commencing'] = df['scraped_date_date'].dt.to_period('W').dt.start_time

# Mean salary per week / department / location. salary_int must not be one of
# the grouping keys, otherwise there is no column left to average.
# (The name is kept for compatibility; the rows are weekly, not daily.)
daily_salary = (
    df.groupby(['week_commencing', 'department', 'location'])['salary_int']
    .mean()
    .reset_index()
)

# Median salary and number of postings per department, highest median first.
department_salary = (
    df.groupby('department')
    .agg({'salary_int': 'median', 'uid': 'count'})
    .reset_index()
    .sort_values('salary_int', ascending=False)
)
department_salary.columns = ['department', 'salary_int', 'Number of Postings']

# Date range shown in the plot title.
first_scrape = df['scraped_date_date'].dt.date.min()
last_scrape = df['scraped_date_date'].dt.date.max()

plot = (
    ggplot(
        department_salary,
        aes(
            x="reorder(department, salary_int, ascending=False)",
            y='salary_int',
            color='Number of Postings',
            fill='Number of Postings',
        ),
    )
    + geom_col()
    # NOTE(review): the '<10' / '>100' labels suggest counts outside 10-100
    # should be clamped to the ends of the gradient. With only `limits` set,
    # out-of-range values may instead be drawn in the NA colour - confirm
    # against the plotnine scale documentation (`oob` argument).
    + scale_color_gradient(limits=[10, 100], labels=['<10', '>100'], breaks=[10, 100])
    + scale_fill_gradient(limits=[10, 100], labels=['<10', '>100'], breaks=[10, 100])
    + labs(
        title=f"Civil Service Departments Median Salary Comparison: {first_scrape} to {last_scrape}",
        x='Department',
        y='Median Salary (£)',
        caption='Source: Civil Service Jobs (https://www.civilservicejobs.service.gov.uk/)',
    )
    # One theme() call so every axis_text_x setting is stated in one place.
    + theme(
        axis_text_x=element_text(angle=90, face='bold', size=6),
        axis_text_y=element_text(size=6),
        plot_caption=element_text(size=6, face='italic'),
        plot_title=element_text(size=20, face='bold'),
    )
)

plot.save("department_salary.png", width=20, height=10, units='in', dpi=300)


In [None]:
## Department median salary QA
#
# Recompute each department's median salary from the posting-level frame `df`
# and compare it with the value stored in `department_salary`.

pass_qa = True
number_of_departments = 0


def qa_check(df, department, postings=None, tolerance=1e-9):
    """Check one department's reported median against a recomputed median.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated frame with one row per department; its ``salary_int``
        column holds the reported median.
    department : str
        Department to check.
    postings : pandas.DataFrame, optional
        Posting-level frame with ``department`` and ``salary_int`` columns
        from which the median is recomputed. When omitted, ``df`` itself is
        used, which only verifies that the aggregated frame is
        self-consistent.
    tolerance : float, optional
        Largest absolute difference still treated as a match.

    Returns
    -------
    bool
        True when the department passes. As a side effect the module-level
        ``pass_qa`` is set to False on a failure and
        ``number_of_departments`` is incremented on a pass.
    """
    global pass_qa
    global number_of_departments

    source = df if postings is None else postings
    reported = df.loc[df['department'] == department, 'salary_int']
    recomputed_median = source.loc[source['department'] == department, 'salary_int'].median()

    if reported.empty:
        # The department is missing from the aggregated frame.
        consistent = False
    else:
        reported_median = reported.iloc[0]
        both_missing = pd.isna(reported_median) and pd.isna(recomputed_median)
        # Compare in both directions; a NaN on one side only fails the
        # comparison and is therefore flagged.
        consistent = bool(both_missing or abs(reported_median - recomputed_median) <= tolerance)

    if consistent:
        number_of_departments += 1
    else:
        print(f"Variation in QA check for {department}")
        pass_qa = False
    return consistent


# groupby drops missing department keys, so skip them here as well.
for department_name in df['department'].dropna().unique():
    qa_check(department_salary, department_name, postings=df)
if pass_qa:
    print("QA checks passed. Number of departments checked: ", number_of_departments)
else:
    print("QA checks failed")

See if we can pull number of posts out of the data as well to improve salary averages?
Compare the highest salaries with behaviours to establish the highest-value behaviours

In [None]:
# Load every row and column of the all_time_ad_qualities table; the columns
# arrive unlabelled and in whatever order the table returns them.
application_df = pd.DataFrame(
    data=database_query('select * from all_time_ad_qualities'),
)

In [None]:
# Label the ad-quality columns by position. The grouping below is only for
# readability - presumably behaviour flags followed by application-stage flags;
# confirm against the table schema.
application_df.columns = (
    ['uid']
    + [
        'developing_self_and_others',
        'leadership',
        'making_effective_decisions',
        'seeing_the_big_picture',
        'managing_a_quality_service',
        'working_together',
        'communicating_and_influencing',
        'changing_and_improving',
        'delivering_at_pace',
    ]
    + [
        'apply_at_advertisers_site',
        'cv',
        'personal_statement',
        'reference_request',
        'application_form',
        'cover_letter',
        'presentation',
        'interview',
        'portfolio',
        'test',
    ]
)

In [None]:
# Cast the join key to int in both frames so the merge compares like with like.
for frame in (df, application_df):
    frame['uid'] = frame['uid'].astype(int)

# Keep only the listings that also have an ad-qualities row.
combined_df = df.merge(application_df, how='inner', on='uid')
combined_df.head()