In [3]:
# read pickle file

import pickle
import pandas as pd
import polars as pl
# Deserialize the salary guide; the relative path is resolved against the
# notebook's current working directory.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open a file that comes from a trusted source.
with open("../data/salary_guide.pkl", mode='rb') as f:
    data = pickle.load(f)

# Wrap the unpickled object in a DataFrame for the analysis below
df = pd.DataFrame(data=data)

Collecting polars
  Downloading polars-0.16.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: polars
Successfully installed polars-0.16.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


FileNotFoundError: [Errno 2] No such file or directory: '../data/salary_guide.pkl'

In [None]:
# Display the DataFrame built from the unpickled salary guide
df

In [None]:
# Each level column holds a "<low>-<high>" string; split it on '-' and store the
# two halves as float columns named <level>_min and <level>_max.
for _level in ('entry', 'mid', 'senior'):
    _bounds = df[_level].str.split('-', expand=True).astype(float)
    df[[f'{_level}_min', f'{_level}_max']] = _bounds

# Row-wise maximum of the three upper bounds
df['max_salary'] = df[['entry_max', 'mid_max', 'senior_max']].max(axis=1)

# Row-wise mean of the lower and upper bound, one column per level
for _level in ('entry', 'mid', 'senior'):
    df[f'{_level}_mean'] = df[[f'{_level}_min', f'{_level}_max']].mean(axis=1)

# Order the rows from the highest max_salary down
df_sorted = df.sort_values(by='max_salary', ascending=False)

# The first row of the sorted frame belongs to the top-paying job
job_highest_paid = df_sorted['job'].iloc[0]

print(f"The job that pays the most is {job_highest_paid}")

In [None]:
# Preview the top of the table sorted by max_salary
df_sorted.head()

In [None]:
# Longer preview: the first 100 rows of the table sorted by max_salary
df_sorted.head(100)

### Get the engineering jobs that pay the most

In [None]:

# Keep the rows of df_sorted whose job title contains "engineer" (any case);
# the row order of df_sorted is preserved by the filter.
# na=False makes a missing job title count as "no match"; without it the mask
# would contain NaN and pandas refuses to index with such a mask.
top_engineers_jobs = df_sorted[
    df_sorted['job'].str.contains('engineer', case=False, na=False)
]
top_engineers_jobs.head(30)

### Get the engineering jobs that pay the least

In [None]:

# Re-order the engineering subset from the lowest max_salary upward
worst_engineers_jobs = top_engineers_jobs.sort_values(by='max_salary', ascending=True)
worst_engineers_jobs.head(30)


### Get the jobs that pay the most in Québec

In [None]:
# Keep the rows of df_sorted whose province contains "québec" (any case);
# the row order of df_sorted is preserved by the filter.
# na=False makes a missing province count as "no match"; without it the mask
# would contain NaN and pandas refuses to index with such a mask.
top_quebec_jobs = df_sorted[
    df_sorted['province'].str.contains('québec', case=False, na=False)
]
top_quebec_jobs.head(30)

In [None]:
# Helper: compare one job across the regions of a single province
def compare_job_province(job, province):
    """Average `mid_mean` per region for one job inside one province.

    Reads the notebook-level `df`. `job` and `province` are matched by exact
    equality against the `job` and `province` columns.

    Returns a DataFrame with the columns `job`, `region` and `mid_mean`,
    ordered from the highest average to the lowest.
    """
    wanted = (df['province'] == province) & (df['job'] == job)
    per_region = (
        df[wanted]
        .groupby(['job', 'region'])['mid_mean']
        .mean()
        .reset_index()
    )
    return per_region.sort_values('mid_mean', ascending=False)


In [None]:
# "ontario", "québec", "british columbia", "alberta", "manitoba", "saskatchewan", "nova scotia", "new brunswick", "newfoundland & labrador", "prince edward island"
# Regional breakdown for one job in one province; the result is displayed below
df_job_province = compare_job_province(job='cloud architect', province='québec')
df_job_province

## Across the country

In [None]:
# Helper: compare one job across all provinces
def compare_job_all_provinces(job):
    """Average `mid_mean` per province for one job.

    Reads the notebook-level `df`. `job` is matched by exact equality against
    the `job` column.

    Returns a DataFrame with the columns `job`, `province` and `mid_mean`,
    ordered from the highest average to the lowest.
    """
    rows_for_job = df[df['job'] == job]
    per_province = (
        rows_for_job
        .groupby(['job', 'province'])['mid_mean']
        .mean()
        .reset_index()
    )
    return per_province.sort_values('mid_mean', ascending=False)


In [None]:
# Province-level comparison for one job; the result is displayed below
df_job_all_provinces = compare_job_all_provinces(job='cloud architect')
df_job_all_provinces

### Detailed breakdown of the salary by province

In [None]:


# Get the unique provinces; rows without a province are dropped because a
# missing value has no .capitalize() and can never match the filter anyway
provinces = df['province'].dropna().unique()

# Define the job to compare
job = 'cloud architect'

# Print one regional breakdown per province.
# The table comes from compare_job_province, which performs the same
# filter -> group by (job, region) -> mean of mid_mean -> descending sort.
# It is stored under its own name so the notebook-level `df_sorted` (the full
# table ordered by max_salary) is not overwritten; cells that read `df_sorted`
# therefore still work when they are re-run after this one.
for province in provinces:
    province_breakdown = compare_job_province(job, province)

    # Print the sorted dataframe with the province name
    print(f'{province.capitalize()}:')
    print(province_breakdown)
    print()

# Part 2 

### Let's find the job that has the biggest difference across the country

In [None]:
# Find the job with the largest salary spread; the level can be given as
# 'entry', 'mid' or 'senior' (or as the column name, e.g. 'entry_mean')
def find_job_with_biggest_diff(level):
    """Print and return the job whose salary spread is largest for `level`.

    The spread of a job is the difference between the largest and the smallest
    value of the chosen `<level>_mean` column over all rows of the
    notebook-level `df` that share that job.

    Parameters
    ----------
    level : str
        'entry_mean', 'mid_mean' or 'senior_mean'. The short forms 'entry',
        'mid' and 'senior' are accepted as aliases.

    Returns
    -------
    The job label with the largest spread (the first one on ties).
    The string 'Error: level must be ...' when `level` is not recognised.
    None when no job has any value in the chosen column.
    """
    valid_levels = ['entry_mean', 'mid_mean', 'senior_mean']

    # Map the short names onto the matching column names
    if f'{level}_mean' in valid_levels:
        level = f'{level}_mean'

    # Unknown level: report it by returning the message, as before
    if level not in valid_levels:
        return 'Error: level must be entry_mean, mid_mean or senior_mean'

    # Per-job spread of the chosen column, computed with the vectorised group
    # max/min instead of a Python-level lambda per group. Jobs without any
    # value in the column yield NaN and are left out of the comparison.
    by_job = df.groupby('job')[level]
    job_diffs = (by_job.max() - by_job.min()).dropna()

    # Nothing to compare (empty table or no usable values)
    if job_diffs.empty:
        print("No salary data available to compare")
        return None

    # Find the job with the largest overall salary difference
    max_diff_job = job_diffs.idxmax()

    # Print the result and also hand it back so callers can reuse it
    print("The job with the largest overall salary difference is: {}".format(max_diff_job))
    return max_diff_job


In [None]:
# Report the job with the largest spread in the entry_mean column
find_job_with_biggest_diff('entry_mean')

In [None]:
# Per job: largest minus smallest mid_mean, then summed over the single
# selected column so each job ends up with one number
job_diffs = (
    df.groupby('job')[["mid_mean"]]
    .apply(lambda group: (group.max() - group.min()).sum())
)

# One-column frame so the spread can be sorted and displayed by name
job_diffs_df = job_diffs.to_frame(name='Difference')

In [None]:
# Rank the jobs from the widest mid_mean spread to the narrowest
job_diffs_df_sorted = job_diffs_df.sort_values(by='Difference', ascending=False)
job_diffs_df_sorted.head(30)