In [93]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Loading Data
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

# Apply() - Calculate Project Salary Next Year

Example 1
Calculate projected salaries next year, using an assumed rate of 3.0% for all roles.

In [94]:
df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [95]:
df_salary = df[pd.notna(df['salary_year_avg'])]

def projected_salary(salary):
    return salary * 1.03

df_salary['salary_year_avg'].apply(projected_salary)

28        112785.00
77        144200.00
92        123600.00
100       235068.66
109        91670.00
            ...    
785624    143392.48
785641    154500.00
785648    228531.25
785682    162225.00
785692    162225.00
Name: salary_year_avg, Length: 22003, dtype: float64

compare to the original

In [96]:
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary):
    return salary * 1.03

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


writing it with an anonymus function

Example 2
Calculate projected salaries next year, but:

* For senior roles (e.g., Senior Data Analysts), assume the rate is 5%
* For all other roles, assume rate is 3%

In [97]:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


Example 3

Convert the job_skills from a generic object to an actual list object (hint this is very important for later). Let's try doing that by just using ast.literal_eval and then look at our new column.

In [98]:
type(df['job_skills'][1])

str

Let's look at the literal_eval() function from the Python Standard Library ast module

In [99]:
import ast

ast.literal_eval(df['job_skills'][1])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [100]:
type(ast.literal_eval(df['job_skills'][1]))

list

clean up

In [101]:
def clean_list(skill_list):
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)

df['job_skills'] = df['job_skills'].apply(clean_list)

In [102]:
df['job_skills'][1]

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [103]:
type(df['job_skills'][1])

list

we get a lost back

make it a lambda function 

In [106]:
#df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literl_eval(skill_list)) if pd.notna(skill_list) else (skill_list)

were assume for senior roles they have an inflation rate of 5%

# Calculate Projected salary nect year

* Senior roles assume 5%
* Other roles assume 3%

In [107]:
df_salary['sa;ary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [111]:
def projected_salary(row):
    if "Sennior" in row['job_title_short']:
        return 1.05 * row['salary_year_avg']
    else:
        return 1.03 * row['salary_year_avg']

df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


Lambda function

In [115]:
df_salary['salary_year_inflated'] = df_salary.apply(lambda row: 1.05 * row['salary_year_avg'] if row['job_title_short'] else 1.03 * row['salary_year_avg'], axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,114975.00
77,Data Engineer,140000.0,147000.00
92,Data Engineer,120000.0,126000.00
100,Data Scientist,228222.0,239633.10
109,Data Analyst,89000.0,93450.00
...,...,...,...
785624,Data Engineer,139216.0,146176.80
785641,Data Engineer,150000.0,157500.00
785648,Data Scientist,221875.0,232968.75
785682,Data Scientist,157500.0,165375.00
