# apply() - Calculate Projected Salary Next Year

.apply() is a function used to apply another function to elements in a pandas DataFrame or Series.

Think of it like this:

“Hey pandas, take this function and use it on every row or every column (or even every single value).”

In [24]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Parse the job_posted_date column with pd.to_datetime.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [25]:
# Keep only the rows whose salary_year_avg is not NaN, then select that column to display its values.
df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [26]:
# Goal: estimate next year's projected salary by applying the current inflation rate (around 3%).
# Start by reading the built-in documentation for DataFrame.apply.

help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.

    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:

        

In [27]:
# Work on a copy of the rows that have a salary so the original df is not changed.
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary, growth_factor=1.03):
    """Return the salary projected for next year.

    Parameters
    ----------
    salary : float
        Current yearly salary.
    growth_factor : float, default 1.03
        Multiplier applied to ``salary``. The default corresponds to the
        roughly 3% inflation assumed in this notebook.

    Returns
    -------
    float
        ``salary * growth_factor``.
    """
    return salary * growth_factor

# Apply to df_salary rather than df: df still contains rows whose salary is NaN.
df_salary['salary_year_avg'].apply(projected_salary)


# To check that every value really changed, store the result in a new column and compare it with the original.

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

# Selecting two columns requires a list of column names, hence the double brackets.
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [28]:
# Instead of defining a named function over several lines, pass a lambda to apply():
# it takes each salary and returns salary * 1.03, giving the same result.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03) 

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [29]:
# For simple arithmetic apply() is not needed: multiplying the column directly gives the same result.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03


df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [30]:
# The values in job_skills are str, not list: each one is text that only looks like a list.
# It has to be converted to a real list before the individual skills can be used.
type(df['job_skills'][1])

df['job_skills'][1]

"['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']"

In [31]:
# list() on a str splits it into single characters, so it does not recover the list of skills.
list(df['job_skills'][1])

['[',
 "'",
 'r',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'y',
 't',
 'h',
 'o',
 'n',
 "'",
 ',',
 ' ',
 "'",
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'n',
 'o',
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'o',
 'w',
 'e',
 'r',
 ' ',
 'b',
 'i',
 "'",
 ',',
 ' ',
 "'",
 't',
 'a',
 'b',
 'l',
 'e',
 'a',
 'u',
 "'",
 ']']

In [32]:
# Because list() does not work here, use ast.literal_eval, which parses the string as a Python literal
# and returns the list it describes.
# ast is part of the Python standard library, but it still has to be imported first.

import ast

ast.literal_eval(df['job_skills'][1])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [33]:
type(ast.literal_eval(df['job_skills'][1]))  # confirm that the parsed value is a list

list

In [34]:
def clean_list(skill_list):
    """Convert a stringified list of skills into a real Python list.

    Parameters
    ----------
    skill_list : str, list or missing value
        Text such as ``"['r', 'python']"``. Some job postings list no
        skills at all, so the value may also be missing (None/NaN).

    Returns
    -------
    list or None
        The parsed list, the input itself if it is already a list, or
        None when the value is missing.
    """
    # Already converted (e.g. the cell was run twice): return it unchanged.
    # pd.notna() on a list returns an array, which cannot be used in `if`.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    # Missing value: nothing to parse.
    return None

# Pass the function itself (not a single value) to apply(); clean_list is then called for each value in the column.
df['job_skills'] = df['job_skills'].apply(clean_list)


In [35]:
type(df['job_skills'][1])  # confirm the value is now a list instead of a str

list

In [45]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import ast

# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Parse the job_posted_date column with pd.to_datetime.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [46]:
# Another method is to use a lambda. The data was loaded again first, presumably because
# job_skills had already been converted in the earlier cell and could not be converted a second time.
# Values that are not missing are parsed with ast.literal_eval; missing values are returned unchanged.

df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literal_eval(skill_list) if pd.notna(skill_list) else skill_list)

In [47]:
type(df['job_skills'][1])  # confirm the value is now a list instead of a str

list

# Calculate projected salary next year

* Senior roles assume 5%
* Other roles assume 3%

In [50]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import ast

# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Parse the job_posted_date column with pd.to_datetime.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [51]:
# Keep only the postings that report a yearly salary; copy() leaves the original df unchanged.
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

# Project every salary with a flat 3% increase.
# Apply to df_salary (not df) so the lambda runs only on rows that have a salary,
# instead of running on every row of df and relying on index alignment to drop the rest.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [53]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import ast

# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Parse the job_posted_date column with pd.to_datetime.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [58]:
# The projected raise now depends on the role, so the calculation needs more than the salary value:
# define a function that receives a whole row, then use apply() to run it on every row.

df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(row):
    """Return next year's projected salary for a single job posting.

    ``row`` is indexed by column name and must provide 'job_title_short'
    and 'salary_year_avg'. Titles containing "Senior" get a 5% increase;
    every other title gets 3%.
    """
    is_senior = "Senior" in row['job_title_short']
    growth_factor = 1.05 if is_senior else 1.03
    return growth_factor * row['salary_year_avg']
    

# The function needs two columns, so apply it to the whole DataFrame instead of a single column.
# axis=1 passes each row (a Series indexed by the column names) to projected_salary;
# the default axis=0 would pass each column instead.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


Using a lambda, we can write the same logic in a single line:

In [None]:
# Row-wise lambda (axis=1): 5% increase when job_title_short contains "Senior", 3% otherwise.
df_salary['salary_year_inflated'] = df_salary.apply(lambda row: 1.05 * row['salary_year_avg'] if "Senior" in row['job_title_short'] else 1.03 * row['salary_year_avg'], axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
