## apply( ) - Calculate Projected Salary Next Year

In [3]:
# Importing Libraries
import pandas as pd 
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split
# into a pandas DataFrame that the rest of the notebook works on.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleaning
# Parse 'job_posted_date' with pd.to_datetime and store the result back
# in the same column.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [4]:
# Yearly salaries for the postings that report one (missing values are filtered out).
df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [5]:
# Non-missing yearly salaries, selected with Series.dropna().
df['salary_year_avg'].dropna()

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [6]:
# Independent copy of the postings that have a known yearly salary, so later
# column assignments do not touch `df`.
df_salary = df.dropna(subset=['salary_year_avg']).copy()

In [7]:
def projected_salary(salary):
    """Return `salary` increased by 3%.

    Args:
        salary: A numeric salary (or anything that supports multiplication
            by a float, such as a pandas Series).

    Returns:
        `salary` multiplied by 1.03.
    """
    growth_factor = 1.03
    return growth_factor * salary

# .apply() calls projected_salary on each value of the column and returns a
# new Series; the result is only displayed here, not stored in df_salary.
df_salary['salary_year_avg'].apply(projected_salary)

28        112785.00
77        144200.00
92        123600.00
100       235068.66
109        91670.00
            ...    
785624    143392.48
785641    154500.00
785648    228531.25
785682    162225.00
785692    162225.00
Name: salary_year_avg, Length: 22003, dtype: float64

In [8]:
# Store the projected salaries in a new 'salary_year_inflated' column.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

In [9]:
# Original and projected salaries side by side.
df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


.apply( ) Method with lambda Function

In [10]:
# The 3% projection written inline as a lambda instead of a named function.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda pay: 1.03 * pay)

df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# A simpler way to create the column of salaries that are 3% higher than the
# originals: multiply the whole column at once, no .apply() needed.
# NOTE(review): the saved output under this cell shows 'salary_year_avg'
# already inflated once (e.g. 112785.00 instead of 109500.0), which suggests
# the column was overwritten in an earlier kernel run; re-run the notebook
# from a fresh kernel to refresh the output.

df_salary['salary_year_inflated'] = 1.03 * df_salary['salary_year_avg']

df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,112785.00,116168.5500
77,144200.00,148526.0000
92,123600.00,127308.0000
100,235068.66,242120.7198
109,91670.00,94420.1000
...,...,...
785624,143392.48,147694.2544
785641,154500.00,159135.0000
785648,228531.25,235387.1875
785682,162225.00,167091.7500


## Valid Use Case for .apply( )

In [23]:
# Raw 'job_skills' value of the row labelled 1.
df.loc[1, 'job_skills']

"['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']"

In [20]:
# Check the Python type of that raw value.
type(df.loc[1, 'job_skills'])

str

In [25]:
import ast

# ast.literal_eval() turns the string representation of a list into a real list.
ast.literal_eval(df.loc[1, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [26]:
# Confirm the parsed value's type.
type(ast.literal_eval(df.loc[1, 'job_skills']))

list

In [None]:
# ast.literal_eval() parses one string at a time, so passing the entire column
# raises a ValueError. That is why the conversion has to go through .apply(),
# which hands the values to the parser one by one.
# The error is caught and printed so the notebook still runs top to bottom.

try:
    df['job_skills'] = ast.literal_eval(df['job_skills'])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: malformed node or string: 0                                                      None
1         ['r', 'python', 'sql', 'nosql', 'power bi', 't...
2         ['python', 'sql', 'c#', 'azure', 'airflow', 'd...
3         ['python', 'c++', 'java', 'matlab', 'aws', 'te...
4         ['bash', 'python', 'oracle', 'aws', 'ansible',...
                                ...                        
785736    ['bash', 'python', 'perl', 'linux', 'unix', 'k...
785737                       ['sas', 'sas', 'sql', 'excel']
785738                              ['powerpoint', 'excel']
785739    ['python', 'go', 'nosql', 'sql', 'mongo', 'she...
785740                                      ['aws', 'flow']
Name: job_skills, Length: 785741, dtype: object

In [29]:
def clean_list(skill_list):
    """Convert one 'job_skills' cell from its string form into a Python list.

    Args:
        skill_list: A string such as "['r', 'python']", a missing value
            (None/NaN), or a list that has already been parsed.

    Returns:
        The parsed list for string input, the list itself when the value is
        already a list, or None for missing values.
    """
    # Already converted (e.g. the cell was re-run): return it untouched.
    # Without this guard pd.notna() returns an array for list input and the
    # `if` below raises a ValueError.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    return None
# Parse every row's skills string into a list; rows without skills become None.
df['job_skills'] = df['job_skills'].apply(clean_list)

In [31]:
# The same row again: the value is now a list instead of a string.
df.loc[1, 'job_skills']

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [None]:
# The same conversion written as a lambda. Only values that are still strings
# are parsed, so this cell is safe to run even though 'job_skills' was already
# converted to lists above (a pd.notna(...) check raises an error on list values).

df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) else skill_list
)


## Calculating Projected Salary next year
* Senior roles: assume a 5% increase
* Other roles: assume a 3% increase

In [None]:
# Baseline: flat 3% projection for every role.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

In [37]:
def projected_salary(row):
    """Project next year's salary for one posting (one DataFrame row).

    Rows whose 'job_title_short' contains "Senior" get a 5% increase; every
    other row gets 3%.

    Note: this reuses the name of the single-value `projected_salary`
    function defined earlier in the notebook, so it replaces that function
    once this cell runs.

    Args:
        row: Mapping-like row exposing 'job_title_short' and
            'salary_year_avg'.

    Returns:
        The row's 'salary_year_avg' multiplied by 1.05 or 1.03.
    """
    multiplier = 1.05 if "Senior" in row['job_title_short'] else 1.03
    return multiplier * row['salary_year_avg']


# axis='columns' (the same as axis=1) passes each row to projected_salary.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis='columns')
df_salary.loc[:, ['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,112785.00,116168.5500
77,Data Engineer,144200.00,148526.0000
92,Data Engineer,123600.00,127308.0000
100,Data Scientist,235068.66,242120.7198
109,Data Analyst,91670.00,94420.1000
...,...,...,...
785624,Data Engineer,143392.48,147694.2544
785641,Data Engineer,154500.00,159135.0000
785648,Data Scientist,228531.25,235387.1875
785682,Data Scientist,162225.00,167091.7500
