In [1]:
#Import dependencies
import pandas as pd
import requests

#API pull for latest salary info from www.levels.fyi
# A timeout keeps "Run All" from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON decode failure on an error page.
salary_response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
salary_response.raise_for_status()
salaryData = salary_response.json()
salary_df = pd.DataFrame(salaryData)

In [2]:
#raw table preview
# first 10 rows of the frame exactly as built from the API payload,
# before any columns are dropped or converted
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,,7392,807,1
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,,,7419,807,2
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,,,11527,819,3
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,,11521,819,5
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,,1320,0,6
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,,,7472,807,7
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,,,7322,807,9
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,,,11527,819,10
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,,,11527,819,11
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,,,11521,819,12


In [3]:
#dropping columns that are not relevant to project
unused_columns = ['cityid', 'dmaid', 'rowNumber', 'otherdetails']
salary_df = salary_df.drop(columns=unused_columns)
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,


In [4]:
#converting to float to allow for summary stats
# NOTE(review): the preview rows show basesalary / stockgrantvalue / bonus in
# mixed units (e.g. 107.0 next to 169000.0) - confirm and normalise before
# relying on aggregates of these columns.
numeric_columns = [
    "totalyearlycompensation",
    "yearsofexperience",
    "yearsatcompany",
    "basesalary",
    "stockgrantvalue",
    "bonus",
]
salary_df[numeric_columns] = salary_df[numeric_columns].apply(pd.to_numeric)

#converting timestamp from object to datetime
# infer_datetime_format is deprecated since pandas 2.0 (format inference is the
# default there) and only triggers a warning, so it is no longer passed.
salary_df['timestamp'] = pd.to_datetime(salary_df['timestamp'])

In [5]:
# Create separate cols for city, state and country
def split_location(location):
    """Split a comma-separated location string into ``[city, state, country]``.

    Parameters
    ----------
    location : str
        Location such as ``"Seattle, WA"`` or ``"Vancouver, BC, Canada"``.
        Non-string values (``None``, ``NaN``) are accepted and treated as
        missing.

    Returns
    -------
    list
        ``[city, state, country]``.

        * two parts -> the country is taken to be ``'US'``
        * three or four parts -> everything after the state is the country
          (so a country that itself contains a comma is kept whole)
        * any other shape -> the country is ``None`` and the raw value is
          printed for review; the state is ``None`` when only one part exists
        * missing input -> ``[None, None, None]``
    """
    # Missing values have nothing to split; calling .split on them would raise.
    if not isinstance(location, str):
        return [None, None, None]

    items = [item.strip() for item in location.split(',')]
    city = items[0]
    # A lone token (e.g. "Remote") has no state. Indexing items[1] directly
    # raised IndexError and aborted the whole .apply over the column.
    state = items[1] if len(items) > 1 else None

    if len(items) == 2:
        country = 'US'
    elif len(items) in (3, 4):
        country = ', '.join(items[2:])
    else:
        country = None
        print(location)

    return [city, state, country]



In [6]:
# Parse each location once, then fan the [city, state, country] parts out
# into their own columns.
salary_df['loc_items'] = salary_df['location'].apply(split_location)
for position, part_name in enumerate(['city', 'state', 'country']):
    salary_df[part_name] = salary_df['loc_items'].apply(lambda parts, i=position: parts[i])

In [7]:
# preview the frame with the new loc_items, city, state and country columns
salary_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,loc_items,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,"[Redwood City, CA, US]",Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,"San Francisco, CA",5.0,3.0,,,,,,"[San Francisco, CA, US]",San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,"Seattle, WA",8.0,0.0,,155.0,,,,"[Seattle, WA, US]",Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,"[Redmond, WA, US]",Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,"[Vancouver, BC, Canada]",Vancouver,BC,Canada


In [8]:
# dropping the raw location column and the intermediate loc_items column,
# now that city / state / country hold the parsed values
location_staging_columns = ['location', 'loc_items']
salary_df = salary_df.drop(columns=location_staging_columns)
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,1.5,1.5,,107.0,20.0,10.0,,Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,5.0,3.0,,,,,,San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,8.0,0.0,,155.0,,,,Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,9.0,9.0,,169000.0,100000.0,30000.0,,Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,11.0,1.0,,120000.0,0.0,53000.0,,Vancouver,BC,Canada
5,2017-06-17 00:23:14,Apple,M1,Software Engineering Manager,372.0,7.0,5.0,,157.0,180.0,35.0,,Sunnyvale,CA,US
6,2017-06-20 10:58:51,Microsoft,60,Software Engineer,157.0,5.0,3.0,,,,,,Mountain View,CA,US
7,2017-06-20 18:49:59,Amazon,L5,Software Engineer,190.0,3.0,3.0,,110000.0,80000.0,,,Seattle,WA,US
8,2017-06-21 17:27:47,Microsoft,63,Software Engineer,208.0,8.5,8.5,,,,,,Seattle,WA,US
9,2017-06-22 12:37:51,Microsoft,65,Software Engineering Manager,300.0,15.0,11.0,,180.0,65.0,55.0,,Redmond,WA,US


In [9]:
#isolating US data for further exploration
is_us = salary_df['country'] == 'US'
us_df = salary_df.loc[is_us].copy()

In [10]:
#isolating us data to data scientist titles
is_data_scientist = us_df['title'] == 'Data Scientist'
us_df = us_df.loc[is_data_scientist].copy()
us_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
745,2018-06-05 14:06:30,LinkedIn,Senior,Data Scientist,233.0,4.0,0.0,Data Analysis,162.0,220.0,10.0,Male,San Francisco,CA,US
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US
782,2018-06-08 17:55:09,ebay,26,Data Scientist,180.0,10.0,5.0,,,,,Female,San Jose,CA,US
796,2018-06-10 19:39:35,Twitter,Staff,Data Scientist,500.0,4.0,4.0,ML / AI,200.0,280.0,20.0,Male,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62569,2021-08-16 16:17:19,IBM,L5,Data Scientist,145.0,6.0,5.0,Data,140.0,0.0,5.0,Male,New City,NY,US
62578,2021-08-16 17:08:58,Booz Allen Hamilton,Senior Consultant,Data Scientist,110.0,0.0,0.0,General,110.0,0.0,0.0,Male,West McLean,VA,US
62600,2021-08-16 21:02:37,Xandr,L1,Data Scientist,120.0,1.0,0.0,General,110.0,0.0,10.0,Male,Portland,OR,US
62610,2021-08-16 22:19:48,Facebook,L4,Data Scientist,233.0,2.0,2.0,Data Engineering,157.0,60.0,16.0,Male,Menlo Park,CA,US


In [11]:
#exploring us/datascience/IL at amazon
# Company names are free text with inconsistent casing (the previews show both
# 'eBay' and 'ebay'), so Amazon is matched case-insensitively and with
# surrounding whitespace ignored rather than by exact string equality.
is_amazon = us_df['company'].str.strip().str.lower() == 'amazon'
AMZN_df = us_df.loc[is_amazon].copy()
AMZN_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US
862,2018-06-17 19:02:50,Amazon,L5,Data Scientist,200.0,3.0,0.0,ML / AI,150.0,105.0,81.0,Male,Seattle,WA,US
1159,2018-08-04 20:22:00,Amazon,L4,Data Scientist,130.0,7.0,1.0,ML / AI,,,,,Seattle,WA,US
1297,2018-08-15 06:10:00,Amazon,L,Data Scientist,200.0,5.0,1.0,Machine Learning,,,,,Boston,MA,US
1851,2018-09-16 14:00:08,Amazon,L5,Data Scientist,260.0,3.0,1.0,ML / AI,160.0,100.0,,,Palo Alto,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61951,2021-08-11 16:46:41,Amazon,L5,Data Scientist,193.0,9.0,2.0,General,153.0,40.0,0.0,Male,Seattle,WA,US
61993,2021-08-12 03:44:42,Amazon,L5,Data Scientist,267.0,10.0,2.0,General,157.0,75.0,35.0,,Seattle,WA,US
62042,2021-08-12 12:18:11,Amazon,L6,Data Scientist,250.0,8.0,5.0,ML,128.0,122.0,0.0,Female,Seattle,WA,US
62089,2021-08-12 18:01:18,Amazon,L5,Data Scientist,200.0,3.0,1.0,AWS,147.0,43.0,10.0,,Boston,MA,US


In [12]:
#exploring targeted results for amazon levels reporting
# frequency of each level label as reported, to spot spelling/format variants
AMZN_df.level.value_counts()

L5            127
L4             37
L6             35
L7              6
5               3
l6              2
4               2
L3              1
SDE III         1
IC5             1
L 4             1
L1              1
L               1
Senior SDE      1
Intern          1
SDE II          1
Senior          1
Name: level, dtype: int64

In [13]:
#normalize formatting inconsistency in levels
# Only the level column is rewritten. A frame-wide replace would also change
# any other cell whose entire value happens to be '5' or '4'.
level_aliases = {'5': 'L5', '4': 'L4', 'l6': 'L6', 'L 4': 'L4'}
AMZN_df['level'] = AMZN_df['level'].replace(level_aliases)

In [14]:
# re-count the level labels after the formatting clean-up
AMZN_df.level.value_counts()

L5            130
L4             40
L6             37
L7              6
L1              1
Senior SDE      1
IC5             1
L3              1
SDE III         1
Senior          1
L               1
Intern          1
SDE II          1
Name: level, dtype: int64

In [15]:
#comparing medians for insights on comparable levels
median_columns = [
    'totalyearlycompensation',
    'basesalary',
    'yearsofexperience',
    'yearsatcompany',
    'stockgrantvalue',
    'bonus',
]
Amzn_levels_df = AMZN_df.groupby('level')[median_columns].median()
Amzn_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IC5,215.0,150.0,12.0,0.0,50.0,15.0
Intern,180.0,180.0,1.0,0.0,0.0,0.0
L,200.0,,5.0,1.0,,
L1,126.0,100.0,1.0,1.0,16.0,10.0
L3,123.0,105.0,0.0,1.0,8.0,10.0
L4,161.0,135.0,2.0,0.0,16.0,20.0
L5,215.0,150.0,5.0,1.0,47.5,22.0
L6,283.0,157.0,8.0,4.0,120.0,0.0
L7,570.0,162.5,12.5,2.5,377.5,0.0
SDE II,300.0,150.0,2.0,2.0,120.0,30.0


In [16]:
#assigning IC5 to L5 as value is in line with L5 averages
#merging L3 and L1 datapoints into L4 as they are in line with stock grant values
# The relabelling is scoped to the level column: a frame-wide replace would
# also rewrite any other cell that happens to equal one of these labels.
merged_levels = {'IC5': 'L5', 'L3': 'L4', 'L1': 'L4'}

excluded_levels = [
    'L',           # bare 'L' with no level number
    #removing references to SDE as software development engineers are outside our focus
    'SDE II',
    'Senior SDE',
    'SDE III',
    #removed one off Senior with high experience but outside the medians for basesalary and stockgrant
    'Senior',
    #removed L7 as high stock grant value points to upper management
    'L7',
    #removed Intern
    'Intern',
]

# One boolean filter replaces the seven separate drop-by-index statements.
AMZN_df = AMZN_df.loc[~AMZN_df['level'].isin(excluded_levels)].copy()
AMZN_df['level'] = AMZN_df['level'].replace(merged_levels)

In [17]:
# confirm which level labels remain after the merges and removals
AMZN_df.level.value_counts()

L5    131
L4     42
L6     37
Name: level, dtype: int64

In [18]:
# Label each level with a seniority band; levels not listed here get NaN.
level_to_category = {"L4": "Entry", "L5": "Mid", "L6": "Senior"}
AMZN_df["Category"] = AMZN_df["level"].map(level_to_category)

In [19]:
# inspect the Amazon frame with the new Category column
AMZN_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country,Category
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US,Entry
862,2018-06-17 19:02:50,Amazon,L5,Data Scientist,200.0,3.0,0.0,ML / AI,150.0,105.0,81.0,Male,Seattle,WA,US,Mid
1159,2018-08-04 20:22:00,Amazon,L4,Data Scientist,130.0,7.0,1.0,ML / AI,,,,,Seattle,WA,US,Entry
1851,2018-09-16 14:00:08,Amazon,L5,Data Scientist,260.0,3.0,1.0,ML / AI,160.0,100.0,,,Palo Alto,CA,US,Mid
2053,2018-09-27 02:58:43,Amazon,L5,Data Scientist,115.0,6.0,2.0,ML / AI,,,,,Seattle,WA,US,Mid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61951,2021-08-11 16:46:41,Amazon,L5,Data Scientist,193.0,9.0,2.0,General,153.0,40.0,0.0,Male,Seattle,WA,US,Mid
61993,2021-08-12 03:44:42,Amazon,L5,Data Scientist,267.0,10.0,2.0,General,157.0,75.0,35.0,,Seattle,WA,US,Mid
62042,2021-08-12 12:18:11,Amazon,L6,Data Scientist,250.0,8.0,5.0,ML,128.0,122.0,0.0,Female,Seattle,WA,US,Senior
62089,2021-08-12 18:01:18,Amazon,L5,Data Scientist,200.0,3.0,1.0,AWS,147.0,43.0,10.0,,Boston,MA,US,Mid
