In [1]:
#Import dependencies
import pandas as pd
import requests

#API pull for latest salary info from www.levels.fyi
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP error (4xx/5xx) here
# rather than as a confusing JSON-decoding failure on an error page.
salary_response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
salary_response.raise_for_status()
salaryData = salary_response.json()
salary_df = pd.DataFrame(salaryData)

In [2]:
#first ten rows of the untouched table, to see which columns came back
salary_df.head(n=10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,,7392,807,1
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,,,7419,807,2
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,,,11527,819,3
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,,11521,819,5
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,,1320,0,6
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,,,7472,807,7
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,,,7322,807,9
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,,,11527,819,10
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,,,11527,819,11
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,,,11521,819,12


In [3]:
#discard the id / free-text columns that this project does not use
salary_df = salary_df.drop(columns=['cityid', 'dmaid', 'rowNumber', 'otherdetails'])
salary_df.head(n=10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,


In [4]:
#converting to float to allow for summary stats
# NOTE(review): the preview output shows basesalary/stockgrantvalue/bonus mixing
# values such as 107.0 and 169000.0 -- presumably thousands vs. whole dollars.
# Confirm and normalise before trusting medians of these columns.
numeric_cols = ['totalyearlycompensation', 'yearsofexperience', 'yearsatcompany',
                'basesalary', 'stockgrantvalue', 'bonus']
salary_df[numeric_cols] = salary_df[numeric_cols].apply(pd.to_numeric)

#converting timestamp from object to datetime
# infer_datetime_format is deprecated (pandas >= 2.0 infers the format by
# default), so it is dropped here to avoid the deprecation warning.
salary_df['timestamp'] = pd.to_datetime(salary_df['timestamp'])

In [5]:
# Create separate cols for city, state and country
def split_location(location):
    """Split a ``', '``-separated location string into ``[city, state, country]``.

    Parameters
    ----------
    location : str or missing value
        Text such as ``"Seattle, WA"`` or ``"Vancouver, BC, Canada"``.

    Returns
    -------
    list
        ``[city, state, country]``:

        * two parts    -> country is ``'US'``
        * three parts  -> country is the third part
        * four parts   -> country is the third and fourth parts joined by ``', '``
        * anything else (one part, or five or more) -> country is ``None`` and
          the raw value is printed so it can be inspected; state is ``None``
          when there is no second part
        * non-string input (``None``, ``NaN``) -> ``[None, None, None]``
    """
    # Missing locations reach here as None/NaN; they have no .split().
    if not isinstance(location, str):
        return [None, None, None]

    items = location.split(', ')
    city = items[0]
    # A value with no ', ' has a single item; indexing items[1] would raise
    # IndexError before the diagnostic branch below could run.
    state = items[1] if len(items) > 1 else None

    if len(items) == 2:
        country = 'US'
    elif len(items) in (3, 4):
        country = ', '.join(part.strip() for part in items[2:])
    else:
        country = None
        print(location)

    return [city, state, country]



In [6]:
# parse each location once, then fan the three parts out into their own columns
salary_df['loc_items'] = salary_df['location'].apply(split_location)
salary_df['city'] = salary_df['loc_items'].map(lambda parts: parts[0])
salary_df['state'] = salary_df['loc_items'].map(lambda parts: parts[1])
salary_df['country'] = salary_df['loc_items'].map(lambda parts: parts[2])

In [7]:
# check the new city / state / country columns
salary_df.head(n=5)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,loc_items,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,"[Redwood City, CA, US]",Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,"San Francisco, CA",5.0,3.0,,,,,,"[San Francisco, CA, US]",San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,"Seattle, WA",8.0,0.0,,155.0,,,,"[Seattle, WA, US]",Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,"[Redmond, WA, US]",Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,"[Vancouver, BC, Canada]",Vancouver,BC,Canada


In [8]:
# dropping the raw location column and the intermediate loc_items list
salary_df = salary_df.drop(columns=['location', 'loc_items'])
salary_df.head(n=10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,1.5,1.5,,107.0,20.0,10.0,,Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,5.0,3.0,,,,,,San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,8.0,0.0,,155.0,,,,Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,9.0,9.0,,169000.0,100000.0,30000.0,,Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,11.0,1.0,,120000.0,0.0,53000.0,,Vancouver,BC,Canada
5,2017-06-17 00:23:14,Apple,M1,Software Engineering Manager,372.0,7.0,5.0,,157.0,180.0,35.0,,Sunnyvale,CA,US
6,2017-06-20 10:58:51,Microsoft,60,Software Engineer,157.0,5.0,3.0,,,,,,Mountain View,CA,US
7,2017-06-20 18:49:59,Amazon,L5,Software Engineer,190.0,3.0,3.0,,110000.0,80000.0,,,Seattle,WA,US
8,2017-06-21 17:27:47,Microsoft,63,Software Engineer,208.0,8.5,8.5,,,,,,Seattle,WA,US
9,2017-06-22 12:37:51,Microsoft,65,Software Engineering Manager,300.0,15.0,11.0,,180.0,65.0,55.0,,Redmond,WA,US


In [9]:
#keep only rows whose country is 'US' (copy so later edits do not touch salary_df)
us_df = salary_df.loc[salary_df['country'] == 'US'].copy()

In [13]:
#narrow the US rows down to the 'Data Scientist' title (us_df is overwritten)
us_df = us_df.loc[us_df['title'] == 'Data Scientist'].copy()
us_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
745,2018-06-05 14:06:30,LinkedIn,Senior,Data Scientist,233.0,4.0,0.0,Data Analysis,162.0,220.0,10.0,Male,San Francisco,CA,US
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US
782,2018-06-08 17:55:09,ebay,26,Data Scientist,180.0,10.0,5.0,,,,,Female,San Jose,CA,US
796,2018-06-10 19:39:35,Twitter,Staff,Data Scientist,500.0,4.0,4.0,ML / AI,200.0,280.0,20.0,Male,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62569,2021-08-16 16:17:19,IBM,L5,Data Scientist,145.0,6.0,5.0,Data,140.0,0.0,5.0,Male,New City,NY,US
62578,2021-08-16 17:08:58,Booz Allen Hamilton,Senior Consultant,Data Scientist,110.0,0.0,0.0,General,110.0,0.0,0.0,Male,West McLean,VA,US
62600,2021-08-16 21:02:37,Xandr,L1,Data Scientist,120.0,1.0,0.0,General,110.0,0.0,10.0,Male,Portland,OR,US
62610,2021-08-16 22:19:48,Facebook,L4,Data Scientist,233.0,2.0,2.0,Data Engineering,157.0,60.0,16.0,Male,Menlo Park,CA,US


In [14]:
#exploring US data scientist rows at Microsoft
Micro_df = us_df.loc[us_df['company'] == 'Microsoft'].copy()
Micro_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US
876,2018-06-20 00:47:43,Microsoft,65,Data Scientist,340.0,11.0,11.0,ML / AI,200.0,80.0,60.0,Male,Bellevue,WA,US
1302,2018-08-15 12:23:35,Microsoft,67,Data Scientist,500.0,20.0,3.0,ML / AI,,,,,Seattle,WA,US
1315,2018-08-15 15:38:02,Microsoft,62,Data Scientist,220.0,2.0,2.0,ML / AI,137.0,67.0,15.0,,Redmond,WA,US
1379,2018-08-17 01:25:50,Microsoft,65,Data Scientist,340.0,9.0,2.0,ML / AI,203.0,96.0,41.0,,Redmond,WA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61201,2021-08-05 15:05:29,Microsoft,66,Data Scientist,757.0,5.0,0.0,"ML, AI",255.0,420.0,82.0,,Mountain View,CA,US
61294,2021-08-06 09:13:11,Microsoft,60,Data Scientist,162.0,0.0,0.0,NLP,120.0,30.0,12.0,Male,Redmond,WA,US
62091,2021-08-12 18:34:15,Microsoft,61,Data Scientist,205.0,3.0,3.0,ML,130.0,60.0,15.0,Male,Boston,MA,US
62252,2021-08-13 16:39:04,Microsoft,64,Data Scientist,230.0,13.0,4.0,"Identity, Cloud",180.0,20.0,30.0,,Seattle,WA,US


In [15]:
#how often each level label is reported for Microsoft
Micro_df['level'].value_counts()

62              51
61              36
63              34
64              21
60              20
65              13
59               6
66               6
L60              2
67               2
L62              2
L64              1
Principal EM     1
L61              1
Principal PM     1
SDE II           1
L5               1
Name: level, dtype: int64

In [16]:
#normalize formatting inconsistency in levels
# Strip the leading "L" from two-digit levels ("L60" -> "60", "L62" -> "62", ...).
# The replacement is limited to the level column so no other column can be
# altered, and the regex covers every "L" + two-digit label instead of four
# hard-coded ones. One-digit labels such as "L5" are left untouched.
Micro_df['level'] = Micro_df['level'].replace(r'^L(\d{2})$', r'\1', regex=True)

In [17]:
# level counts again, to check the labels after normalisation
Micro_df['level'].value_counts()

62              53
61              37
63              34
60              22
64              22
65              13
59               6
66               6
67               2
SDE II           1
Principal EM     1
Principal PM     1
L5               1
Name: level, dtype: int64

In [18]:
#per-level medians, used to judge which levels are comparable
Micro_levels_df = Micro_df.groupby(['level']).agg(
    dict.fromkeys(
        ['totalyearlycompensation', 'basesalary', 'yearsofexperience',
         'yearsatcompany', 'stockgrantvalue', 'bonus'],
        'median',
    )
)
Micro_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
59,148.0,109.0,0.5,0.0,30.0,20.0
60,169.5,120.0,1.0,0.1,33.0,15.0
61,178.0,131.0,2.0,1.0,35.0,15.0
62,190.0,143.0,4.0,2.0,30.0,15.0
63,222.0,159.5,5.0,3.0,37.5,22.0
64,249.0,179.5,10.0,4.5,30.0,30.0
65,320.0,200.0,10.0,6.0,60.0,41.0
66,377.5,225.0,15.0,1.5,95.0,52.5
67,506.5,230.0,16.0,3.0,225.0,58.0
L5,280.0,187.0,5.0,5.0,72.0,21.0


In [22]:
#drop every row whose level falls outside the data-scientist focus, in one pass
Micro_df = Micro_df.drop(
    index=Micro_df.index[
        Micro_df['level'].isin([
            'SDE II',        # software development engineers are outside our focus
            'Principal EM',  # software engineering manager
            'Principal PM',  # product manager
            '66',            # high stock grant value points to upper management
            '67',            # same reasoning as 66
            'L5',            # not a Microsoft level format, and values are out of line with other levels
        ])
    ]
)

In [23]:
# level counts after the out-of-scope rows are dropped
Micro_df['level'].value_counts()

62    53
61    37
63    34
60    22
64    22
65    13
59     6
Name: level, dtype: int64

In [26]:
#bucket levels into categories judged similar on basesalary/yearsofexperience/yearsatcompany/stockgrantvalue/bonus
Micro_df.loc[Micro_df['level'].isin(["59", "60", "61"]), "Category"] = "Entry"
Micro_df.loc[Micro_df['level'].isin(["62", "63"]), "Category"] = "Mid"
Micro_df.loc[Micro_df['level'].isin(["64", "65"]), "Category"] = "Senior"

In [27]:
#display the remaining Microsoft rows together with the new Category column
Micro_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country,Category
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US,Senior
876,2018-06-20 00:47:43,Microsoft,65,Data Scientist,340.0,11.0,11.0,ML / AI,200.0,80.0,60.0,Male,Bellevue,WA,US,Senior
1315,2018-08-15 15:38:02,Microsoft,62,Data Scientist,220.0,2.0,2.0,ML / AI,137.0,67.0,15.0,,Redmond,WA,US,Mid
1379,2018-08-17 01:25:50,Microsoft,65,Data Scientist,340.0,9.0,2.0,ML / AI,203.0,96.0,41.0,,Redmond,WA,US,Senior
1552,2018-08-27 12:34:51,Microsoft,62,Data Scientist,149.0,9.0,1.0,ML / AI,,,,,Redmond,WA,US,Mid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59293,2021-07-21 15:52:02,Microsoft,62,Data Scientist,225.0,2.0,2.0,machine learning,155.0,50.0,20.0,Male,Seattle,WA,US,Mid
61294,2021-08-06 09:13:11,Microsoft,60,Data Scientist,162.0,0.0,0.0,NLP,120.0,30.0,12.0,Male,Redmond,WA,US,Entry
62091,2021-08-12 18:34:15,Microsoft,61,Data Scientist,205.0,3.0,3.0,ML,130.0,60.0,15.0,Male,Boston,MA,US,Entry
62252,2021-08-13 16:39:04,Microsoft,64,Data Scientist,230.0,13.0,4.0,"Identity, Cloud",180.0,20.0,30.0,,Seattle,WA,US,Senior
