In [1]:
#Import dependencies
import pandas as pd
import requests

#API pull for latest salary info from www.levels.fyi
# A timeout keeps the cell from hanging indefinitely on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing failure in .json().
salary_response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
salary_response.raise_for_status()
salaryData = salary_response.json()
salary_df = pd.DataFrame(salaryData)

In [2]:
#raw table preview: first 10 rows of the frame built from the API response
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,,7392,807,1
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,,,7419,807,2
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,,,11527,819,3
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,,11521,819,5
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,,1320,0,6
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,,,7472,807,7
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,,,7322,807,9
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,,,11527,819,10
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,,,11527,819,11
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,,,11521,819,12


In [3]:
#removing the id/detail columns that are not relevant to the project, then previewing the result
irrelevant_columns = ['cityid', 'dmaid', 'rowNumber', 'otherdetails']
salary_df = salary_df.drop(columns=irrelevant_columns)
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,


In [4]:
#converting the compensation/tenure columns to numeric dtypes to allow for summary stats
numeric_columns = [
    "totalyearlycompensation",
    "yearsofexperience",
    "yearsatcompany",
    "basesalary",
    "stockgrantvalue",
    "bonus",
]
for column in numeric_columns:
    salary_df[column] = pd.to_numeric(salary_df[column])
# NOTE(review): the raw preview shows basesalary/stockgrantvalue/bonus reported both in
# thousands (e.g. 107.0) and in whole dollars (e.g. 169000.0) -- confirm the units are
# consistent before relying on aggregates of these columns.

#converting timestamp strings to datetime
# infer_datetime_format is deprecated in pandas 2.x (it only emits a warning there), so it is
# omitted; to_datetime still parses these month-first timestamps on its own.
salary_df['timestamp'] = pd.to_datetime(salary_df['timestamp'])

In [5]:
# Create separate cols for city, state and country
def split_location(location):
    """Split a ', '-separated location string into ``[city, state, country]``.

    Parameters
    ----------
    location : str
        Location text such as ``"Seattle, WA"`` or ``"Vancouver, BC, Canada"``.
        Non-string values (e.g. ``None`` or a float NaN) are treated as missing.

    Returns
    -------
    list
        ``[city, state, country]``:
        * 2 parts -> country is ``'US'``
        * 3 parts -> country is the third part
        * 4 parts -> country is the third and fourth parts joined by ``', '``
        * any other number of parts -> country is ``None`` and the location is printed
        * non-string input -> ``[None, None, None]``
    """
    # Missing values are not strings and cannot be split; previously this raised AttributeError.
    if not isinstance(location, str):
        return [None, None, None]

    items = location.split(', ')
    city = items[0]
    # A location without any ', ' separator has no state part; previously items[1]
    # raised IndexError before the fallback branch below could be reached.
    state = items[1] if len(items) > 1 else None

    if len(items) == 2:
        country = 'US'
    elif len(items) == 3:
        country = items[2].strip()
    elif len(items) == 4:
        country = ', '.join(i.strip() for i in items[2:])
    else:
        # Unexpected shape: report it so it can be inspected, and leave country unset.
        country = None
        print(location)

    return [city, state, country]



In [6]:
# Parse every location once, then fan the [city, state, country] list out into its own columns
salary_df['loc_items'] = salary_df['location'].apply(split_location)
for position, column_name in enumerate(['city', 'state', 'country']):
    salary_df[column_name] = salary_df['loc_items'].apply(lambda parts, i=position: parts[i])

In [7]:
# preview the first rows to check the loc_items/city/state/country columns
salary_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,loc_items,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,"[Redwood City, CA, US]",Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,"San Francisco, CA",5.0,3.0,,,,,,"[San Francisco, CA, US]",San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,"Seattle, WA",8.0,0.0,,155.0,,,,"[Seattle, WA, US]",Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,"[Redmond, WA, US]",Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,"[Vancouver, BC, Canada]",Vancouver,BC,Canada


In [8]:
# dropping the raw location column and the intermediate loc_items column
location_columns = ['location', 'loc_items']
salary_df = salary_df.drop(columns=location_columns)
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,1.5,1.5,,107.0,20.0,10.0,,Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,5.0,3.0,,,,,,San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,8.0,0.0,,155.0,,,,Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,9.0,9.0,,169000.0,100000.0,30000.0,,Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,11.0,1.0,,120000.0,0.0,53000.0,,Vancouver,BC,Canada
5,2017-06-17 00:23:14,Apple,M1,Software Engineering Manager,372.0,7.0,5.0,,157.0,180.0,35.0,,Sunnyvale,CA,US
6,2017-06-20 10:58:51,Microsoft,60,Software Engineer,157.0,5.0,3.0,,,,,,Mountain View,CA,US
7,2017-06-20 18:49:59,Amazon,L5,Software Engineer,190.0,3.0,3.0,,110000.0,80000.0,,,Seattle,WA,US
8,2017-06-21 17:27:47,Microsoft,63,Software Engineer,208.0,8.5,8.5,,,,,,Seattle,WA,US
9,2017-06-22 12:37:51,Microsoft,65,Software Engineering Manager,300.0,15.0,11.0,,180.0,65.0,55.0,,Redmond,WA,US


In [9]:
#keeping only the rows whose country is 'US' for further exploration
is_us_row = salary_df['country'] == 'US'
us_df = salary_df.loc[is_us_row].copy()

In [10]:
#narrowing the US data down to rows titled 'Data Scientist'
is_data_scientist = us_df['title'] == 'Data Scientist'
us_df = us_df.loc[is_data_scientist].copy()
us_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
745,2018-06-05 14:06:30,LinkedIn,Senior,Data Scientist,233.0,4.0,0.0,Data Analysis,162.0,220.0,10.0,Male,San Francisco,CA,US
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US
782,2018-06-08 17:55:09,ebay,26,Data Scientist,180.0,10.0,5.0,,,,,Female,San Jose,CA,US
796,2018-06-10 19:39:35,Twitter,Staff,Data Scientist,500.0,4.0,4.0,ML / AI,200.0,280.0,20.0,Male,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62569,2021-08-16 16:17:19,IBM,L5,Data Scientist,145.0,6.0,5.0,Data,140.0,0.0,5.0,Male,New City,NY,US
62578,2021-08-16 17:08:58,Booz Allen Hamilton,Senior Consultant,Data Scientist,110.0,0.0,0.0,General,110.0,0.0,0.0,Male,West McLean,VA,US
62600,2021-08-16 21:02:37,Xandr,L1,Data Scientist,120.0,1.0,0.0,General,110.0,0.0,10.0,Male,Portland,OR,US
62610,2021-08-16 22:19:48,Facebook,L4,Data Scientist,233.0,2.0,2.0,Data Engineering,157.0,60.0,16.0,Male,Menlo Park,CA,US


In [11]:
#exploring the US data scientist rows reported for Apple
is_apple_row = us_df['company'] == 'Apple'
apple_df = us_df.loc[is_apple_row].copy()
apple_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
1699,2018-09-07 00:23:35,Apple,ICT3,Data Scientist,215.0,0.0,0.0,ML / AI,125.0,38.0,52.0,,Cupertino,CA,US
2112,2018-09-28 00:58:15,Apple,ICT3,Data Scientist,188.0,6.0,1.0,ML / AI,125.0,50.0,14.0,,Cupertino,CA,US
2463,2018-10-10 01:30:48,Apple,ICT4,Data Scientist,322.0,6.0,2.0,ML / AI,176.0,,,,Cupertino,CA,US
2841,2018-10-31 17:22:31,Apple,ICT5,Data Scientist,425.0,9.0,1.0,ML / AI,200.0,150.0,75.0,,Sunnyvale,CA,US
3912,2018-12-21 14:29:13,Apple,ICT4,Data Scientist,300.0,6.0,2.0,Services,176.0,100.0,20.0,Male,Cupertino,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57110,2021-07-01 12:30:34,Apple,ICT4,Data Scientist,225.0,8.0,8.0,General,136.0,81.0,8.0,Male,Cupertino,CA,US
58574,2021-07-15 06:57:45,Apple,ICT4,Data Scientist,223.0,8.0,5.0,Operations,158.0,45.0,20.0,Male,Austin,TX,US
59274,2021-07-21 13:32:13,Apple,ICT4,Data Scientist,248.0,3.0,0.0,Data Science,175.0,40.0,33.0,,Cupertino,CA,US
60992,2021-08-04 13:52:45,Apple,ICT4,Data Scientist,265.0,6.0,3.0,Data,150.0,100.0,15.0,Male,Cupertino,CA,US


In [12]:
#exploring how many rows report each level value at Apple
apple_df.level.value_counts()

ICT4    33
ICT3    17
ICT5     7
L3       7
IC4      3
L4       2
4        1
e3       1
E4       1
3        1
ICT6     1
ict5     1
Name: level, dtype: int64

In [13]:
#Correcting ict5 to ICT5 as they have similar basesalary/bonus; IC4 is likewise mapped to ICT4
#The replacement is restricted to the level column so identical strings in any other
#column are left untouched (a frame-wide replace would rewrite them too).
apple_df['level'] = apple_df['level'].replace({'ict5': 'ICT5', 'IC4': 'ICT4'})

In [15]:
#comparing per-level medians for insights on comparable levels
median_columns = [
    'totalyearlycompensation',
    'basesalary',
    'yearsofexperience',
    'yearsatcompany',
    'stockgrantvalue',
    'bonus',
]
apple_levels_df = apple_df.groupby('level').agg(dict.fromkeys(median_columns, 'median'))
apple_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,230.0,136.0,1.0,4.0,80.0,16.0
4,252.0,180.0,5.0,0.0,45.0,27.0
E4,144.0,144.0,2.0,0.0,0.0,0.0
ICT3,212.0,150.0,4.0,1.0,35.0,15.0
ICT4,267.5,175.0,6.0,2.0,81.0,20.0
ICT5,390.5,194.0,11.5,3.5,152.5,29.0
ICT6,645.0,375.0,7.0,2.0,180.0,90.0
L3,213.0,148.0,3.0,1.0,50.0,14.0
L4,356.5,171.0,5.0,2.5,165.0,20.0
e3,480.0,480.0,20.0,4.0,0.0,0.0


In [16]:
#Correcting 4 and L4 to ICT4, and 3 and L3 to ICT3, as they have similar basesalary/bonus
#The replacement is restricted to the level column: short strings such as '4' or '3'
#could otherwise match (and be rewritten) in any other string column of the frame.
level_aliases = {
    '4': 'ICT4',
    '3': 'ICT3',
    'L3': 'ICT3',
    'L4': 'ICT4',
}
apple_df['level'] = apple_df['level'].replace(level_aliases)

In [17]:
#re-checking the Apple level counts after the level replacements
apple_df.level.value_counts()

ICT4    39
ICT3    25
ICT5     8
E4       1
ICT6     1
e3       1
Name: level, dtype: int64

In [18]:
#comparing per-level medians again for insights on comparable levels
median_columns = [
    'totalyearlycompensation',
    'basesalary',
    'yearsofexperience',
    'yearsatcompany',
    'stockgrantvalue',
    'bonus',
]
apple_levels_df = apple_df.groupby('level').agg(dict.fromkeys(median_columns, 'median'))
apple_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E4,144.0,144.0,2.0,0.0,0.0,0.0
ICT3,213.0,148.0,4.0,1.0,38.0,14.0
ICT4,265.0,175.0,6.0,2.0,80.5,20.0
ICT5,390.5,194.0,11.5,3.5,152.5,29.0
ICT6,645.0,375.0,7.0,2.0,180.0,90.0
e3,480.0,480.0,20.0,4.0,0.0,0.0


In [21]:
#dropping the single e3 and E4 rows as their salary/stockgrantvalue/bonus are out of alignment with the other levels
#dropping ICT6 as well: its high base salary and high stock value indicate upper management
#NOTE(review): the earlier comment also mentioned removing M1 and M2 (software engineer managers),
#but nothing in this cell drops those levels
levels_to_drop = ['e3', 'E4', 'ICT6']
rows_to_drop = apple_df.index[apple_df['level'].isin(levels_to_drop)]
apple_df = apple_df.drop(index=rows_to_drop)

In [22]:
#confirming which levels remain after dropping e3, E4 and ICT6
apple_df.level.value_counts()

ICT4    39
ICT3    25
ICT5     8
Name: level, dtype: int64

In [24]:
#placing each level in our categories based on similar: basesalary/yearsofexperience/yearsatcompany/stockgrantvalue/bonus
level_to_category = {"ICT3": "Entry", "ICT4": "Mid", "ICT5": "Senior"}
for level_code, category_label in level_to_category.items():
    apple_df.loc[apple_df['level'] == level_code, "Category"] = category_label

In [25]:
#displaying the Apple frame, now including the Category column
apple_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country,Category
1699,2018-09-07 00:23:35,Apple,ICT3,Data Scientist,215.0,0.0,0.0,ML / AI,125.0,38.0,52.0,,Cupertino,CA,US,Entry
2112,2018-09-28 00:58:15,Apple,ICT3,Data Scientist,188.0,6.0,1.0,ML / AI,125.0,50.0,14.0,,Cupertino,CA,US,Entry
2463,2018-10-10 01:30:48,Apple,ICT4,Data Scientist,322.0,6.0,2.0,ML / AI,176.0,,,,Cupertino,CA,US,Mid
2841,2018-10-31 17:22:31,Apple,ICT5,Data Scientist,425.0,9.0,1.0,ML / AI,200.0,150.0,75.0,,Sunnyvale,CA,US,Senior
3912,2018-12-21 14:29:13,Apple,ICT4,Data Scientist,300.0,6.0,2.0,Services,176.0,100.0,20.0,Male,Cupertino,CA,US,Mid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57110,2021-07-01 12:30:34,Apple,ICT4,Data Scientist,225.0,8.0,8.0,General,136.0,81.0,8.0,Male,Cupertino,CA,US,Mid
58574,2021-07-15 06:57:45,Apple,ICT4,Data Scientist,223.0,8.0,5.0,Operations,158.0,45.0,20.0,Male,Austin,TX,US,Mid
59274,2021-07-21 13:32:13,Apple,ICT4,Data Scientist,248.0,3.0,0.0,Data Science,175.0,40.0,33.0,,Cupertino,CA,US,Mid
60992,2021-08-04 13:52:45,Apple,ICT4,Data Scientist,265.0,6.0,3.0,Data,150.0,100.0,15.0,Male,Cupertino,CA,US,Mid
