In [1]:
#Import dependencies
import pandas as pd
import requests

# Pull the latest salary records from www.levels.fyi.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors (4xx/5xx) here rather than as a confusing
# JSON-decoding failure on an error page.
salary_response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
salary_response.raise_for_status()
salaryData = salary_response.json()
salary_df = pd.DataFrame(salaryData)

In [2]:
# Preview the first 10 rows of the raw table, before any cleaning.
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,,7392,807,1
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,,,7419,807,2
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,,,11527,819,3
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,,11521,819,5
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,,1320,0,6
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,,,7472,807,7
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,,,7322,807,9
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,,,11527,819,10
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,,,11527,819,11
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,,,11521,819,12


In [3]:
# Remove the columns that are not relevant to this project, then preview the result.
unused_columns = ['cityid', 'dmaid', 'rowNumber', 'otherdetails']
salary_df = salary_df.drop(columns=unused_columns)
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100,"San Francisco, CA",5.0,3.0,,,,,
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310,"Seattle, WA",8.0,0.0,,155.0,,,
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,
5,6/17/2017 0:23:14,Apple,M1,Software Engineering Manager,372,"Sunnyvale, CA",7.0,5.0,,157.0,180.0,35.0,
6,6/20/2017 10:58:51,Microsoft,60,Software Engineer,157,"Mountain View, CA",5.0,3.0,,,,,
7,6/20/2017 18:49:59,Amazon,L5,Software Engineer,190,"Seattle, WA",3.0,3.0,,110000.0,80000.0,,
8,6/21/2017 17:27:47,Microsoft,63,Software Engineer,208,"Seattle, WA",8.5,8.5,,,,,
9,6/22/2017 12:37:51,Microsoft,65,Software Engineering Manager,300,"Redmond, WA",15.0,11.0,,180.0,65.0,55.0,


In [4]:
# Cast the compensation and tenure columns to numeric so summary stats can be computed.
numeric_columns = [
    "totalyearlycompensation",
    "yearsofexperience",
    "yearsatcompany",
    "basesalary",
    "stockgrantvalue",
    "bonus",
]
for column in numeric_columns:
    salary_df[column] = pd.to_numeric(salary_df[column])

# Convert timestamp from object to datetime.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is the
# default behaviour), so it is no longer passed.
salary_df['timestamp'] = pd.to_datetime(salary_df['timestamp'])

In [5]:
# Create separate cols for city, state and country
def split_location(location):
    """Split a ``", "``-separated location string into ``[city, state, country]``.

    Parameters
    ----------
    location : str
        Text such as ``"Seattle, WA"`` or ``"Vancouver, BC, Canada"``.

    Returns
    -------
    list
        ``[city, state, country]``:

        * two parts   -> country is ``'US'``
        * three parts -> country is the third part
        * four parts  -> country is the last two parts joined with ``", "``
        * any other number of parts -> country is ``None`` and the raw value is
          printed so it can be inspected; ``state`` is ``None`` when there is no
          second part
        * non-string input (e.g. ``None``/NaN) -> ``[None, None, None]``
    """
    if not isinstance(location, str):
        # Missing values cannot be split; return an all-empty result instead of raising.
        return [None, None, None]

    # Strip every part so city/state are cleaned the same way as country.
    items = [item.strip() for item in location.split(', ')]
    city = items[0]
    # Guard the lookup: a location with no ", " has a single part only.
    state = items[1] if len(items) > 1 else None

    if len(items) == 2:
        country = 'US'
    elif len(items) in (3, 4):
        country = ', '.join(items[2:])
    else:
        country = None
        print(location)

    return [city, state, country]



In [6]:
# Split every location once, then copy each position of the resulting
# [city, state, country] list into its own column.
salary_df['loc_items'] = salary_df['location'].map(split_location)
for position, column in enumerate(('city', 'state', 'country')):
    salary_df[column] = salary_df['loc_items'].map(
        lambda parts, position=position: parts[position]
    )

In [7]:
# Preview the frame with the new loc_items/city/state/country columns.
salary_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,loc_items,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,"[Redwood City, CA, US]",Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,"San Francisco, CA",5.0,3.0,,,,,,"[San Francisco, CA, US]",San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,"Seattle, WA",8.0,0.0,,155.0,,,,"[Seattle, WA, US]",Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,"[Redmond, WA, US]",Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,"[Vancouver, BC, Canada]",Vancouver,BC,Canada


In [8]:
# Drop the raw location string and the intermediate loc_items list; the
# city/state/country columns now carry that information.
salary_df = salary_df.drop(columns=['location', 'loc_items'])
salary_df.head(10)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
0,2017-06-07 11:33:27,Oracle,L3,Product Manager,127.0,1.5,1.5,,107.0,20.0,10.0,,Redwood City,CA,US
1,2017-06-10 17:11:29,eBay,SE 2,Software Engineer,100.0,5.0,3.0,,,,,,San Francisco,CA,US
2,2017-06-11 14:53:57,Amazon,L7,Product Manager,310.0,8.0,0.0,,155.0,,,,Seattle,WA,US
3,2017-06-14 21:22:25,Microsoft,64,Software Engineering Manager,200.0,9.0,9.0,,169000.0,100000.0,30000.0,,Redmond,WA,US
4,2017-06-16 10:44:01,Amazon,L5,Software Engineer,173.0,11.0,1.0,,120000.0,0.0,53000.0,,Vancouver,BC,Canada
5,2017-06-17 00:23:14,Apple,M1,Software Engineering Manager,372.0,7.0,5.0,,157.0,180.0,35.0,,Sunnyvale,CA,US
6,2017-06-20 10:58:51,Microsoft,60,Software Engineer,157.0,5.0,3.0,,,,,,Mountain View,CA,US
7,2017-06-20 18:49:59,Amazon,L5,Software Engineer,190.0,3.0,3.0,,110000.0,80000.0,,,Seattle,WA,US
8,2017-06-21 17:27:47,Microsoft,63,Software Engineer,208.0,8.5,8.5,,,,,,Seattle,WA,US
9,2017-06-22 12:37:51,Microsoft,65,Software Engineering Manager,300.0,15.0,11.0,,180.0,65.0,55.0,,Redmond,WA,US


In [9]:
# Isolate the US rows for further exploration (copied, so later edits leave salary_df alone).
is_us = salary_df['country'] == 'US'
us_df = salary_df.loc[is_us].copy()

In [10]:
# Narrow the US data to rows whose title is exactly 'Data Scientist'.
is_data_scientist = us_df['title'].eq('Data Scientist')
us_df = us_df.loc[is_data_scientist].copy()
us_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
745,2018-06-05 14:06:30,LinkedIn,Senior,Data Scientist,233.0,4.0,0.0,Data Analysis,162.0,220.0,10.0,Male,San Francisco,CA,US
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,,92000.0,48000.0,,Male,Seattle,WA,US
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,ML / AI,165.0,28.0,23.0,Male,Seattle,WA,US
782,2018-06-08 17:55:09,ebay,26,Data Scientist,180.0,10.0,5.0,,,,,Female,San Jose,CA,US
796,2018-06-10 19:39:35,Twitter,Staff,Data Scientist,500.0,4.0,4.0,ML / AI,200.0,280.0,20.0,Male,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62569,2021-08-16 16:17:19,IBM,L5,Data Scientist,145.0,6.0,5.0,Data,140.0,0.0,5.0,Male,New City,NY,US
62578,2021-08-16 17:08:58,Booz Allen Hamilton,Senior Consultant,Data Scientist,110.0,0.0,0.0,General,110.0,0.0,0.0,Male,West McLean,VA,US
62600,2021-08-16 21:02:37,Xandr,L1,Data Scientist,120.0,1.0,0.0,General,110.0,0.0,10.0,Male,Portland,OR,US
62610,2021-08-16 22:19:48,Facebook,L4,Data Scientist,233.0,2.0,2.0,Data Engineering,157.0,60.0,16.0,Male,Menlo Park,CA,US


In [11]:
# Explore the US Data Scientist rows reported for Facebook
# (exact, case-sensitive match on the company name).
is_facebook = us_df['company'].eq('Facebook')
fb_df = us_df.loc[is_facebook].copy()
fb_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country
858,2018-06-17 11:39:38,Facebook,5,Data Scientist,370.0,8.0,3.00,,190.0,140.0,40.0,Male,Seattle,WA,US
971,2018-07-05 13:43:39,Facebook,IC5,Data Scientist,225.0,11.0,6.00,,155.0,,,,Menlo Park,CA,US
1666,2018-09-06 00:50:18,Facebook,IC4,Data Scientist,200.0,5.0,0.25,Infrastructure,150.0,35.0,15.0,,Menlo Park,CA,US
1864,2018-09-17 15:37:28,Facebook,L5,Data Scientist,240.0,10.0,1.00,Full Stack,171.0,45.0,24.0,,Seattle,WA,US
2766,2018-10-28 16:36:57,Facebook,IC3,Data Scientist,193.0,1.0,0.00,ML / AI,,,,,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61309,2021-08-06 11:08:11,Facebook,IC4,Data Scientist,302.0,9.0,3.00,Data Engineering,161.0,125.0,16.0,Female,Menlo Park,CA,US
62109,2021-08-12 21:54:41,Facebook,L5,Data Scientist,334.0,4.0,2.00,General,182.0,125.0,27.0,Male,New York,NY,US
62244,2021-08-13 16:22:47,Facebook,M2,Data Scientist,700.0,16.0,2.00,Product Data Science,255.0,375.0,70.0,Male,Menlo Park,CA,US
62344,2021-08-14 08:07:55,Facebook,M2,Data Scientist,631.0,11.0,0.00,General,275.0,287.0,69.0,Male,New York,NY,US


In [12]:
# Count how often each level label is reported in the Facebook rows (fb_df).
fb_df.level.value_counts()

IC4                          50
IC5                          39
IC3                          16
IC6                          14
L4                           13
E4                           11
L5                            7
E5                            4
E3                            3
L3                            3
L5                            2
M2                            2
E6                            2
L5 Product Growth Analyst     1
6                             1
IC7                           1
5                             1
M1                            1
l4                            1
L6                            1
Name: level, dtype: int64

In [13]:
# Median compensation and tenure per level, to spot labels that look comparable.
median_columns = ['totalyearlycompensation', 'basesalary', 'yearsofexperience',
                  'yearsatcompany', 'stockgrantvalue', 'bonus']
fb_levels_df = fb_df.groupby('level')[median_columns].median()
fb_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,370.0,190.0,8.0,3.0,140.0,40.0
6,338.0,216.0,10.0,0.0,80.0,42.0
E3,157.0,120.0,0.0,0.0,25.0,12.0
E4,230.0,152.5,3.0,0.0,55.0,16.0
E5,337.5,190.0,8.0,3.0,100.0,30.0
E6,482.5,216.5,15.0,2.75,217.5,48.5
IC3,175.0,125.0,2.0,1.0,30.0,11.0
IC4,218.5,151.5,3.5,1.0,46.5,16.0
IC5,295.0,182.0,7.0,2.0,86.5,27.0
IC6,385.0,218.0,10.0,3.0,130.0,43.5


In [14]:
# Map the bare-number and E* level labels onto the IC* naming, as they have similar
# basesalary/bonus medians. The replacement is limited to the `level` column so an
# identical string in any other column is never rewritten, and each label goes
# straight to its final value:
#   '5' and 'E5' -> 'IC5',  '6' and 'E6' -> 'IC6',  'E4' -> 'IC4'
fb_df['level'] = fb_df['level'].replace({
    '5': 'IC5',
    '6': 'IC6',
    'E4': 'IC4',
    'E5': 'IC5',
    'E6': 'IC6',
})

In [15]:
# Re-count the Facebook level labels after folding the bare-number and E* labels into IC*.
fb_df.level.value_counts()

IC4                          61
IC5                          44
IC6                          17
IC3                          16
L4                           13
L5                            7
L3                            3
E3                            3
L5                            2
M2                            2
l4                            1
M1                            1
IC7                           1
L6                            1
L5 Product Growth Analyst     1
Name: level, dtype: int64

In [16]:
# Recompute the per-level medians to compare the remaining labels against the IC* levels.
median_columns = ['totalyearlycompensation', 'basesalary', 'yearsofexperience',
                  'yearsatcompany', 'stockgrantvalue', 'bonus']
fb_levels_df = fb_df.groupby('level')[median_columns].median()
fb_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E3,157.0,120.0,0.0,0.0,25.0,12.0
IC3,175.0,125.0,2.0,1.0,30.0,11.0
IC4,220.0,151.5,3.0,1.0,49.0,16.0
IC5,297.0,182.0,7.5,2.0,88.0,27.0
IC6,385.0,216.0,10.0,3.0,135.0,44.0
IC7,740.0,270.0,10.0,4.0,400.0,70.0
L3,165.0,115.0,0.0,0.0,25.0,15.0
L4,211.0,155.0,3.0,1.0,41.0,16.0
L5,300.0,185.0,7.0,1.0,81.0,29.0
L5,317.0,191.0,4.0,1.5,102.5,26.0


In [17]:
# Map the L4/L5/L6 labels (plus the 'I4' and 'L5 Product Growth Analyst' variants)
# onto IC4/IC5/IC6, as they have similar basesalary/bonus medians. Only the `level`
# column is touched, so matching strings in other columns are left alone.
fb_df['level'] = fb_df['level'].replace({
    'L4': 'IC4',
    'I4': 'IC4',
    'L5': 'IC5',
    'L5 Product Growth Analyst': 'IC5',
    'L6': 'IC6',
})

In [18]:
# Re-count the level labels after the L4/L5/L6 corrections.
fb_df.level.value_counts()

IC4    74
IC5    52
IC6    18
IC3    16
L3      3
E3      3
L5      2
M2      2
l4      1
M1      1
IC7     1
Name: level, dtype: int64

In [19]:
# Map L3 and E3 onto IC3, as they have similar basesalary/bonus medians.
# Only the `level` column is touched, so matching strings in other columns are left alone.
fb_df['level'] = fb_df['level'].replace({'L3': 'IC3', 'E3': 'IC3'})

In [20]:
# Re-count the level labels after folding L3/E3 into IC3.
fb_df.level.value_counts()

IC4    74
IC5    52
IC3    22
IC6    18
L5      2
M2      2
l4      1
M1      1
IC7     1
Name: level, dtype: int64

In [21]:
# Clean up the two labels that survived the earlier exact-match corrections:
#   'L5 ' (note the trailing space) -> 'IC5'
#   'l4'  (lower-case L)            -> 'IC4'
# Only the `level` column is touched, so matching strings in other columns are left alone.
fb_df['level'] = fb_df['level'].replace({'L5 ': 'IC5', 'l4': 'IC4'})

In [22]:
# Re-count the level labels after the 'L5 ' and 'l4' corrections.
fb_df.level.value_counts()

IC4    75
IC5    54
IC3    22
IC6    18
M2      2
M1      1
IC7     1
Name: level, dtype: int64

In [23]:
# Remove M1 and M2 (both software engineering manager levels) and IC7, whose high
# stock value indicates upper management.
excluded_levels = ['M1', 'M2', 'IC7']
fb_df = fb_df.loc[~fb_df['level'].isin(excluded_levels)].copy()

In [24]:
# Per-level medians for the cleaned IC* levels, used to compare the levels with each other.
median_columns = ['totalyearlycompensation', 'basesalary', 'yearsofexperience',
                  'yearsatcompany', 'stockgrantvalue', 'bonus']
fb_levels_df = fb_df.groupby('level')[median_columns].median()
fb_levels_df

Unnamed: 0_level_0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,stockgrantvalue,bonus
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IC3,168.5,120.0,1.0,0.5,30.0,12.0
IC4,220.0,153.0,3.0,1.0,45.0,16.0
IC5,297.0,183.0,7.0,2.0,87.0,27.0
IC6,390.0,218.0,10.0,3.0,135.0,44.0


In [25]:
# Place each level in one of our categories, based on similar
# basesalary/yearsofexperience/yearsatcompany/stockgrantvalue/bonus.
# A level that is not listed gets a missing Category.
level_to_category = {"IC3": "Entry", "IC4": "Entry", "IC5": "Mid", "IC6": "Senior"}
fb_df["Category"] = fb_df["level"].map(level_to_category)

In [26]:
# Display the cleaned Facebook frame, now including the Category column.
fb_df

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,city,state,country,Category
858,2018-06-17 11:39:38,Facebook,IC5,Data Scientist,370.0,8.0,3.00,,190.0,140.0,40.0,Male,Seattle,WA,US,Mid
971,2018-07-05 13:43:39,Facebook,IC5,Data Scientist,225.0,11.0,6.00,,155.0,,,,Menlo Park,CA,US,Mid
1666,2018-09-06 00:50:18,Facebook,IC4,Data Scientist,200.0,5.0,0.25,Infrastructure,150.0,35.0,15.0,,Menlo Park,CA,US,Entry
1864,2018-09-17 15:37:28,Facebook,IC5,Data Scientist,240.0,10.0,1.00,Full Stack,171.0,45.0,24.0,,Seattle,WA,US,Mid
2766,2018-10-28 16:36:57,Facebook,IC3,Data Scientist,193.0,1.0,0.00,ML / AI,,,,,San Francisco,CA,US,Entry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59853,2021-07-26 13:28:11,Facebook,IC4,Data Scientist,304.0,9.0,3.00,Data Engineering,161.0,125.0,18.0,Female,Menlo Park,CA,US,Entry
60192,2021-07-28 22:39:30,Facebook,IC5,Data Scientist,265.0,11.0,2.00,Infrastructure,190.0,60.0,15.0,Male,Menlo Park,CA,US,Mid
61309,2021-08-06 11:08:11,Facebook,IC4,Data Scientist,302.0,9.0,3.00,Data Engineering,161.0,125.0,16.0,Female,Menlo Park,CA,US,Entry
62109,2021-08-12 21:54:41,Facebook,IC5,Data Scientist,334.0,4.0,2.00,General,182.0,125.0,27.0,Male,New York,NY,US,Mid


In [27]:
#import dependency
import pymongo
from pymongo import MongoClient

In [28]:
# Connect to the MongoDB server at 127.0.0.1:27017 and select the ds_salaries database.
conn = "mongodb://127.0.0.1:27017/"
client = MongoClient(conn)
db = client["ds_salaries"]

In [29]:
# Target the `facebook` collection and turn fb_df into a list with one dict per row.
collection = db["facebook"]
fb_df_dict = fb_df.to_dict(orient="records")

In [31]:
# Insert every row dict into the facebook collection.
# NOTE(review): nothing here clears the collection first, so running the notebook
# again appears to add the same rows a second time; confirm whether that is intended.
collection.insert_many(fb_df_dict)

<pymongo.results.InsertManyResult at 0x21fe1ed4800>

In [32]:
# Fetch a single document from the facebook collection to check the stored shape.
db.facebook.find_one()

{'_id': ObjectId('617c72c7f3513d0dc98561b5'),
 'timestamp': datetime.datetime(2018, 6, 17, 11, 39, 38),
 'company': 'Facebook',
 'level': 'IC5',
 'title': 'Data Scientist',
 'totalyearlycompensation': 370.0,
 'yearsofexperience': 8.0,
 'yearsatcompany': 3.0,
 'tag': '',
 'basesalary': 190.0,
 'stockgrantvalue': 140.0,
 'bonus': 40.0,
 'gender': 'Male',
 'city': 'Seattle',
 'state': 'WA',
 'country': 'US',
 'Category': 'Mid'}