# In [2]:
#Import dependencies
import pandas as pd
import requests

# API pull for the latest salary info from www.levels.fyi.
# The timeout stops the script from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error as such instead of letting it
# surface later as a confusing JSON decoding failure.
response = requests.get('https://www.levels.fyi/js/salaryData.json', timeout=30)
response.raise_for_status()
salaryData = response.json()
salary_df = pd.DataFrame(salaryData)

# raw table preview
salary_df.head(10)

# dropping columns that are not relevant to the project
salary_df = salary_df.drop(['cityid', 'dmaid', 'rowNumber', 'otherdetails'], axis=1)
salary_df.head(10)

# converting the compensation / tenure columns to numeric dtypes to allow
# for summary stats
numeric_columns = [
    "totalyearlycompensation",
    "yearsofexperience",
    "yearsatcompany",
    "basesalary",
    "stockgrantvalue",
    "bonus",
]
for column in numeric_columns:
    salary_df[column] = pd.to_numeric(salary_df[column])

# converting timestamp from object to datetime
# (infer_datetime_format is deprecated in pandas 2.x and has no effect there,
# so it is no longer passed)
salary_df['timestamp'] = pd.to_datetime(salary_df['timestamp'])

# Create separate cols for city, state and country
def split_location(location):
    """Split a ``"City, State[, Country]"`` string into its parts.

    The string is split on ``', '``. The first piece is the city and the
    second the state. The country is derived from the number of pieces:

    * 2 pieces -> ``'US'`` (no country given)
    * 3 pieces -> the third piece, stripped
    * 4 pieces -> the third and fourth pieces, stripped and re-joined with ``', '``
    * anything else -> ``None``; the raw value is printed for inspection

    Args:
        location: The raw location value. Non-string values (for example a
            missing value) are treated as unparseable.

    Returns:
        A three-element list ``[city, state, country]``. Parts that cannot
        be determined are ``None``.
    """
    # A missing location has no .split(); report it like any other
    # unparseable value instead of raising AttributeError.
    if not isinstance(location, str):
        print(location)
        return [None, None, None]

    items = location.split(', ')
    city = items[0]
    # A location without a ', ' separator has no state part; indexing
    # items[1] unconditionally would raise IndexError before the fallback
    # branch below could run.
    state = items[1] if len(items) > 1 else None

    if len(items) == 2:
        country = 'US'
    elif len(items) == 3:
        country = items[2].strip()
    elif len(items) == 4:
        country = ', '.join(part.strip() for part in items[2:])
    else:
        # Unexpected shape: surface the raw value so it can be reviewed.
        country = None
        print(location)

    return [city, state, country]



# Parse every location once, keep the [city, state, country] list in a
# scratch column, then fan its three positions out into their own columns.
salary_df['loc_items'] = salary_df.location.apply(split_location)
for position, column_name in enumerate(('city', 'state', 'country')):
    # i=position binds the current index at definition time
    salary_df[column_name] = salary_df.loc_items.apply(
        lambda parts, i=position: parts[i]
    )

salary_df.head()

# dropping the raw location string and the scratch list column
salary_df = salary_df.drop(columns=['location', 'loc_items'])
salary_df.head(10)

# isolating US data for further exploration
us_df = salary_df[salary_df.country == 'US'].copy()

# narrowing the US data to Data Scientist titles
us_df = us_df[us_df.title == 'Data Scientist'].copy()
us_df

# exploring US / Data Scientist rows reported for Apple
apple_df = us_df[us_df.company == 'Apple'].copy()
apple_df

# number of reports per level label at Apple
apple_df.level.value_counts()

# Correcting ict5 to ICT5 and IC4 to ICT4 as they have similar
# basesalary/bonus. The replacement is limited to the 'level' column:
# DataFrame.replace would rewrite a matching string in any column.
apple_df['level'] = apple_df['level'].replace({'ict5': 'ICT5', 'IC4': 'ICT4'})



import plotly.express as px

# One total-compensation histogram per level of interest, shown in order.
for level_name in ('L3', 'ICT4'):
    level_rows = apple_df[apple_df['level'] == level_name]
    fig = px.histogram(level_rows, x="totalyearlycompensation")
    fig.show()

import scipy.stats as st
import numpy as np

# total yearly compensation of every ICT4 report, as a plain list
data = apple_df[apple_df['level']=='ICT4']['totalyearlycompensation'].to_list()

# 95% Student-t confidence interval around the sample mean.
# The confidence level is passed positionally: its keyword was renamed from
# `alpha` to `confidence` in newer SciPy releases (and `alpha` was removed),
# so the positional form works on both old and new versions.
st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))

# columns summarized for each level
level_metrics = [
    'totalyearlycompensation',
    'basesalary',
    'yearsofexperience',
    'yearsatcompany',
    'stockgrantvalue',
    'bonus',
]

# comparing means for insights on comparable levels
apple_levels_df = apple_df.groupby(['level']).agg(dict.fromkeys(level_metrics, 'mean'))
apple_levels_df

# comparing medians for insights on comparable levels
apple_levels_df = apple_df.groupby(['level']).agg(dict.fromkeys(level_metrics, 'median'))
apple_levels_df

# Folding alternate level spellings into the ICT labels, as they have
# similar basesalary/bonus:
#   4  -> ICT4      3  -> ICT3
#   L4 -> ICT4      L3 -> ICT3
# The replacement is limited to the 'level' column: DataFrame.replace would
# rewrite a cell equal to '3' or '4' in any column of the frame.
apple_df['level'] = apple_df['level'].replace(
    {'4': 'ICT4', '3': 'ICT3', 'L3': 'ICT3', 'L4': 'ICT4'}
)

# number of reports per level label at Apple after the merge
apple_df.level.value_counts()

# comparing medians for insights on comparable levels
median_columns = (
    'totalyearlycompensation',
    'basesalary',
    'yearsofexperience',
    'yearsatcompany',
    'stockgrantvalue',
    'bonus',
)
apple_levels_df = apple_df.groupby(['level']).agg(
    {column: 'median' for column in median_columns}
)
apple_levels_df

# Dropping levels that do not fit the comparison:
#   e3, E4 - their salary/stockgrantvalue/bonus are out of alignment with
#            the other levels
#   ICT6   - high base salary and high stock value indicate upper management
# NOTE(review): the original note also says M1 and M2 (software engineer
# managers) were removed, but no statement here drops them - confirm.
excluded_levels = ['e3', 'E4', 'ICT6']
rows_to_drop = apple_df.index[apple_df['level'].isin(excluded_levels)]
apple_df = apple_df.drop(index=rows_to_drop)

apple_df.level.value_counts()

# placing each level in our categories based on similar
# basesalary/yearsofexperience/yearsatcompany/stockgrantvalue/bonus
level_categories = {"ICT3": "Entry", "ICT4": "Mid", "ICT5": "Senior"}
for level_name, category in level_categories.items():
    apple_df.loc[apple_df.level == level_name, "Category"] = category

apple_df

#import dependency
import pymongo
from pymongo import MongoClient

# connect to the MongoDB server on localhost, default port
conn = "mongodb://127.0.0.1:27017/"
client = MongoClient(conn)

# database "ds_salaries", collection "apple"
db = client["ds_salaries"]
collection = db["apple"]

# one dict per DataFrame row, inserted as one document each
apple_df_dict = apple_df.to_dict(orient="records")
collection.insert_many(apple_df_dict)

# NOTE(review): this looks up a collection named "ds_salaries" inside the
# "ds_salaries" database, while the rows above went into "apple" - confirm
# this line is intended.
db["ds_salaries"]

# fetch a single stored document from the "apple" collection
db["apple"].find_one()