In [1]:
#Import dependencies
import pandas as pd
import requests

#API pull for latest salary info from www.levels.fyi
salaryData = requests.get('https://www.levels.fyi/js/salaryData.json').json()
salary_df = pd.DataFrame(salaryData)

#dropping columns that are not relevant to project
salary_df = salary_df.drop(['cityid', 'dmaid','rowNumber','otherdetails','tag', 'basesalary', 'stockgrantvalue', 'bonus', 'gender'], axis=1)

#converting to float to allow for summary stats
salary_df["totalyearlycompensation"] = pd.to_numeric(salary_df["totalyearlycompensation"])
salary_df["yearsofexperience"] = pd.to_numeric(salary_df["yearsofexperience"])
salary_df["yearsatcompany"] = pd.to_numeric(salary_df["yearsatcompany"])

#coverting timestamp from object to datetime
salary_df['timestamp'] =  pd.to_datetime(salary_df['timestamp'], infer_datetime_format=True)

# Create separate cols for city, state and country
def split_location(location):
    items = location.split(', ')
    city = items[0]
    state = items[1]
    
    if len(items)==2:
        country = 'US'
    elif len(items)==3:
        country = items[2].strip()
    elif len(items)==4:
        country = ', '.join([i.strip() for i in items[2:]])
    else:
        country = None
        print(location)
        
    return [city, state, country]

salary_df['loc_items'] = salary_df.location.apply(lambda x: split_location(x))
salary_df['city'] = salary_df.loc_items.apply(lambda x: x[0])
salary_df['state'] = salary_df.loc_items.apply(lambda x: x[1])
salary_df['country'] = salary_df.loc_items.apply(lambda x: x[2])

# dropping location column  
salary_df = salary_df.drop(['location','loc_items'], axis=1)

#isolating US data for further exploration
us_df = salary_df[salary_df.country=='US'].copy()

#isolating us data to data scientist titles
us_df = us_df[us_df.title=='Data Scientist'].copy()


#merging dataframes into on collection
#cleaned_data = pd.concat([apple_df, amazon_df, fb_df, google_df, micro_df ])

#import dependency
#import pymongo
#from pymongo import MongoClient
#establish connection to pymongo
#conn ="mongodb://127.0.0.1:27017/"
#client = MongoClient(conn)
#db = client.ds_salaries
#collection = db.top5
#cleaned_dict = cleaned_data.to_dict("records")
#collection.insert_many(cleaned_dict)

In [2]:
us_df


Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,city,state,country
745,2018-06-05 14:06:30,LinkedIn,Senior,Data Scientist,233.0,4.0,0.0,San Francisco,CA,US
772,2018-06-08 00:29:47,Amazon,L4,Data Scientist,140.0,2.0,2.0,Seattle,WA,US
776,2018-06-08 09:49:25,Microsoft,64,Data Scientist,218.0,11.0,11.0,Seattle,WA,US
782,2018-06-08 17:55:09,ebay,26,Data Scientist,180.0,10.0,5.0,San Jose,CA,US
796,2018-06-10 19:39:35,Twitter,Staff,Data Scientist,500.0,4.0,4.0,San Francisco,CA,US
...,...,...,...,...,...,...,...,...,...,...
62569,2021-08-16 16:17:19,IBM,L5,Data Scientist,145.0,6.0,5.0,New City,NY,US
62578,2021-08-16 17:08:58,Booz Allen Hamilton,Senior Consultant,Data Scientist,110.0,0.0,0.0,West McLean,VA,US
62600,2021-08-16 21:02:37,Xandr,L1,Data Scientist,120.0,1.0,0.0,Portland,OR,US
62610,2021-08-16 22:19:48,Facebook,L4,Data Scientist,233.0,2.0,2.0,Menlo Park,CA,US


In [3]:
us_df['city'].value_counts()

San Francisco       373
Seattle             311
New York            245
Redmond             106
Menlo Park           94
                   ... 
Emeryville            1
Campbell              1
Plymouth Meeting      1
Fort Worth            1
Watertown             1
Name: city, Length: 160, dtype: int64

In [4]:
origins_df=us_df[['city', 'state']]
origins_df

Unnamed: 0,city,state
745,San Francisco,CA
772,Seattle,WA
776,Seattle,WA
782,San Jose,CA
796,San Francisco,CA
...,...,...
62569,New City,NY
62578,West McLean,VA
62600,Portland,OR
62610,Menlo Park,CA


In [5]:
origins_df['city'] = origins_df.city.apply(lambda x: x.strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [6]:
clean_cities = origins_df.city.unique()
len(clean_cities)

160

In [7]:
len([group for idx,group in origins])

NameError: name 'origins' is not defined

In [8]:
for idx, group in origins:
    if group.city.iloc[0] == 'Seattle':
        print(group)
        print('___________')

NameError: name 'origins' is not defined

In [9]:
origins=origins_df.groupby('city')

In [10]:
origins_df['city_state'] = origins_df.apply(lambda row: row.city + '%20C' + row.state, axis=1)
new_origins = origins_df.groupby('city_state')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
city_states=origins_df['city_state'].unique()

In [12]:
len(city_states)

166

In [14]:
from api_keys import g_key
import gmaps
import requests
base_url = "https://maps.googleapis.com/maps/api/distancematrix/json?"
distances=[]
for i in range(len(city_states)):
    origin_city=city_states[i]

    destination_city="New York%20CNY"
    
    
    try:
    
        url=base_url+"origins="+origin_city+"&destinations="+destination_city+"%20C"+"&key="+g_key

        payload={}
        headers = {}

        response = requests.request("GET", url, headers=headers, data=payload)

        x=response.json()

        y=x["rows"][0]["elements"][0]["distance"]["value"]
        print(y)

        distances.append(y)
    
    except:
        distances.append('None')
        print(city_states[i])

    
    


4680488
4606061
4733129
2566387
4586918
4741100
4731642
10007
4727625
38727
4470171
381733
340801
4523564
4584771
4733749
4747337
4732087
1938245
2809697
4519772
1015931
2496062
3215226
815229
1401271
665996
2085043
4489857
4493846
1276254
2871490
4698504
4732030
2497842
866643
2867951
4663595
4665078
810167
2889247
549822
611951
796298
395724
202657
2626661
4719849
2865975
4653571
1033105
3491643
4709830
3883807
4693331
1929914
191140
Tel Aviv%20CIsrael
4718202
11544
182242
4708972
2546600
1825894
587930
1971552
991461
75904
1544995
3999966
1926081
789435
4689019
375887
4674587
1914459
4712832
2780473
162438
2537781
57517
3881496
4696866
396380
4666239
4796489
1317261
183239
1621014
378304
2076857
1737306
3953203
1785882
2925076
664942
749359
4735874
4676608
973753
3975893
1727766
76782
63943
328199
1432926
1925966
113590
312385
1147317
2935879
4542341
1192694
1446177
1789851
3527347
328125
4680488
408933
165349
4685661
984761
1314248
64772
1229841
2862155
4491238
1723480
1377439
7969

In [15]:
len(distances)

166

In [16]:
city_distance=pd.DataFrame(data={
    "city": city_states,
    "distance from NY": distances
})

In [17]:
city_distance.to_csv("distances.csv")

In [None]:
len([group for idx,group in new_origins])

In [None]:
len(cities)

In [None]:
x=origins_df['city'].value_counts()
type(x)

In [None]:
from api_keys import g_key
import gmaps
import requests

In [None]:
base_url = "https://maps.googleapis.com/maps/api/distancematrix/json?"

In [None]:
origin_city="Chicago"
origin_state= "IL"

destination_city="New York"
destination_state="NY"

url=base_url+"origins="+origin_city+"%20C"+origin_state+"&destinations="+destination_city+"%20C"+destination_state+"&key="+g_key

In [None]:
url

In [None]:
payload={}
headers = {}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)


In [None]:
x=response.json()

In [None]:
x["rows"][0]["elements"][0]["distance"]["value"]