In [1]:
from pymongo import MongoClient
import requests
import pandas as pd
import os
from dotenv import load_dotenv
import json
import folium
import src.cleaning_functions as fn
import src.google_api_functions as gf
import src.mongo_query_functions as mf
load_dotenv()

True

In [2]:

dbName = "companies"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL, connectTimeoutMS=10000,serverSelectionTimeoutMS=10000)
db = client.get_database()

mongodb://localhost/companies


# Filtering for offices in England

In [3]:
#Find the all the different country codes to choose the one for england
query={'offices': { '$exists': 'true', '$not': {'$size': 0} }}
web_offices = list(db.companies.find(query,{"offices":1,"name":1}))

countries=set()
for i,e in enumerate(web_offices):
    countries.add(e['offices'][0]['country_code'])
#countries #GBR is the code for Great Britain

In [4]:
#Start with filtering only for companies with offices in England
query={'offices':{'$elemMatch':{'country_code':'GBR'}}}
project = {"name":1, "offices":1,'total_money_raised':1,'founded_year':1, 'category_code':1}
gb_offices = list(db.companies.find(query,project))

In [5]:
office_df = pd.DataFrame(gb_offices).explode('offices')

In [6]:
print(office_df.shape)
office_df.head(3)

(2574, 6)


Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'..."
1,52cdef7c4bab8bd675297da6,SpinVox,messaging,,$106M,"{'description': 'Corporate Headquarters', 'add..."
2,52cdef7c4bab8bd675297da8,OpenX,advertising,2008.0,$75.5M,"{'description': 'Headquarters', 'address1': '8..."


In [7]:
#Extract info from column offices with the extract_loc function
offices = office_df.apply(fn.extract_loc,axis=1, result_type="expand")
offices.head(3)

Unnamed: 0,0,1,2
0,GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
1,GBR,Buckinghamshire,
2,USA,Pasadena,"{'type': 'Point', 'coordinates': [-118.1327468..."


In [8]:
#Append it to office_df
offices_loc = pd.concat([office_df, offices], axis=1)
offices_loc.columns = ['__id', 'name', 'category_code','founded_year',  'raised', 'offices','country', 'city', 'location']

#Only offices in england
offices_loc = offices_loc[offices_loc.country=='GBR']

#Only offices with a valid lat long
offices_loc = offices_loc[~offices_loc.location.isnull()]

#Explore the result
print(offices_loc.shape)
print(offices_loc.dtypes)
offices_loc.head(3)

(611, 9)
__id              object
name              object
category_code     object
founded_year     float64
raised            object
offices           object
country           object
city              object
location          object
dtype: object


Unnamed: 0,__id,name,category_code,founded_year,raised,offices,country,city,location
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'...",GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
4,52cdef7c4bab8bd675297dcc,AllPeers,web,2004.0,$0,"{'description': None, 'address1': None, 'addre...",GBR,Oxford,"{'type': 'Point', 'coordinates': [-1.255824, 5..."
5,52cdef7c4bab8bd675297de9,Zamzar,web,2006.0,$0,"{'description': '', 'address1': '', 'address2'...",GBR,Southampton,"{'type': 'Point', 'coordinates': [-1.3610845, ..."


In [9]:
#Formatting the raised money column with the clean_raised function
offices_loc['raised'] = offices_loc.raised.apply(fn.clean_raised)

In [10]:
offices_loc=offices_loc[['name', 'founded_year', 'category_code', 'raised', 'city', 'location']].reset_index()
offices_loc['location'][4]

{'type': 'Point', 'coordinates': [-2.250166, 53.473606]}

In [82]:
#Export to json:
offices_loc.to_json("OUTPUT/offices_loc.json",orient="records")

In [83]:
#Incorporating new collection to database and create geoloc index
!mongoimport --db companies --collection offices_loc_england --drop --jsonArray OUTPUT/offices_loc.json


2020-04-19T10:05:58.712+0200	connected to: mongodb://localhost/
2020-04-19T10:05:58.713+0200	dropping: companies.offices_loc_england
2020-04-19T10:05:59.145+0200	611 document(s) imported successfully. 0 document(s) failed to import.


In [11]:
db.list_collection_names()
#db.offices_loc_england.create_index({'location':'2dsphere'}) #This function does not work for me...

['offices_eng_pro_air',
 'companies',
 'offices_loc_england',
 'office_near_pro',
 'starbucks']

In [22]:
#Make sure that I assigned a correct GeoJson index in Mongo compass
db.offices_loc_england.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.offices_loc_england'},
 'location_2dsphere': {'v': 2,
  'key': [('location', '2dsphere')],
  'ns': 'companies.offices_loc_england',
  'background': False,
  '2dsphereIndexVersion': 3}}

## Filtering to accomodate developers: top notch companies (raised 1 mill dollars) nearby

In [27]:
offices = list(db.offices_loc_england.find({}))

In [28]:
len(offices)

611

In [29]:
#Check wich categories are available to filter out those I am not interested in.
p = list(db.companies.find({},{'category_code':1}))
category = set()
for e in p:
    category.add(e['category_code'])
#category 

In [30]:
new_offices = mf.near_filter(db.offices_loc_england, offices,mf.near_success_offices, 'pro_nearby')
len(new_offices)

404

In [31]:
new_offices_df = pd.DataFrame(new_offices)
new_offices_df=new_offices_df.drop('_id', axis=1)
new_offices_df.head(3)

Unnamed: 0,index,name,founded_year,category_code,raised,city,location,pro_nearby
0,0,Babelgum,2007.0,games_video,13.2,London,"{'type': 'Point', 'coordinates': [-6.267494, 5...",1
1,3,KickApps,2004.0,enterprise,32.0,London,"{'type': 'Point', 'coordinates': [-0.1356235, ...",36
2,5,ConnectMeAnywhere,2006.0,web,0.0,London W11 2HX,"{'type': 'Point', 'coordinates': [-0.205349, 5...",11


In [103]:
new_offices_df.to_json("OUTPUT/office_near_pro.json",orient="records")

In [104]:
!mongoimport --db companies --collection office_near_pro --drop --jsonArray OUTPUT/office_near_pro.json

2020-04-19T10:07:57.902+0200	connected to: mongodb://localhost/
2020-04-19T10:07:57.903+0200	dropping: companies.office_near_pro
2020-04-19T10:07:58.357+0200	404 document(s) imported successfully. 0 document(s) failed to import.


## Second filter: travelling for executives


In [32]:
db.office_near_pro.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.office_near_pro'},
 'location_2dsphere': {'v': 2,
  'key': [('location', '2dsphere')],
  'ns': 'companies.office_near_pro',
  'background': False,
  '2dsphereIndexVersion': 3}}

In [11]:
#Get apiKey for google:
apiKey = os.getenv("GOOGLE_API2")

In [118]:
##Offices that are within 20 kilometers from an international airport
filtered_lst=[]
for of in new_offices:
    n_airport = len(gf.get_google(gf.get_latlong(of), radius=20000, typ ='airport',apiKey=apyKey, keyw='international'))
    if n_airport > 0:
        of['airp_nearby'] = n_airport 
        filtered_lst.append(of)

In [34]:
#Luckily most of our candidates are in London and Manchester, where travelling is easy
len(filtered_lst)

378

new_offices_df = pd.DataFrame(filtered_lst)
new_offices_df=new_offices_df.drop('_id', axis=1)
new_offices_df.head()

new_offices_df.to_json("OUTPUT/offices_pro_air.json",orient="records")

!mongoimport --db companies --collection offices_eng_pro_air --jsonArray OUTPUT/offices_loc.json


In [33]:
with open('OUTPUT/offices_pro_air.json', 'r') as f:
        filtered = json.load(f)
filtered_lst=list(filtered)
len(filtered_lst)

378

## Kindergartens and nurseries and primary schools

In [21]:
##Offices that are within 1 kilometers from a nursery or kindergarten. 
#This time we will be a bit more demanding and make the name include one of a series of words:
kinder_lst=[]
keys = ['kindergarten', 'nursery', 'primary']

for of in filtered_lst:
    kinder = gf.get_google(gf.get_latlong(of), radius=1000, typ ='kindergarten', keyw='kindergarten', apiKey=apiKey)
    count=0
    for e in kinder:
        if any(i in e['name'].lower() for i in keys):
            count+=1
    if count > 0:
        of['kinder'] = count
        kinder_lst.append(of)

In [23]:
kinder_lst

[{'index': 1,
  'name': 'AllPeers',
  'founded_year': 2004.0,
  'category_code': 'web',
  'raised': 0.0,
  'city': 'Oxford',
  'location': {'type': 'Point', 'coordinates': [-1.255824, 51.752276]},
  'kinder': 2},
 {'index': 4,
  'name': 'Google',
  'founded_year': 1998.0,
  'category_code': 'search',
  'raised': 555.0,
  'city': 'Manchester',
  'location': {'type': 'Point', 'coordinates': [-2.250166, 53.473606]},
  'kinder': 2},
 {'index': 5,
  'name': 'ConnectMeAnywhere',
  'founded_year': 2006.0,
  'category_code': 'web',
  'raised': 0.0,
  'city': 'London W11 2HX',
  'location': {'type': 'Point', 'coordinates': [-0.205349, 51.514935]},
  'kinder': 3},
 {'index': 7,
  'name': 'PowerReviews',
  'founded_year': None,
  'category_code': 'social',
  'raised': 40.1,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.104408, 51.527099]},
  'kinder': 3},
 {'index': 11,
  'name': 'Covestor',
  'founded_year': 2005.0,
  'category_code': 'finance',
  'raised': 23.9,
  'city

In [156]:
#For those with kids a bit older, let's make sure that they also have a primary school nearby

school_lst=[]
keys = ['school', 'primary']

for of in kinder_lst:
    school = gf.get_google(gf.get_latlong(of), radius=1000, typ ='kindergarten', keyw='kindergarten')
    count=0
    for e in school:
        if any(i in e['name'].lower() for i in keys):
            count+=1
    if count > 0:
        of['school'] = count
        school_lst.append(of)

In [157]:
len(school_lst)

152

## Starbucks


In [162]:
#The API from starbucks does not allow any email domain to register, 
# therefore, I will use actual store lat long from a kaggle dataset

starbuck = pd.read_csv('INPUT/starbucks_stores.csv')
#filtering to stores in england
starbuck = starbuck[starbuck.Country=='GB']

In [194]:
starbuck['location'] = starbuck.apply(mf.create_geojson, axis=1)

In [195]:
starbucks = starbuck[['Store Name', 'City', 'location']]
starbucks.to_json("OUTPUT/starbucks.json",orient="records")

In [196]:
!mongoimport --db companies --collection starbucks --drop --jsonArray OUTPUT/starbucks.json

2020-04-19T12:57:15.940+0200	connected to: mongodb://localhost/
2020-04-19T12:57:15.946+0200	dropping: companies.starbucks
2020-04-19T12:57:16.463+0200	901 document(s) imported successfully. 0 document(s) failed to import.


In [251]:
starbucks_lst=[]
for of in school_lst:
    n_strbk = mf.near_starbuck(of['location'])
    if n_strbk > 0:
        of['starbucks'] = n_strbk 
        starbucks_lst.append(of)

  This is separate from the ipykernel package so we can avoid doing imports until


In [252]:
len(starbucks_lst)

19

## The vegan CEO

In [253]:
ceo_lst = []
for of in starbucks_lst:
    n = len(gf.get_google(gf.get_latlong(of), radius=200, typ='restaurant', keyw='vegan'))
    if n > 0:
        of['starbucks'] = n 
        ceo_lst.append(of)

In [4]:
len(ceo_lst)

NameError: name 'ceo_lst' is not defined

## Friday disco party


In [243]:
final = []
for of in ceo_lst:
    n = len(gf.get_google(gf.get_latlong(of), radius=500, typ='bar', keyw='night-club'))
    if n > 0:
        of['disco'] = n 
        final.append(of)

In [3]:
len(final)

NameError: name 'final' is not defined

In [245]:
final

[{'_id': ObjectId('5e9c0666c0585134d647389f'),
  'index': 25,
  'name': 'blinkbox',
  'founded_year': 2006.0,
  'category_code': 'games_video',
  'raised': 0.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.10759, 51.520779]},
  'pro_nearby': 34,
  'airp_nearby': 6,
  'kinder': 2,
  'school': 2,
  'starbucks': 2,
  'disco': 1},
 {'_id': ObjectId('5e9c0666c0585134d64738b5'),
  'index': 48,
  'name': 'Tipped',
  'founded_year': 2007.0,
  'category_code': 'web',
  'raised': 0.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.0778345, 51.5211159]},
  'pro_nearby': 31,
  'airp_nearby': 6,
  'kinder': 1,
  'school': 3,
  'starbucks': 11,
  'disco': 11},
 {'_id': ObjectId('5e9c0666c0585134d64738c1'),
  'index': 59,
  'name': 'Playfish',
  'founded_year': 2007.0,
  'category_code': 'games_video',
  'raised': 21.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.1984802, 51.4991094]},
  'pro_nearby': 12,
  'airp_nearby': 8,

In [315]:
final_df = pd.DataFrame(final)
final_df=final_df.drop('_id', axis=1)
final_df = final_df.sort_values(by=['pro_nearby', 
                                    'airp_nearby', 'kinder',
                                    'school', 'starbucks','disco'], ascending=False).reset_index()
final_df.head()


Unnamed: 0,level_0,index,name,founded_year,category_code,raised,city,location,pro_nearby,airp_nearby,kinder,school,starbucks,disco
0,6,215,Big Property Ladder,1995.0,other,0.0,London.,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,1,4,20
1,8,298,litl,2007.0,hardware,0.0,London,"{'type': 'Point', 'coordinates': [-0.139244, 5...",37,7,1,1,4,20
2,9,304,Box UK,1998.0,software,0.0,London,"{'type': 'Point', 'coordinates': [-0.1392447, ...",37,7,1,1,4,20
3,10,384,Quick TV,2007.0,enterprise,3.613,London,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,1,4,20
4,13,517,Eglue Business Technologies,2001.0,software,27.0,London,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,1,4,20


In [304]:
final_df.to_json("OUTPUT/final.json",orient="records")

In [None]:
#!mongoimport --db companies --collection offices_eng_pro_air --jsonArray OUTPUT/offices_loc.json


## Visualizing

In [317]:
final_df.location[0]['coordinates']

[-0.1392765, 51.5115309]

In [318]:
m = folium.Map(location=[51.520779, -0.10759],
               zoom_start=13,
              tiles='OpenStreet Map'#'Stamen Terrain'
              )

tooltip = 'Click me!'

for i,e in enumerate(final_df['location']):
    long= e['coordinates'][-1]
    lat=e['coordinates'][0]
    if i<5:
        color='green'
    elif i<10: color='blue'
    else: color='red'
    folium.Marker([long, lat], 
                  popup=f'<i>#{i}</i>', 
                  tooltip=tooltip, 
                  icon=folium.Icon(color=color)).add_to(m)

folium.CircleMarker(
    location=[51.5115309, -0.1392765],
    radius=50,
    popup='Optimal area',
    color='#3186cc',
    fill=True,
    fill_color='#3186cc'
).add_to(m)

#for i,e in enumerate(final_df['location']):
#    i = folium.GeoJson(
#        data=e,
#        tooltip=tooltip)
#
#    i.add_child(folium.Popup('outline Popup on GeoJSON'))
#    i.add_to(m)


m.add_child(folium.LatLngPopup())


In [305]:
help(folium.Icon)

Help on class Icon in module folium.map:

class Icon(branca.element.MacroElement)
 |  Creates an Icon object that will be rendered
 |  using Leaflet.awesome-markers.
 |  
 |  Parameters
 |  ----------
 |  color : str, default 'blue'
 |      The color of the marker. You can use:
 |  
 |          ['red', 'blue', 'green', 'purple', 'orange', 'darkred',
 |           'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
 |           'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen',
 |           'gray', 'black', 'lightgray']
 |  
 |  icon_color : str, default 'white'
 |      The color of the drawing on the marker. You can use colors above,
 |      or an html color code.
 |  icon : str, default 'info-sign'
 |      The name of the marker sign.
 |      See Font-Awesome website to choose yours.
 |      the `prefix` as well.
 |  angle : int, default 0
 |      The icon will be rotated by this amount of degrees.
 |  prefix : str, default 'glyphicon'
 |      The prefix states the source

In [324]:
res=requests.get('https://api.meetup.com/find/topics?query=design')

In [325]:
res.json()

[{'id': 61781,
  'name': 'UX Design',
  'urlkey': 'ux-design',
  'group_count': 3105,
  'member_count': 2068031,
  'description': "Find out what's happening in UX Design Meetup groups around the world and start meeting up with the ones near you.",
  'lang': 'en_US'},
 {'id': 19225,
  'name': 'Game Design',
  'urlkey': 'game-design',
  'group_count': 1024,
  'member_count': 509189,
  'description': "Find out what's happening in Game Design Meetup groups around the world and start meeting up with the ones near you.",
  'lang': 'en_US'},
 {'id': 21437,
  'name': 'Fashion Design',
  'urlkey': 'fashion-design',
  'group_count': 598,
  'member_count': 376737,
  'description': "Find out what's happening in Fashion Design Meetup groups around the world and start meeting up with the ones near you.",
  'lang': 'en_US'},
 {'id': 18993,
  'name': 'Pool Design/Water Features Design and Build',
  'urlkey': 'pool-design-water-features-design-and-build',
  'group_count': 0,
  'member_count': 0,
  'des

In [1]:
#To Save a session:

import dill
#dill.dump_session('mongo_project.db')
#To restore a session
dill.load_session('mongo_project.db')


EOFError: Ran out of input

- 20 Designers
- 5 UI/UX Engineers
- ##10 Frontend Developers
- 15 Data Engineers
- 5 Backend Developers
- ##20 Account Managers
- 1 Maintenance guy that loves basketball
- 10 Executives
- 1 CEO/President

As a data engineer you have asked all the employees to show their preferences on where to place the new office. Your goal is to place the **new company offices** in the best place for the company to grow. You have to found a place that more or less covers all the following requirements. Note that **it's impossible to cover all requirements**, so you have to prioritize at your glance.

- Designers like to go to design talks and share knowledge. There must be some nearby companies that also do design.
- ###30% of the company have at least 1 child.
- ###Developers like to be near successful tech startups that have raised at least 1 Million dollars.
- ###Executives like Starbucks A LOT. Ensure there's a starbucks not to far.
- ###Account managers need to travel a lot
- All people in the company have between 25 and 40 years, give them some place to go to party.
- ###Nobody in the company likes to have companies with more than 10 years in a radius of 2 KM.
- ###The CEO is Vegan