In [43]:
from pymongo import MongoClient
import requests
import pandas as pd
import os
from dotenv import load_dotenv
import json
import folium
import src.cleaning_functions as fn
import src.google_api_functions as gf
import src.mongo_query_functions as mf
load_dotenv()

True

In [44]:

dbName = "companies"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL, connectTimeoutMS=10000,serverSelectionTimeoutMS=10000)
db = client.get_database()

mongodb://localhost/companies


# Filtering for offices in England

In [45]:
#Find the all the different country codes to choose the one for england
query={'offices': { '$exists': 'true', '$not': {'$size': 0} }}
web_offices = list(db.companies.find(query,{"offices":1,"name":1}))

countries=set()
for i,e in enumerate(web_offices):
    countries.add(e['offices'][0]['country_code'])
#countries #GBR is the code for Great Britain

In [46]:
#Start with filtering only for companies with offices in England
query={'offices':{'$elemMatch':{'country_code':'GBR'}}}
project = {"name":1, "offices":1,'total_money_raised':1,'founded_year':1, 'category_code':1}
gb_offices = list(db.companies.find(query,project))

In [47]:
office_df = pd.DataFrame(gb_offices).explode('offices')

In [48]:
print(office_df.shape)
office_df.head(3)

(2574, 6)


Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'..."
1,52cdef7c4bab8bd675297da6,SpinVox,messaging,,$106M,"{'description': 'Corporate Headquarters', 'add..."
2,52cdef7c4bab8bd675297da8,OpenX,advertising,2008.0,$75.5M,"{'description': 'Headquarters', 'address1': '8..."


In [49]:
#Extract get geoJson format from column offices with the extract_loc function
offices = office_df.apply(fn.extract_loc,axis=1, result_type="expand")
offices.head(3)

Unnamed: 0,0,1,2
0,GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
1,GBR,Buckinghamshire,
2,USA,Pasadena,"{'type': 'Point', 'coordinates': [-118.1327468..."


In [50]:
#Append it to office_df
offices_loc = pd.concat([office_df, offices], axis=1)
offices_loc.columns = ['__id', 'name', 'category_code','founded_year',  'raised', 'offices','country', 'city', 'location']

#Only offices in england
offices_loc = offices_loc[offices_loc.country=='GBR']

#Only offices with a valid lat long
offices_loc = offices_loc[~offices_loc.location.isnull()]

#Explore the result
print(offices_loc.shape)
print(offices_loc.dtypes)
offices_loc.head(3)

(611, 9)
__id              object
name              object
category_code     object
founded_year     float64
raised            object
offices           object
country           object
city              object
location          object
dtype: object


Unnamed: 0,__id,name,category_code,founded_year,raised,offices,country,city,location
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'...",GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
4,52cdef7c4bab8bd675297dcc,AllPeers,web,2004.0,$0,"{'description': None, 'address1': None, 'addre...",GBR,Oxford,"{'type': 'Point', 'coordinates': [-1.255824, 5..."
5,52cdef7c4bab8bd675297de9,Zamzar,web,2006.0,$0,"{'description': '', 'address1': '', 'address2'...",GBR,Southampton,"{'type': 'Point', 'coordinates': [-1.3610845, ..."


In [51]:
#Formatting the raised money column with the clean_raised function
offices_loc['raised'] = offices_loc.raised.apply(fn.clean_raised)

In [52]:
offices_loc=offices_loc[['name', 'founded_year', 'category_code', 'raised', 'city', 'location']].reset_index()
offices_loc['location'][4]

{'type': 'Point', 'coordinates': [-2.250166, 53.473606]}

In [53]:
#Export to json:
offices_loc.to_json("OUTPUT/offices_loc.json",orient="records")

In [54]:
#Incorporating new collection to database and create geoloc index
!mongoimport --db companies --collection offices_loc_england --drop --jsonArray OUTPUT/offices_loc.json


2020-04-20T19:19:56.214+0200	connected to: mongodb://localhost/
2020-04-20T19:19:56.215+0200	dropping: companies.offices_loc_england
2020-04-20T19:19:56.662+0200	611 document(s) imported successfully. 0 document(s) failed to import.


In [55]:
db.list_collection_names()
#Create a GeoJson index for this collection
db.offices_loc_england.create_index([("location", '2dsphere')])

'location_2dsphere'

In [56]:
#Make sure that I assigned a correct GeoJson index in Mongo compass
db.offices_loc_england.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.offices_loc_england'},
 'location_2dsphere': {'v': 2,
  'key': [('location', '2dsphere')],
  'ns': 'companies.offices_loc_england',
  '2dsphereIndexVersion': 3}}

## Filtering to accomodate developers: top notch companies (raised 1 mill dollars) nearby

In [57]:
offices = list(db.offices_loc_england.find({}))

In [58]:
len(offices)

611

In [59]:
#Check wich categories are available to filter out those I am not interested in.
p = list(db.companies.find({},{'category_code':1}))
category = set()
for e in p:
    category.add(e['category_code'])
#category 

In [60]:
#Filter for offices with tech companies that have raised more than 1 mill within 5km range
new_offices = mf.near_filter(db.offices_loc_england, offices,mf.near_success_offices, 'pro_nearby')
len(new_offices)

404

## Second filter: travelling for executives


In [61]:
#Get apiKey for google:
apiKey = os.getenv("GOOGLE_API2")

In [62]:
##Offices that are within 20 kilometers from an international airport
filtered_lst=[]
for of in new_offices:
    n_airport = len(gf.get_google(gf.get_latlong(of), radius=20000, typ ='airport',apiKey=apiKey, keyw='international'))
    if n_airport > 0:
        of['airp_nearby'] = n_airport 
        filtered_lst.append(of)

In [63]:
#Luckily most of our candidates are in London and Manchester, where travelling is easy
len(filtered_lst)

379

## Kindergartens and nurseries and primary schools

In [91]:
##Offices that are within 1 kilometers from a nursery or kindergarten. 
#This time we will be a bit more demanding and make the name include one of a series of words:
kinder_lst = gf.school_kinder_filter(filtered_lst, ['kindergarten', 'nursery', 'primary'],'kinder', 
                                  radius=1000, typ='kindergarten', keyw='kindergarten', apiKey=apiKey)

In [92]:
len(kinder_lst)

201

In [95]:
#For those with kids a bit older, let's make sure that they also have a primary school nearby
school_lst = gf.school_kinder_filter(kinder_lst, ['school', 'primary'],'school', 
                                  radius=1000, typ='school', keyw='primary', apiKey=apiKey)

In [96]:
#It looks that kindergartens are way more limiting than primary schools
len(school_lst)

201

## Starbucks


In [97]:
#The API from starbucks does not allow any email domain to register, 
# therefore, I will use actual store lat long from a kaggle dataset

starbuck = pd.read_csv('INPUT/starbucks_stores.csv')
#filtering to stores in england
starbuck = starbuck[starbuck.Country=='GB']

In [98]:
#Using the create_geojson function, I create a location geoJson in this df
starbuck['location'] = starbuck.apply(mf.create_geojson, axis=1)

In [99]:
#Drop the useless columns and export to a json
starbucks = starbuck[['Store Name', 'City', 'location']]
starbucks.to_json("OUTPUT/starbucks.json",orient="records")

In [100]:
!mongoimport --db companies --collection starbucks --drop --jsonArray OUTPUT/starbucks.json

2020-04-20T19:55:10.997+0200	connected to: mongodb://localhost/
2020-04-20T19:55:10.998+0200	dropping: companies.starbucks
2020-04-20T19:55:11.455+0200	901 document(s) imported successfully. 0 document(s) failed to import.


In [101]:
#CReate geoJson index
db.starbucks.create_index([("location", '2dsphere')])

'location_2dsphere'

In [102]:
db.starbucks.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.starbucks'},
 'location_2dsphere': {'v': 2,
  'key': [('location', '2dsphere')],
  'ns': 'companies.starbucks',
  '2dsphereIndexVersion': 3}}

In [119]:
#Filter the offices to those that within 200m from an starbucks

starbucks_lst=[]
for of in school_lst:
    n_strbk = mf.near_starbuck(db.starbucks, of['location'], dist=200)
    if n_strbk > 0:
        of['starbucks'] = n_strbk 
        starbucks_lst.append(of)

In [120]:
len(starbucks_lst)

23

## The vegan CEO

In [121]:
#Make sure the CEO can get her vegan lunch
ceo_lst = []
for of in starbucks_lst:
    n = len(gf.get_google(gf.get_latlong(of), radius=200, typ='restaurant', keyw='vegan', apiKey=apiKey))
    if n > 0:
        of['vegan'] = n 
        ceo_lst.append(of)

In [122]:
len(ceo_lst)

22

## Friday disco party


In [123]:
#Places with night life within 300m
final = []
for of in ceo_lst:
    n = len(gf.get_google(gf.get_latlong(of), radius=200, typ='bar', keyw='night-club', apiKey=apiKey))
    if n > 0:
        of['disco'] = n 
        final.append(of)

In [124]:
len(final)

18

**We are left with a total of 15 candidates for the location of our new office**

In [169]:
#Now we can sort the values according our findings following the priorities
final_df = pd.DataFrame(final)
final_df=final_df.drop('_id', axis=1)
final_df = final_df.sort_values(by=['pro_nearby', 
                                    'airp_nearby', 'kinder',
                                    'school', 'starbucks','vegan','disco'], ascending=False).reset_index()
final_df.head()


Unnamed: 0,level_0,index,name,founded_year,category_code,raised,city,location,pro_nearby,airp_nearby,kinder,school,starbucks,disco,vegan
0,6,397,Big Property Ladder,1995.0,other,0.0,London.,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,6,8,10,5
1,9,785,Quick TV,2007.0,enterprise,3.613,London,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,6,8,10,5
2,14,1070,Eglue Business Technologies,2001.0,software,27.0,London,"{'type': 'Point', 'coordinates': [-0.1392765, ...",37,7,1,6,8,10,5
3,8,590,Box UK,1998.0,software,0.0,London,"{'type': 'Point', 'coordinates': [-0.1392447, ...",37,7,1,6,8,9,5
4,0,27,Seedcamp,2007.0,finance,5.45,London,"{'type': 'Point', 'coordinates': [-0.1418973, ...",37,7,1,5,8,7,4


In [110]:
m = folium.Map(location=[53.920779, -1.70759],
               zoom_start=5,
              tiles='OpenStreet Map'#'Stamen Terrain'
              )


tooltip = 'City'

for e in final_df.iterrows():
    html = f"""
    <strong> City : #{e[1][6]}</strong>
    """
    long= e[1][7]['coordinates'][-1]
    lat=e[1][7]['coordinates'][0]
    folium.Marker([long, lat], 
                  popup=html, 
                  tooltip=tooltip, 
                  icon=folium.Icon(icon='home',color='blue')).add_to(m)


m.add_child(folium.LatLngPopup())

In [111]:
final_df.to_json("OUTPUT/final.json",orient="records")

## Scoring

In [None]:
#All the 15 locations pass all the filters, however, variability is important, and therefore, 
# those that have only one of one class will be penalized. 

In [171]:
final_df['score'] = final_df.apply(fn.score, axis=1)

In [172]:
final_df=final_df.drop('level_0', axis=1)
final_df= final_df.sort_values(by='score', ascending=False).reset_index()
final_df['location'][0]

{'type': 'Point', 'coordinates': [-0.1597092, 51.519772]}

In [146]:
max_score=final_df['score'].max()
mean_score=final_df['score'].mean()
print(max_score, mean_score)

13.410999999999998 7.852666666666668


In [141]:
#Using geocode api to get the actual address of the top score
res = requests.get("https://geocode.xyz//51.5115309,-0.1392765?geoit=xml",params={"json":1})


In [142]:
address = res.json()
print('The selected location is in: ',address['staddress'], address['city'], address['country'], address['postal'])

The selected location is in:  Regent Street London United Kingdom W1B4DA


## Visualizing

In [173]:
m = folium.Map(location=[51.5115309, -0.1392765],
               zoom_start=13,
              tiles='OpenStreet Map'#'Stamen Terrain'
              )


tooltip = 'Click for info'

for e in final_df.iterrows():
    html = f"""
    <strong> Score : #{e[1][15]}</strong>
    <p>Pro_companies:{e[1][8]}</p>
    <p>airports:{e[1][9]}</p>
    <p>kinder:{e[1][10]}</p>
    <p>schools:{e[1][11]}</p>
    <p>starbucks:{e[1][12]}</p>
    <p>vegan:{e[1][13]}</p>
    <p>discos:{e[1][14]}</p>
    """
    long= e[1][7]['coordinates'][-1]
    lat=e[1][7]['coordinates'][0]
    if e[1][15]==max_score:
        color='green'
    elif e[1][15]>mean_score: color='blue'
    else: color='red'
    folium.Marker([long, lat], 
                  popup=html, 
                  tooltip=tooltip, 
                  icon=folium.Icon(icon='home',color=color)).add_to(m)

folium.CircleMarker(
    location=[51.519772, -0.1597092],
    radius=50,
    popup='Optimal area',
    color='#3186cc',
    fill=True,
    fill_color='#3186cc'
).add_to(m)


m.add_child(folium.LatLngPopup())


In [174]:
m.save('OUTPUT/html_map.html')