In [126]:
from pymongo import MongoClient, ASCENDING, DESCENDING
import pandas as pd

In [127]:

dbName = "companies"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL, connectTimeoutMS=10000,serverSelectionTimeoutMS=10000)
db = client.get_database()

mongodb://localhost/companies


# Filtering for offices in England

In [128]:
#Find the all the different country codes
query={'offices': { '$exists': 'true', '$not': {'$size': 0} }}
web_offices = list(db.companies.find(query,{"offices":1,"name":1}))

countries=set()
for i,e in enumerate(web_offices):
    countries.add(e['offices'][0]['country_code'])
#countries

In [148]:
#Start with filtering only for offices in England
query={'offices':{'$elemMatch':{'country_code':'GBR'}}}
project = {"name":1, "offices":1,'total_money_raised':1,'founded_year':1, 'category_code':1}
gb_offices = list(db.companies.find(query,project))

In [149]:
office_df = pd.DataFrame(gb_offices).explode('offices')

In [150]:
office_df.head()

Unnamed: 0,_id,name,category_code,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'..."
1,52cdef7c4bab8bd675297da6,SpinVox,messaging,,$106M,"{'description': 'Corporate Headquarters', 'add..."
2,52cdef7c4bab8bd675297da8,OpenX,advertising,2008.0,$75.5M,"{'description': 'Headquarters', 'address1': '8..."
2,52cdef7c4bab8bd675297da8,OpenX,advertising,2008.0,$75.5M,"{'description': 'New York', 'address1': '584 B..."
2,52cdef7c4bab8bd675297da8,OpenX,advertising,2008.0,$75.5M,"{'description': 'London', 'address1': '1 Maple..."


In [151]:
office_df['offices'][0]

{'description': '',
 'address1': '',
 'address2': '',
 'zip_code': '',
 'city': 'London',
 'state_code': None,
 'country_code': 'GBR',
 'latitude': 53.344104,
 'longitude': -6.267494}

In [152]:
def extract_loc(x):
    of = x['offices']
    if of["longitude"]==None:
        return (of['country_code'],
           of['city'], 
                None)
    return (of['country_code'],
           of['city'],
           {"type":"Point",
                    "coordinates":[of["longitude"],of["latitude"]]})

In [153]:
offices = office_df.apply(extract_loc,axis=1, result_type="expand")
offices.head()

Unnamed: 0,0,1,2
0,GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
1,GBR,Buckinghamshire,
2,USA,Pasadena,"{'type': 'Point', 'coordinates': [-118.1327468..."
2,USA,New York,
2,GBR,London,


In [155]:
#Append to office_df
offices_loc = pd.concat([office_df, offices], axis=1)
offices_loc.columns = ['__id', 'name', 'category_code','founded_year',  'raised', 'offices','country', 'city', 'location']
offices_loc = offices_loc[offices_loc.country=='GBR']
offices_loc = offices_loc[~offices_loc.location.isnull()]
print(offices_loc.dtypes)
offices_loc.head(3)


__id              object
name              object
category_code     object
founded_year     float64
raised            object
offices           object
country           object
city              object
location          object
dtype: object


Unnamed: 0,__id,name,category_code,founded_year,raised,offices,country,city,location
0,52cdef7c4bab8bd675297da0,Babelgum,games_video,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'...",GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
4,52cdef7c4bab8bd675297dcc,AllPeers,web,2004.0,$0,"{'description': None, 'address1': None, 'addre...",GBR,Oxford,"{'type': 'Point', 'coordinates': [-1.255824, 5..."
5,52cdef7c4bab8bd675297de9,Zamzar,web,2006.0,$0,"{'description': '', 'address1': '', 'address2'...",GBR,Southampton,"{'type': 'Point', 'coordinates': [-1.3610845, ..."


In [156]:
#As of 18th April 2020:
# Euros: 1.09 US dollars
# Pounds: 1.25 US dollars
def clean_raised(x):
    currency = {'$':1,'£':1.25,'€':1.09}
    a = 0
    if len(x)<3:
        return 0
    for c in currency:
        if c in x:
            a = float(x[1:-1])*currency[c]
    if 'k' in x:
        a = a/1000
    return round(a,3)
        

In [157]:
offices_loc['raised'] = offices_loc.raised.apply(clean_raised)

In [158]:
offices_loc=offices_loc[['name', 'founded_year', 'category_code', 'raised', 'city', 'location']].reset_index()
offices_loc['location'][4]

{'type': 'Point', 'coordinates': [-2.250166, 53.473606]}

In [159]:
#Export to json:
offices_loc.to_json("OUTPUT/offices_loc.json",orient="records")

In [162]:
#Incorporating new collection to database and create geoloc index
!mongoimport --db companies --collection offices_loc_england --jsonArray OUTPUT/offices_loc.json


2020-04-18T16:00:10.836+0200	connected to: mongodb://localhost/
2020-04-18T16:00:11.239+0200	611 document(s) imported successfully. 0 document(s) failed to import.


In [165]:
db.list_collection_names()
#db.offices_loc_england.create_index({'location':'2dsphere'})

['offices_loc_england', 'companies']

In [166]:
db.offices_loc_england.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.offices_loc_england'}}

## Filtering to accomodate developers: top notch companies (raised 1 mill dollars)

In [44]:
offices = list(db.offices_loc_england.find({}))

In [67]:
len(offices)

611

In [117]:
#Different kind of companies
p = list(db.companies.find({},{'category_code':1}))
category = set()
for e in p:
    category.add(e['category_code'])
#category

In [118]:
def near_success_offices(location, dist=5000):
    nmbr_offices = db.offices_loc_england.find({'$and':[{'category_code':{'$in':['biotech', 'cleantech', 
                                                                                 'games_video', 'mobile', 
                                                                                 'nanotech', 'network_hosting',
                                                                                 'software', 'web']}},
                                                        {'$and':[{'raised':{'$gte':1}},
                                                                        {'location':{'$near':{'$geometry':location,
                                                                                              '$maxDistance':dist}}}]
                                               }]}).count()
    return nmbr_offices

In [94]:
#maybe filter for tech companies (games, web, )
def near_success_offices(location, dist=5000):
    nmbr_offices = db.offices_loc_england.find({'$and':[{'raised':{'$gte':1}},
                                                        {'location':{'$near':{'$geometry':location,
                                                                              '$maxDistance':dist}}}]
                                               }).count()
    return nmbr_offices

In [95]:
def near_filter(lst, fn):
    filtered_lst=[]
    for of in offices:
        nr = fn(of['location'])
        if nr > 0:
            of['pro_nearby'] = nr 
            filtered_lst.append(of)
    return filtered_lst
        

In [119]:
new_offices = near_filter(offices,near_success_offices)
len(new_offices)

  


0

In [120]:
offices[0]

{'_id': ObjectId('5e9ad3e4d00e49afc8c5b13f'),
 'index': 6,
 'name': 'KickApps',
 'founded_year': 2004.0,
 'raised': 32.0,
 'city': 'London',
 'location': {'type': 'Point', 'coordinates': [-0.1356235, 51.5094731]},
 'pro_nearby': 74}

In [123]:
nmbr_offices = db.offices_loc_england.find({'$and':[{'raised':{'$gte':1}},
                                                        {'location':{'$near':{'$geometry':{'type': 'Point', 'coordinates': [-0.1356235, 51.5094731]},
                                                                              '$maxDistance':5000}}}]
                                               })

In [125]:
list(nmbr_offices)

[{'_id': ObjectId('5e9ad3e4d00e49afc8c5b13f'),
  'index': 6,
  'name': 'KickApps',
  'founded_year': 2004.0,
  'raised': 32.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.1356235, 51.5094731]}},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b2fe'),
  'index': 941,
  'name': 'Abacus e-Media',
  'founded_year': 1977.0,
  'raised': 2.812,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.1335871, 51.5088058]}},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b190'),
  'index': 155,
  'name': 'Undertone',
  'founded_year': 2002.0,
  'raised': 40.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.132904, 51.5118239]}},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b344'),
  'index': 1070,
  'name': 'Eglue Business Technologies',
  'founded_year': 2001.0,
  'raised': 27.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.1392765, 51.5115309]}},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b2be'),
  'index': 785,
  'name': 'Quick

In [112]:
p

[{'_id': ObjectId('52cdef7c4bab8bd675297d8b'), 'category_code': 'enterprise'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d8c'), 'category_code': 'software'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d8a'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d8f'),
  'category_code': 'network_hosting'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d90'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d92'), 'category_code': 'games_video'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d91'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d8d'), 'category_code': 'news'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d93'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d96'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d95'), 'category_code': 'web'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d97'), 'category_code': 'news'},
 {'_id': ObjectId('52cdef7c4bab8bd675297d8e'), 'category_code': 'social'},
 {'_id': ObjectId('52cdef

'enterprise'

In [115]:

[]

{None,
 'advertising',
 'analytics',
 'automotive',
 'biotech',
 'cleantech',
 'consulting',
 'design',
 'ecommerce',
 'education',
 'enterprise',
 'fashion',
 'finance',
 'games_video',
 'government',
 'hardware',
 'health',
 'hospitality',
 'legal',
 'local',
 'manufacturing',
 'medical',
 'messaging',
 'mobile',
 'music',
 'nanotech',
 'network_hosting',
 'news',
 'nonprofit',
 'other',
 'photo_video',
 'public_relations',
 'real_estate',
 'search',
 'security',
 'semiconductor',
 'social',
 'software',
 'sports',
 'transportation',
 'travel',
 'web'}

In [None]:
[]

In [None]:
#Example of a google api query:
'''
https://maps.googleapis.com/maps/api/place/nearbysearch/json
  ?location=-33.8670522,151.1957362
  &radius=500
  &types=food
  &name=harbour
  &key=YOUR_API_KEY
'''