In [29]:
from pymongo import MongoClient, ASCENDING, DESCENDING
import pandas as pd

In [3]:

dbName = "companies"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL, connectTimeoutMS=10000,serverSelectionTimeoutMS=10000)
db = client.get_database()

mongodb://localhost/companies


# Filtering for offices in England

In [4]:
#Find the all the different country codes
query={'offices': { '$exists': 'true', '$not': {'$size': 0} }}
web_offices = list(db.companies.find(query,{"offices":1,"name":1}))

countries=set()
for i,e in enumerate(web_offices):
    countries.add(e['offices'][0]['country_code'])
#countries

In [5]:
#Start with filtering only for offices in England
query={'offices':{'$elemMatch':{'country_code':'GBR'}}}
gb_offices = list(db.companies.find(query,{"name":1, "offices":1,'total_money_raised':1,'founded_year':1}))

In [10]:
office_df = pd.DataFrame(gb_offices).explode('offices')

In [11]:
office_df.head()

Unnamed: 0,_id,name,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297da0,Babelgum,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'..."
1,52cdef7c4bab8bd675297da6,SpinVox,,$106M,"{'description': 'Corporate Headquarters', 'add..."
2,52cdef7c4bab8bd675297da8,OpenX,2008.0,$75.5M,"{'description': 'Headquarters', 'address1': '8..."
2,52cdef7c4bab8bd675297da8,OpenX,2008.0,$75.5M,"{'description': 'New York', 'address1': '584 B..."
2,52cdef7c4bab8bd675297da8,OpenX,2008.0,$75.5M,"{'description': 'London', 'address1': '1 Maple..."


In [7]:
office_df['offices'][0]

{'description': '',
 'address1': '',
 'address2': '',
 'zip_code': '',
 'city': 'London',
 'state_code': None,
 'country_code': 'GBR',
 'latitude': 53.344104,
 'longitude': -6.267494}

In [12]:
def extract_loc(x):
    of = x['offices']
    if of["longitude"]==None:
        return (of['country_code'],
           of['city'], 
                None)
    return (of['country_code'],
           of['city'],
           {"type":"Point",
                    "coordinates":[of["longitude"],of["latitude"]]})

In [13]:
offices = office_df.apply(extract_loc,axis=1, result_type="expand")
offices.head()

Unnamed: 0,0,1,2
0,GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
1,GBR,Buckinghamshire,
2,USA,Pasadena,"{'type': 'Point', 'coordinates': [-118.1327468..."
2,USA,New York,
2,GBR,London,


In [14]:
#Append to office_df
offices_loc = pd.concat([office_df, offices], axis=1)
offices_loc.columns = ['__id', 'name', 'founded_year', 'raised', 'offices','country', 'city', 'location']
offices_loc = offices_loc[offices_loc.country=='GBR']
offices_loc = offices_loc[~offices_loc.location.isnull()]
print(offices_loc.dtypes)
offices_loc.head(3)


__id             object
name             object
founded_year    float64
raised           object
offices          object
country          object
city             object
location         object
dtype: object


Unnamed: 0,__id,name,founded_year,raised,offices,country,city,location
0,52cdef7c4bab8bd675297da0,Babelgum,2007.0,$13.2M,"{'description': '', 'address1': '', 'address2'...",GBR,London,"{'type': 'Point', 'coordinates': [-6.267494, 5..."
4,52cdef7c4bab8bd675297dcc,AllPeers,2004.0,$0,"{'description': None, 'address1': None, 'addre...",GBR,Oxford,"{'type': 'Point', 'coordinates': [-1.255824, 5..."
5,52cdef7c4bab8bd675297de9,Zamzar,2006.0,$0,"{'description': '', 'address1': '', 'address2'...",GBR,Southampton,"{'type': 'Point', 'coordinates': [-1.3610845, ..."


In [15]:
#As of 18th April 2020:
# Euros: 1.09 US dollars
# Pounds: 1.25 US dollars
def clean_raised(x):
    currency = {'$':1,'£':1.25,'€':1.09}
    a = 0
    if len(x)<3:
        return 0
    for c in currency:
        if c in x:
            a = float(x[1:-1])*currency[c]
    if 'k' in x:
        a = a/1000
    return round(a,3)
        

In [16]:
offices_loc['raised'] = offices_loc.raised.apply(clean_raised)

In [17]:
offices_loc=offices_loc[['name', 'founded_year', 'raised', 'city', 'location']].reset_index()
offices_loc['location'][4]
#offices_loc['location'] = offices_loc['location'].apply(str)

{'type': 'Point', 'coordinates': [-2.250166, 53.473606]}

In [18]:
#Export to json:
offices_loc.to_json("OUTPUT/offices_loc.json",orient="records")

In [19]:
#Incorporating new collection to database and create geoloc index
!mongoimport --db companies --collection offices_loc_england --jsonArray OUTPUT/offices_loc.json


2020-04-18T12:18:12.319+0200	connected to: mongodb://localhost/
2020-04-18T12:18:12.671+0200	611 document(s) imported successfully. 0 document(s) failed to import.


In [22]:
#Reconect to the database
db = client.get_database()

In [31]:
db.list_collection_names()
#db.offices_loc_england.create_index({'location':'2dsphere'})

SyntaxError: invalid syntax (<ipython-input-31-6355d83e9557>, line 2)

In [33]:
db.offices_loc_england.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'companies.offices_loc_england'},
 'location_2dsphere': {'v': 2,
  'key': [('location', '2dsphere')],
  'ns': 'companies.offices_loc_england',
  'background': False,
  '2dsphereIndexVersion': 3}}

## Filtering to accomodate developers: top notch companies (raised 1 mill dollars)

In [44]:
offices = list(db.offices_loc_england.find({}))

In [67]:
len(offices)

611

In [81]:
offices[0]

{'_id': ObjectId('5e9ad3e4d00e49afc8c5b13f'),
 'index': 6,
 'name': 'KickApps',
 'founded_year': 2004.0,
 'raised': 32.0,
 'city': 'London',
 'location': {'type': 'Point', 'coordinates': [-0.1356235, 51.5094731]}}

In [94]:
def near_success_offices(location, dist=5000):
    nmbr_offices = db.offices_loc_england.find({'$and':[{'raised':{'$gte':1}},
                                                        {'location':{'$near':{'$geometry':location,
                                                                              '$maxDistance':dist}}}]
                                               }).count()
    return nmbr_offices

In [95]:
def near_filter(lst, fn):
    filtered_lst=[]
    for of in offices:
        nr = fn(of['location'])
        if nr > 0:
            of['pro_nearby'] = nr 
            filtered_lst.append(of)
    return filtered_lst
        

In [90]:
new_offices = near_filter(offices,near_success_offices)

  after removing the cwd from sys.path.


In [98]:
len(new_offices)

468

In [99]:
new_offices

[{'_id': ObjectId('5e9ad3e4d00e49afc8c5b13f'),
  'index': 6,
  'name': 'KickApps',
  'founded_year': 2004.0,
  'raised': 32.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.1356235, 51.5094731]},
  'pro_nearby': 82},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b140'),
  'index': 8,
  'name': 'Google',
  'founded_year': 1998.0,
  'raised': 555.0,
  'city': 'Manchester',
  'location': {'type': 'Point', 'coordinates': [-2.250166, 53.473606]},
  'pro_nearby': 2},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b141'),
  'index': 11,
  'name': 'ConnectMeAnywhere',
  'founded_year': 2006.0,
  'raised': 0.0,
  'city': 'London W11 2HX',
  'location': {'type': 'Point', 'coordinates': [-0.205349, 51.514935]},
  'pro_nearby': 82},
 {'_id': ObjectId('5e9ad3e4d00e49afc8c5b142'),
  'index': 12,
  'name': 'LiveWorld',
  'founded_year': 1996.0,
  'raised': 0.0,
  'city': 'London',
  'location': {'type': 'Point', 'coordinates': [-0.126236, 51.500152]},
  'pro_nearby': 82},
 {'_id': ObjectId(

In [None]:
#Example of a google api query:
'''
https://maps.googleapis.com/maps/api/place/nearbysearch/json
  ?location=-33.8670522,151.1957362
  &radius=500
  &types=food
  &name=harbour
  &key=YOUR_API_KEY
'''