### Crawl google Places data

In [2]:
import requests
import time
import json
import numpy as np
from elasticsearch import Elasticsearch, client
from util import NW, SE

In [3]:
# Coordinates of a bbox centered in Berlin
berlin_coords = (52.52426800, 13.406290)
# Box is ~30km x 20km. Sample every ~1 km
cnt_x = 60
cnt_y = 40

In [5]:
x = np.linspace(NW[0], SE[0], cnt_x)
y = np.linspace(NW[1], SE[1], cnt_y)

In [104]:
# Relevant types for places related to health and fitness
types = ['park', 'gym', 'doctor', 'health', 
         'hospital', 'pharmacy', 'physiotherapist', 'spa']

In [111]:
base_url = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json'
headers = {'Content-Type': 'application/json'}

In [64]:
# test connection and response
params = {'location': ','.join(map(str, [x[50], y[25]])), 
          'types': '|'.join(types),
          'rankby': 'distance',
          'key': key}
r = requests.get(base_url, headers=headers, params=params)
res = r.json()
print([elem['name'] for elem in res['results']])
while 'next_page_token' in res:
    print('***Got more in this location')
    params['pagetoken'] = res['next_page_token']
    print(params)
    time.sleep(2)
    r = requests.get(base_url, headers=headers, params=params)
    res = r.json()
    print([elem['name'] for elem in res['results']])

['ic! berlin Flagship Store', 'Dr. med. Christoph Nimsgern', 'Brillenwerkstatt Berlin-Mitte', 'Dr.med. Christiane Handrick', 'Spirit Yoga Studio Mitte', 'Sanft wie Seide - Tantra, Tantramassagen und mehr in Berlin', 'dm-drogerie markt', 'Neptun Apotheke', 'Zentrum für Traditionelle Chinesische und Integrative Medizin Berlin', 'Dr. med. dent. Ulrike Heintzenberg', 'Frau Dr. med. Martina Müngersdorf', 'Fit am Rosenthaler Platz', 'Bodystreet Berlin Monbijouplatz', 'Longma 2 Thaimassage', 'Senzera Waxing Studio', 'Peilin - Thaimassage', 'Bikram Yoga Berlin-Mitte', 'Pro Seniore Residenz Vis à vis der Hackeschen Höfe', 'Praxis für Physiotherapie und Handrehabilitation', 'Praxis Annekatrin Tschörtner']
***Got more in this location
{'location': '52.524268,13.40629', 'types': 'park|gym|doctor|health', 'key': 'AIzaSyAU4IlE0ip3wUbQVgrSdtlg5yNjPFd6XOk', 'pagetoken': 'CpQCDgEAAPMHT4P-V04BSm8QmLCqP6i7x8vNGzlGpjp7QglbZTq2dNy1N---3EOp_UUYcBQcpyImvYBp4d-WAOZM-v8hnonHJ3JQvoD1HGGuLItN39Yutj5W2kVMbxFQOQvO

In [161]:
key = 'your_google_developer_key'

def extract_data(place):
    fields = ['geometry', 'id', 'place_id', 'name', 'types', 'rating', 'vicinity']
    mydict = dict((k,v) for (k,v) in place.items() if k in fields)
    mydict['geometry']['location']['lon'] = mydict['geometry']['location']['lng']
    mydict['geometry']['location'].pop('lng', None)
    if 'viewport' in mydict['geometry']:
        for v in ['northeast', 'southwest']:
            mydict['geometry']['viewport'][v]['lon'] = mydict['geometry']['viewport'][v]['lng']
            mydict['geometry']['viewport'][v].pop('lng', None)
    return mydict


print('Runs.....')
for i, xx in enumerate(x):
    for j, yy in enumerate(y):
        print('Trying loc ({}, {}): {}'.format(i, j, ','.join(map(str, [xx, yy]))))
        
        params = {'location': ','.join(map(str, [xx, yy])), 
                  'types': '|'.join(types),
                  'rankby': 'distance',
                  'key': key}
        
        r = requests.get(base_url, headers=headers, params=params)
        if r.status_code != 200:
            print('Error request. Skip this location')
            continue
        res = r.json()
        if not res['results']:
            print(res)
        c = 0
        for elem in res['results']:
            if elem['place_id'] in seen_places_id:
                continue
            seen_places_id.add(elem['place_id'])
            bulk_data.append(extract_data(elem))
            c += 1
        print('Added {} elems'.format(c))
        while 'next_page_token' in res:
            print('***Got more in this location')
            params = {'location': ','.join(map(str, [xx, yy])), 
                      'types': '|'.join(types),
                      'rankby': 'distance',
                      'key': key,
                      'pagetoken': res['next_page_token']}
            time.sleep(2)
            r = requests.get(base_url, headers=headers, params=params)
            if r.status_code != 200:
                print('Error request. Skip this location')
                continue
            res = r.json()
            if not res['results']:
                print(res)
            c = 0
            for elem in res['results']:
                if elem['place_id'] in seen_places_id:
                    continue
                seen_places_id.add(elem['place_id'])
                bulk_data.append(extract_data(elem))
                c += 1
            print('Added {} elems'.format(c))
        print('\n')
                
print('Last i: {}, j: {}'.format(i, j))
print('Found {} places'.format(len(bulk_data)))

Runs.....
Trying loc (35, 0): 52.4435582051,13.2035
Added 0 elems
***Got more in this location
Added 0 elems
***Got more in this location
Added 0 elems


Trying loc (35, 1): 52.4435582051,13.2211241667
Added 0 elems
***Got more in this location
Added 0 elems
***Got more in this location
Added 0 elems


Trying loc (35, 2): 52.4435582051,13.2387483333
Added 0 elems
***Got more in this location
Added 0 elems
***Got more in this location
Added 0 elems


Trying loc (35, 3): 52.4435582051,13.2563725
Added 0 elems
***Got more in this location
Added 0 elems
***Got more in this location
Added 11 elems


Trying loc (35, 4): 52.4435582051,13.2739966667
Added 10 elems
***Got more in this location
Added 14 elems
***Got more in this location
Added 6 elems


Trying loc (35, 5): 52.4435582051,13.2916208333
Added 2 elems
***Got more in this location
Added 15 elems
***Got more in this location
Added 13 elems


Trying loc (35, 6): 52.4435582051,13.309245
Added 13 elems
***Got more in this location
Added 

In [166]:
len(bulk_data)

18232

In [163]:
bulk_data[0]

{'geometry': {'location': {'lat': 52.5847644, 'lon': 13.2114144}},
 'id': '8b793795c49aded166bfa6385c8e571b3b559ae7',
 'name': 'DLRG Wasserrettungsstation Bürgerablage',
 'place_id': 'ChIJVd6Pzu1VqEcRu3VEZvdYLIU',
 'types': ['health', 'point_of_interest', 'establishment'],
 'vicinity': 'Niederneuendorfer Allee 79, Berlin'}

In [208]:
# Postprocess data to store keyword
def post_proc(place, keyword=None):
    types = ['park', 'gym', 'doctor', 'health', 
             'hospital', 'pharmacy', 'physiotherapist', 'spa']
    place['keyword'] = keyword if keyword else [p for p in place['types'] if p in types][0]
    return place

In [211]:
bulk_data = [post_proc(el) for el in bulk_data]
bulk_data[1]

{'geometry': {'location': {'lat': 52.581187, 'lon': 13.212754}},
 'id': '6d128537896cf0fea84c51cdf98489c4e9c3a4bb',
 'keyword': 'park',
 'name': 'Berliner Camping Club e. V. - Platz Bürgerablage',
 'place_id': 'ChIJpWqwV-lVqEcRvJociwxCGIc',
 'types': ['campground',
  'rv_park',
  'park',
  'lodging',
  'point_of_interest',
  'establishment'],
 'vicinity': 'Niederneuendorfer Allee 63, Berlin'}

In [212]:
fo = open('../data/gmap_places_by_type.json', 'w')
fo.write(json.dumps(bulk_data))
fo.close()

In [174]:
# Query with keywords - No need for such fine grid
# Box is ~30km x 20km. Sample every ~1 km
cnt_x = 10
cnt_y = 5
x = np.linspace(nw[0], se[0], cnt_x)
y = np.linspace(nw[1], se[1], cnt_y)

In [192]:
keywords = ['vegan', 'hallenbad', 'freibad', 'schwimmhalle', 'pilates', 'yoga', 'fitness', 
            'crossfit', 'training', 'sportverein', 'skatepark', 'zumba', 'tanzschule', 
            'klettern', 'bouldern', 'kampfsport' ]

In [218]:
bulk_data_kw = []
seen_places_id_kw = seen_places_id

# run a loop over list of keywords
kw = keywords[15]

print('Runs for keyword {}...'.format(kw))
for i, xx in enumerate(x):
    for j, yy in enumerate(y):
        print('Trying loc ({}, {}): {}'.format(i, j, ','.join(map(str, [xx, yy]))))
        
        params = {'location': ','.join(map(str, [xx, yy])), 
                  'keyword': kw,
                  'rankby': 'distance',
                  'key': key}
        
        r = requests.get(base_url, headers=headers, params=params)
        if r.status_code != 200:
            print('Error request. Skip this location')
            continue
        res = r.json()
        #print(res)
        if not res['results']:
            print(res)
        c = 0
        for elem in res['results']:
            if elem['place_id'] in seen_places_id_kw:
                continue
            seen_places_id_kw.add(elem['place_id'])
            bulk_data_kw.append(post_proc(extract_data(elem), kw))
            c += 1
        print('Added {} elems from {} found'.format(c, len(res['results'])))
        while 'next_page_token' in res:
            print('***Got more in this location')
            params = {'location': ','.join(map(str, [xx, yy])), 
                      'types': '|'.join(types),
                      'rankby': 'distance',
                      'key': key,
                      'pagetoken': res['next_page_token']}
            time.sleep(2)
            r = requests.get(base_url, headers=headers, params=params)
            if r.status_code != 200:
                print('Error request. Skip this location')
                continue
            res = r.json()
            if not res['results']:
                print(res)
            c = 0
            for elem in res['results']:
                if elem['place_id'] in seen_places_id_kw:
                    continue
                seen_places_id_kw.add(elem['place_id'])
                bulk_data_kw.append(post_proc(extract_data(elem), kw))
                c += 1
            print('Added {} elems from {} found'.format(c, len(res['results'])))
                
print('Last i: {}, j: {}'.format(i, j))
print('Found {} places'.format(len([el for el in bulk_data_kw if el['keyword']==kw])))

Runs for keyword kampfsport...
Trying loc (0, 0): 52.58363,13.2035
Added 6 elems from 20 found
***Got more in this location
Added 5 elems from 11 found
Trying loc (0, 1): 52.58363,13.309245
Added 0 elems from 20 found
***Got more in this location
Added 3 elems from 16 found
Trying loc (0, 2): 52.58363,13.41499
Added 4 elems from 20 found
***Got more in this location
Added 1 elems from 18 found
Trying loc (0, 3): 52.58363,13.520735
Added 2 elems from 20 found
***Got more in this location
Added 2 elems from 20 found
***Got more in this location
Added 0 elems from 4 found
Trying loc (0, 4): 52.58363,13.62648
Added 1 elems from 20 found
***Got more in this location
Added 1 elems from 20 found
Trying loc (1, 0): 52.5662877778,13.2035
Added 1 elems from 20 found
***Got more in this location
Added 1 elems from 13 found
Trying loc (1, 1): 52.5662877778,13.309245
Added 0 elems from 20 found
***Got more in this location
Added 3 elems from 20 found
***Got more in this location
Added 0 elems from 

In [220]:
len(bulk_data_kw)

1268

In [221]:
fo = open('../data/gmap_data_by_kw.json', 'w')
fo.write(json.dumps(bulk_data_kw))
fo.close()