In [1]:
import os
import sys
import logging as log
import urllib
import zipfile
import pandas as pd
import joblib

# Add source folder
# Paths are resolved relative to the current working directory:
#   - the parent directory, which the `src.constants` import below relies on
#   - the local-geocode checkout, which the later `geocode.geocode` import relies on
# NOTE(review): this assumes the notebook is started from its own folder — confirm.
sys.path.append(os.path.abspath(".."))
sys.path.append(os.path.abspath("../src/processing/local-geocode"))

# Load constants
from src.constants import DATA_PATH, LOCATION_PATH

# Download geonames data

In [None]:
# Download the complete GeoNames dump and unpack it into LOCATION_PATH.
#
# `import urllib` on its own does not guarantee that the `urllib.request`
# submodule is loaded (it only works if some other package imported it first),
# so it is imported explicitly here.
import urllib.request

url = 'https://download.geonames.org/export/dump/allCountries.zip'
geonames_data_path_zip = os.path.join(LOCATION_PATH, "geonames_data_allCountries.zip")

# urlretrieve does not create missing parent directories.
os.makedirs(LOCATION_PATH, exist_ok=True)

# download file
log.info(f'Downloading data from {url}')
urllib.request.urlretrieve(url, geonames_data_path_zip)
log.info('... done')

# extract
log.info('Extracting data...')
with zipfile.ZipFile(geonames_data_path_zip, 'r') as f:
    f.extractall(LOCATION_PATH)
log.info('...done')
# remove zip file
#os.remove(geonames_data_path_zip)

In [None]:
# Column order of the tab-separated GeoNames dump (the file has no header row).
geonames_columns = ['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude', 'longitude', 'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1', 'admin2', 'admin3', 'admin4', 'population', 'elevation', 'dem', 'timezone', 'modification_date']
# Only the columns listed here are loaded; the keys double as the `usecols` selection.
dtypes = {'name': str, 'latitude': float, 'longitude': float, 'country_code': str, 'population': int, 'feature_code': str, 'alternatenames': str,  "feature_class": str, "asciiname": str, "admin1":str, "cc2": str}

geonames_data_path = os.path.join(LOCATION_PATH, "allCountries.txt")
# pandas' default NA markers include the literal string "NA", which is also the
# ISO country code of Namibia. With the defaults those rows end up with a NaN
# country_code and are silently discarded by the null-country filter further
# down. Treat only truly empty fields as missing.
df = pd.read_csv(
    geonames_data_path,
    names=geonames_columns,
    sep='\t',
    dtype=dtypes,
    usecols=list(dtypes),
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Persist the raw (unfiltered) GeoNames frame so later sessions can skip the CSV parse.
all_locations_pkl = os.path.join(LOCATION_PATH, "all_locations.pkl")
joblib.dump(df, all_locations_pkl)

# Using Geocode

In [None]:
# `geocode` is importable because the local-geocode folder was appended to
# sys.path in the first cell.
from geocode.geocode import Geocode

# Create the geocoder and initialise it before any decode call.
# NOTE(review): presumably init() loads the prepared geonames lookup data —
# confirm against the local-geocode version vendored in this repository.
geocoder = Geocode()
geocoder.init()

## Single CPU

In [None]:
# Decode one free-text sentence; the cell output is whatever decode() returns.
# The sample sentence is German and contains the lower-cased place name "zurich".
geocoder.decode("Ich war heute in zurich und habe eine Pizza gegessen")

## Multi CPU

In [None]:
# Decode a batch of place-name strings, requesting 4 worker processes via num_cpus.
# NOTE(review): presumably returns one result per input string — confirm against
# the local-geocode API.
locations = ["zurich", "basel", "sarnen", "giswil", "goldach", "gruet", "bregenz"]
geocoder.decode_parallel(locations, num_cpus=4)

In [None]:
# Load the geonames records pickled under /tmp/local_geocode.
# Each record is indexed positionally below: field 0 is used as the dict key and
# field 1 is compared against "CH".
geonames = joblib.load("/tmp/local_geocode/geonames.pkl")

# Records whose second field equals "CH".
places_in_switzerland = [record for record in geonames if record[1] == "CH"]

# Map first field -> tuple of the remaining fields (later duplicates overwrite earlier ones).
swiss_places = dict((record[0], tuple(record[1:])) for record in places_in_switzerland)
global_places = dict((record[0], tuple(record[1:])) for record in geonames)

# Save
#joblib.dump(swiss_places, "/mnt/data/location_data/swiss_places.pkl")
#joblib.dump(global_places, "/mnt/data/location_data/global_places.pkl")

# Hierarchical processing of places in Switzerland

In [2]:
# Reload the pickled GeoNames frame as the input for the filtering below.
# NOTE(review): this reads from the "archive" sub-folder, whereas the dump cell
# further up writes all_locations.pkl directly into LOCATION_PATH — confirm the
# file is moved there manually, otherwise this fails on a fresh run.
# joblib.load unpickles arbitrary objects; only load files you produced yourself.
df = joblib.load(os.path.join(LOCATION_PATH, "archive", "all_locations.pkl"))

### Filter locations for countries, towns and admin areas and preprocess

In [5]:
# Minimum population a place must exceed to be kept by the filter below.
population_thres = 200

def is_ascii(s):
    """Return True if every character of the string `s` is plain ASCII.

    The string is encoded to UTF-8 and the resulting bytes are decoded as
    ASCII; any byte outside the ASCII range makes the decode fail, which is
    reported as False.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False
    
log.info('Transforming geonames data...')
log.info('Reading geo data...')
# Keep places whose population exceeds `population_thres`, plus every entry with
# feature code PCLI regardless of its population.
df = df[(df.population > population_thres) | (df.feature_code == 'PCLI')]
# get rid of administrative zones without country codes (e.g. "The Commonwealth")
df = df[~df.country_code.isnull()]
# select places with feature class A (admin) and P (place)
# .copy() detaches the result from the unfiltered frame so that the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df.feature_class.isin(['A', 'P'])].copy()

# Expand alternate names: every comma-separated alternate name becomes its own
# row (flagged with is_altname=True) next to the row carrying the primary name.
df['alternatenames'] = df.alternatenames.str.split(',')
df['is_altname'] = False
_df = df.explode('alternatenames')
_df['name'] = _df['alternatenames']
_df['is_altname'] = True
df = pd.concat([df, _df])

# Remove all names that are not strings (e.g. NaN from rows without alternate names)
df['is_str'] = df.name.apply(lambda s: isinstance(s, str))
df = df[df['is_str']].copy()

# Levels of priority (lower number wins when names collide):
# 1) Admin areas (feature class A)
# 2) Places (feature class P)
# Within each group rows are sorted by population, largest first.
log.info('Sorting by priority...')
feature_code_priorities = {cls: rank for rank, cls in enumerate(['A', 'P'], start=1)}
df['priority'] = df.feature_class.map(feature_code_priorities)

# Only allow 2 character names in specific cases
# - Name is non-ascii (e.g. Chinese characters)
# - Is an alternative name for a country (e.g. UK)
# - Belongs to the US or Canada (covers state / province abbreviations)
# ASCII names shorter than 2 characters are always dropped.
df['is_ascii'] = df.name.apply(is_ascii)
# na=False: rows without a feature code count as "not a country" instead of
# yielding NaN, which would make this column unusable as a boolean mask.
df['is_country'] = df.feature_code.str.startswith('PCL', na=False)
name_length = df.name.str.len()
two_chars_allowed = df.country_code.isin(['US', 'CA']) | df.is_country
df = df[
        (~df.is_ascii) |
        (name_length > 2) |
        ((name_length == 2) & two_chars_allowed)
        ].copy()

# add "US" manually since it's missing in geonames
# DataFrame.append was removed in pandas 2.0; clone the first country row named
# "USA" as a one-row frame (keeps column dtypes and the index label) and
# concatenate it. Raises IndexError if no such row exists.
row_usa = df[df.is_country & (df.name == 'USA')].iloc[[0]].assign(name='US')
df = pd.concat([df, row_usa])

# sort by priorities and drop name duplicates (this way we will keep only the high priority elements)
df = df.sort_values(by=['priority', 'population'], ascending=[True, False])
df['name_lower'] = df.name.str.lower()
df = df.drop_duplicates('name_lower', keep='first')
log.info(f'... collected a total of {len(df):,} names of places and countries')


# Save filtered data
#joblib.dump(swiss_data, os.path.join(LOCATION_PATH, "global_location_df.pkl"))
#joblib.dump(df, os.path.join(LOCATION_PATH, "global_location_df.pkl"))

### Extra priority granularity in Switzerland

In [56]:
# Design extra priorities within Switzerland
# All rows of the filtered frame whose country code is CH.
swiss_data = df[df['country_code'] == 'CH']

# 1. Extract swiss country lvl location data
# Bound to `swiss_country_lvl` because that is the name the list at the bottom
# of this cell uses; previously the frame was assigned to `swiss_country`, so
# the list raised NameError on a fresh kernel.
swiss_country_lvl = swiss_data[swiss_data.is_country]

# 2. Extract swiss admin areas (feature class A, excluding country-level rows)
# NOTE(review): sorted by population in ascending order (smallest first) —
# confirm that largest-first was not intended.
swiss_administrative_areas = swiss_data[
    (swiss_data["feature_class"] == "A") &
    (swiss_data.is_country == False)
].sort_values(by="population")

# 3. Extract swiss towns (feature class P)
swiss_towns = swiss_data[swiss_data["feature_class"] == "P"]

# Combine in list; the order is country, admin areas, towns
location_dfs = [swiss_country_lvl, swiss_administrative_areas, swiss_towns]

In [94]:
# Turn data into dictionaries that can be used with keywordsearch.
# Each dict maps a place name to
# (latitude, longitude, admin1, population, feature_code).
swiss_country, swiss_admin_areas, swiss_towns = {}, {}, {}

# Same order as `location_dfs`: country, admin areas, towns.
swiss_dicts = [swiss_country, swiss_admin_areas, swiss_towns]

for target, frame in zip(swiss_dicts, location_dfs):
    # Rows are consumed in frame order, so a later row with the same name
    # overwrites an earlier one.
    target.update(
        (
            record["name"],
            (record.latitude, record.longitude, record.admin1, record.population, record.feature_code),
        )
        for _, record in frame.iterrows()
    )

In [100]:
# Save
# Pickle each of the three lookup dictionaries into LOCATION_PATH.
# joblib.dump returns the list of written file names; the value of the last
# call is what the cell displays as its output.
joblib.dump(swiss_country, os.path.join(LOCATION_PATH, "swiss_country_dict.pkl"))
joblib.dump(swiss_admin_areas, os.path.join(LOCATION_PATH, "swiss_adminareas_dict.pkl"))
joblib.dump(swiss_towns, os.path.join(LOCATION_PATH, "swiss_towns_dict.pkl"))

['/mnt/data/location_data/swiss_towns_dict.pkl']

# Get list of all places in Switzerland (old)