In [1]:
# Third-party clients used by the cells below: Census API wrapper, BSON ids,
# env-file loading and Nominatim reverse geocoding.
from census import Census
from us import states  # NOTE(review): not referenced in the visible cells
from dotenv import load_dotenv
from bson import ObjectId
from geopy.geocoders import Nominatim
# Shared reverse-geocoding client; the `geolocator.reverse(...)` calls below go through it.
geolocator = Nominatim(user_agent='geospatial-data-project')
import pgeocode  # NOTE(review): not referenced in the visible cells
import os
# Working directory captured once; the pickle-dump cells build their output paths from it.
cwd = os.getcwd()
import pickle

In [2]:
from pymongo import MongoClient

# Default-constructed client (no host/port given), then the handle to the
# project database that the queries and updates below go through.
client = MongoClient()
db = client['companies_project']

# Feeding companies_usa with census data
## Fill in missing zip codes for USA companies

In [271]:
# Turn blank zip codes into None so that "no zip code" is represented one way only.
blank_zip_filter = {'offices.zip_code': ''}
null_zip_update = {'$set': {'offices.zip_code': None}}
empty_to_none = db.companies_usa.update_many(blank_zip_filter, null_zip_update)

In [272]:
# Display the raw server reply of the update above (matched / modified counts).
empty_to_none.raw_result

{'n': 637, 'nModified': 637, 'ok': 1.0, 'updatedExisting': True}

In [317]:
# Companies whose zip code is None; only the fields needed for reverse
# geocoding (_id plus office coordinates) are projected.
coord_projection = {'_id': 1, 'offices.latitude': 1, 'offices.longitude': 1}
missing_zip_cursor = db.companies_usa.find({'offices.zip_code': None}, coord_projection)
zips = list(missing_zip_cursor)

In [290]:
# Accumulators filled by the reverse-geocoding loop:
#   leftovers_zips_usa - one record per company whose lookup succeeded
#   no_post_code       - one record per company whose lookup failed
leftovers_zips_usa, no_post_code = [], []

In [297]:
# Reverse-geocode every company in `zips` and store the postcode that comes back.
# Successes are recorded in `leftovers_zips_usa`, failures in `no_post_code`.
# NOTE(review): the public Nominatim service has a usage policy limiting request
# rate; consider wrapping `geolocator.reverse` in geopy's RateLimiter - TODO confirm.
for company in zips:  # not named `zip`: that would shadow the builtin for the whole kernel
    # The stored _id is used as-is for the update filter (no re-wrapping needed).
    company_id = company['_id']

    # Coordinates are read *before* the try-block so that a failure record always
    # describes this company, never values left over from a previous iteration.
    offices = company.get('offices') or {}
    latitude = offices.get('latitude')
    longitude = offices.get('longitude')
    failure_record = {'_id': company_id, 'coord': {'latitude': latitude, 'longitude': longitude}}

    if latitude is None or longitude is None:
        # Nothing to geocode; record the failure without calling the service.
        no_post_code.append(failure_record)
        continue

    try:
        location = geolocator.reverse(f"{latitude},{longitude}")
        # `reverse` may return None and the address may have no postcode;
        # both cases raise here and are recorded as failures below.
        postcode = location.raw['address']['postcode']
        db.companies_usa.update_one({'_id': company_id}, {'$set': {'offices.zip_code': postcode}})
    except Exception:
        # Best-effort pass: keep going on any lookup/update error. `Exception`
        # (not a bare `except`) so interrupting the kernel still stops the loop.
        no_post_code.append(failure_record)
    else:
        leftovers_zips_usa.append({'_id': company_id, 'loc_params': location.raw})

In [315]:
# Persist both result lists of the reverse-geocoding pass as pickles under ./data/usa_docs.
cwd = os.getcwd()
for rel_path, payload in (
    ('/data/usa_docs/leftovers_zips_usa.txt', leftovers_zips_usa),
    ('/data/usa_docs/no_post_codes.txt', no_post_code),
):
    with open(cwd + rel_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

## Add demographics to companies_usa

In [88]:
# Read the Census API key from the environment (load_dotenv() runs first,
# presumably to populate it from a local .env file) and build the client
# for the 2018 dataset year.
load_dotenv()
census_api_k = os.environ.get('census_api_key')
c = Census(census_api_k, year=2018)

In [91]:
# Companies that have a zip code but no demographics yet; only _id and the
# zip code are projected.
pending_demographics_filter = {
    'offices.zip_code': {'$ne': None},
    'offices.demographics': {'$exists': False},
}
zip_projection = {'_id': 1, 'offices.zip_code': 1}
usa_all = list(db.companies_usa.find(pending_demographics_filter, zip_projection))

In [50]:
# Accumulators filled by the demographics loop:
#   usa_demoquerys - one record per company whose demographics were stored
#   no_demo_data   - one record per company whose lookup failed
usa_demoquerys, no_demo_data = [], []

In [90]:
# Census ACS 5-year Data Profile variable codes fetched for every zip code,
# grouped and labelled exactly as they are stored under `offices.demographics`.
# (Labels are this notebook's own names for the variables.)
DEMOGRAPHIC_FIELDS = {
    'age': {
        'median_age': 'DP05_0018E',
        'range_25_34': 'DP05_0010PE',
        'range_35_44': 'DP05_0011PE',
    },
    'school': {
        'kinder_enroll': 'DP02_0054E',
        'presch_enroll': 'DP02_0053E',
        'school_enroll': 'DP02_0052E',
        'element_enroll': 'DP02_0055E',
        'highsch_enroll': 'DP02_0056E',
    },
    'school%': {
        'kinder_enrollP': 'DP02_0054PE',
        'presch_enrollP': 'DP02_0053PE',
        'school_enrollP': 'DP02_0052PE',
        'element_enrollP': 'DP02_0055PE',
        'highsch_enrollP': 'DP02_0056PE',
    },
    'educ_lvl%': {
        'highschP': 'DP02_0066PE',
        'bachelP': 'DP02_0067PE',
    },
}


def _fetch_demographics(census_client, zip_code):
    """Fetch every variable in DEMOGRAPHIC_FIELDS for one zip code in a single request.

    Args:
        census_client: the `Census` client; its `acs5dp` dataset is queried.
        zip_code: zip code to look up.

    Returns:
        A nested dict shaped like DEMOGRAPHIC_FIELDS, with each variable code
        replaced by the value returned for it.

    Raises:
        IndexError: if the API returns no row for the zip code.
        KeyError: if the returned row lacks one of the requested variables.
        Any error raised by the census client itself is propagated.
    """
    codes = tuple(code for fields in DEMOGRAPHIC_FIELDS.values() for code in fields.values())
    # One multi-field request instead of one request per variable.
    row = census_client.acs5dp.zipcode(codes, zip_code)[0]
    return {
        group: {label: row[code] for label, code in fields.items()}
        for group, fields in DEMOGRAPHIC_FIELDS.items()
    }


# Attach demographics to every company in `usa_all`. Successes are recorded in
# `usa_demoquerys`, failures in `no_demo_data`.
for comp in usa_all:
    company_id = comp['_id']  # not named `id`: that would shadow the builtin
    zip_code = comp['offices']['zip_code']
    try:
        demographics = _fetch_demographics(c, zip_code)
        db.companies_usa.update_one({'_id': company_id}, {'$set': {'offices.demographics': demographics}})
    except Exception:
        # Best-effort pass: keep going on any lookup/update error. `Exception`
        # (not a bare `except`) so interrupting the kernel still stops the loop.
        no_demo_data.append({'_id': company_id, 'zip_code': zip_code})
    else:
        usa_demoquerys.append({'_id': company_id, 'demographics': demographics})

In [92]:
# TODO (original note: "Change the name") - presumably refers to the output file names; confirm.
# Persist the demographics results and the failed zip codes as pickles under ./data/usa_docs.
for rel_path, payload in (
    ('/data/usa_docs/usa_demo_responses.txt', usa_demoquerys),
    ('/data/usa_docs/usa_demo_responses_fail.txt', no_demo_data),
):
    with open(cwd + rel_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

## Reverse-geocode European companies (full location details)

In [14]:
# Every European company, projected down to _id plus office coordinates.
eu_coord_projection = {'_id': 1, 'offices.latitude': 1, 'offices.longitude': 1}
europe_all = list(db.companies_europe.find({}, eu_coord_projection))

In [18]:
# Accumulator for the reverse-geocoding responses of the European companies.
eu_location_responses = list()

In [21]:
# Reverse-geocode every company in `europe_all` and store the raw response under
# `offices.more_location`. Successes are recorded in `eu_location_responses`;
# companies that could not be geocoded are collected in `eu_location_failures`
# instead of aborting the whole pass.
eu_location_failures = []
for comp in europe_all:
    company_id = comp['_id']  # not named `id`: that would shadow the builtin
    offices = comp.get('offices') or {}
    latitude = offices.get('latitude')
    longitude = offices.get('longitude')

    location = None
    if latitude is not None and longitude is not None:
        # Errors raised by the geocoder itself still propagate, as before.
        location = geolocator.reverse(f"{latitude},{longitude}")

    if location is None:
        # Missing coordinates, or `reverse` found nothing: `location.raw` would
        # raise AttributeError here, so record the company and move on.
        eu_location_failures.append({'_id': company_id, 'coord': {'latitude': latitude, 'longitude': longitude}})
        continue

    db.companies_europe.update_one({'_id': company_id}, {'$set': {'offices.more_location': location.raw}})
    eu_location_responses.append({'_id': company_id, 'loc_params': location.raw})

In [29]:
# Persist the European reverse-geocoding responses as a pickle under ./data/eurostat_docs.
eu_pickle_path = cwd + '/data/eurostat_docs/eu_loc_responses.txt'
with open(eu_pickle_path, 'wb') as out_file:
    pickle.dump(eu_location_responses, out_file)