## Install Requirements

In [1]:
%%capture

# Install txtai and elasticsearch python client
!pip install git+https://github.com/neuml/txtai elasticsearch

# Download and extract elasticsearch
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.1-linux-x86_64.tar.gz
!tar -xzf elasticsearch-7.8.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.8.1

import os
from subprocess import Popen, PIPE, STDOUT

In [10]:
# Start and wait for server
server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
!sleep 30

In [3]:
# install requirements
!pip install elasticsearch
!pip install metaphone
!pip install pyjarowinkler
!pip install python-Levenshtein
!pip install python-dateutil

Collecting metaphone
  Downloading https://files.pythonhosted.org/packages/d4/ae/c9e4d007e32a6469be212da11d0b8e104d643f6f247d771742caf6ac6bb8/Metaphone-0.6.tar.gz
Building wheels for collected packages: metaphone
  Building wheel for metaphone (setup.py) ... [?25l[?25hdone
  Created wheel for metaphone: filename=Metaphone-0.6-cp37-none-any.whl size=13918 sha256=d7e7cf6edf590a983aacfba4c871d725d199e36ca6f2bbdd5b7b72a48660640d
  Stored in directory: /root/.cache/pip/wheels/4e/7c/f7/162d726fc83491ef23c7a0b989005024eb83a1408c96f32eaf
Successfully built metaphone
Installing collected packages: metaphone
Successfully installed metaphone-0.6
Collecting pyjarowinkler
  Downloading https://files.pythonhosted.org/packages/b9/58/b89073047b447e02b08d4f64fbb984e5a4dfef4134477350b256c625c779/pyjarowinkler-1.8-py2.py3-none-any.whl
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8
Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/pac

In [4]:
# import lib for data ingestion
from elasticsearch import Elasticsearch
from argparse import ArgumentParser
import csv,time,logging, json
import pandas as pd
import numpy as np
from elasticsearch import helpers
from metaphone import doublemetaphone
from elasticsearch import exceptions

### Run Elastic Search on census data

In [11]:
# Wall Clock: 1m 24s
import re
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

logging.basicConfig(filename="/content/bulk_insert.log",
                            filemode='a',
                            format='%(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('InsertTime')

def ingest(config):
    df = pd.read_csv(config['census_filename'])
    bulk_data = []
    count = 0 
    for itr, row in df.iterrows():
        count+=1
        row = row.replace(np.nan,'',regex=True) #covnert nan to empty string
        data = row.to_dict() #converts the dataframe row to dictonary with their correct data type
        if 'LOCATION' in data:
          data['LOCATION'] = {"lat":data["LAT"],"lon":data["LONG"]} # find location
        
        if 'ADDNUMFROM' in data and type(data['ADDNUMFROM']) is str:
            data['ADDNUMFROM'] = data['ADDNUMFROM'].replace('`','')

        data[config['census_first_name']] = name_clean(data[config['census_first_name']])
        data[config['census_last_name']] = name_clean(data[config['census_last_name']])
        
        if config['metaphone'] is 1: # default metaphone is 1
          data['METAPHONE_NAMEFIRST'] = [i for i in doublemetaphone(data[config['census_first_name']]) if i]
          data['METAPHONE_NAMELAST'] = [i for i in doublemetaphone(data[config['census_last_name']]) if i]
        
        if id is not False:
          meta = {
              "_index": config['es-index'],
              "_id": data[config['es-id']],
              "_source": data
          }
        else:
          meta = {
              "_index": config['es-index'],
              "_source": data
          }

        bulk_data.append(meta)
        if itr%config['ingest_size'] == 0:
            helpers.bulk(es, bulk_data)
            bulk_data = []
            print("INSERTING NOW", itr)
            
    helpers.bulk(es, bulk_data)
    return count

def name_clean(name):
  name = max(name.split(' '), key=len) # remove middle name if any
  name = re.sub('[^A-Za-z0-9]+', '', name)
   
  return name

if __name__=='__main__':
    if __name__ == '__main__':

      with open('/content/config_1850_B.json') as json_data_file:
         config = json.load(json_data_file)

      st = time.time()
      ingest(config)
      end = time.time()
      logger.warning(config["es-index"] +" "+ str(end-st))
 
#Mapping used
'''
PUT census
{
    "mappings" : {
      "properties" : {
        "ADDNUM" : {
          "type" : "long"
        },
        "ADDNUMFROM" : {
          "type" : "long"
        },
        "ADDNUMTO" : {
          "type" : "long"
        },
        "ADDR_TYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ADDRESSB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_AGEB" : {
          "type" : "long"
        },
        "CENSUS_BUILDING_I" : {
          "type" : "long"
        },
        "CENSUS_CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ED" : {
          "type" : "long"
        },
        "CENSUS_ENUMDISTB" : {
          "type" : "long"
        },
        "CENSUS_EXTGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_FID" : {
          "type" : "long"
        },
        "CENSUS_MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_MERGEID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMEFRSTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMELASTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NEIGHBOR_1" : {
          "type" : "long"
        },
        "CENSUS_NEIGHBOR_2" : {
          "type" : "long"
        },
        "CENSUS_NPERHHB" : {
          "type" : "long"
        },
        "CENSUS_OCCLABELB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_PAGENUMB" : {
          "type" : "long"
        },
        "CENSUS_RACEB" : {
          "type" : "long"
        },
        "CENSUS_RACENAMEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_REELB" : {
          "type" : "long"
        },
        "CENSUS_RELATEB" : {
          "type" : "long"
        },
        "CENSUS_RELATE_STR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_SEGGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_SEGMENT_ID" : {
          "type" : "long"
        },
        "CENSUS_SERIAL" : {
          "type" : "long"
        },
        "CENSUS_SERIALB" : {
          "type" : "long"
        },
        "CENSUS_SEXB" : {
          "type" : "long"
        },
        "CENSUS_STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREET" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREETB" : {
          "type" : "long"
        },
        "CENSUS_TYPEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_UNITTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_VOLUMEB" : {
          "type" : "long"
        },
        "CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "COUNTY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "LOCATION":{
             "type": "geo_point"
        },
        "LAT" : {
          "type" : "float"
        },
        "LONG" : {
          "type" : "float"
        },
        "MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "METAPHONE_NAMEFIRST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "METAPHONE_NAMELAST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "OBJECTID" : {
          "type" : "long"
        },
        "SIDE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STNAME" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPREDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPRETYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "WARD_NUM" : {
          "type" : "long"
        }
      }
    }
  }
'''

INSERTING NOW 0
INSERTING NOW 5000
INSERTING NOW 10000
INSERTING NOW 15000
INSERTING NOW 20000
INSERTING NOW 25000
INSERTING NOW 30000
INSERTING NOW 35000
INSERTING NOW 40000
INSERTING NOW 45000
INSERTING NOW 50000
INSERTING NOW 55000
INSERTING NOW 60000
INSERTING NOW 65000
INSERTING NOW 70000
INSERTING NOW 75000
INSERTING NOW 80000
INSERTING NOW 85000
INSERTING NOW 90000
INSERTING NOW 95000
INSERTING NOW 100000
INSERTING NOW 105000
INSERTING NOW 110000
INSERTING NOW 115000
INSERTING NOW 120000
INSERTING NOW 125000
INSERTING NOW 130000
INSERTING NOW 135000


'\nPUT census\n{\n    "mappings" : {\n      "properties" : {\n        "ADDNUM" : {\n          "type" : "long"\n        },\n        "ADDNUMFROM" : {\n          "type" : "long"\n        },\n        "ADDNUMTO" : {\n          "type" : "long"\n        },\n        "ADDR_TYPE" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_ADDRESSB" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_AGEB" : {\n          "type" : "long"\n        },\n        "CENSUS_BUILDING_I" : {\n          "type" : "long"\n        },\n        "CENSUS_CITY" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256

### Match cd and census data

In [12]:
# Wall clock: 54s 

with open('/content/config_1850_B.json') as json_data_file:
    config = json.load(json_data_file)

es = Elasticsearch(host=config['host'], port=config['port'])

def match_addr():
    df = pd.read_csv(config['cd_filename'])
    count, match,unmatch = 0,0,0
    with open(config['match_output_filename'],'w') as fw, open(config['unmatch_output_filename'],'w') as fw2:
        writer = csv.writer(fw, delimiter="\t",quotechar='"')
        columns = config["output_census_cols"] + config["output_city_directory_cols"]
        rows=""
        for cols in columns:
            rows = rows + cols + "\t"
        
        writer.writerow(rows.rstrip("\t").split("\t"))
        for index, row in df.iterrows():
            row = row.replace(np.nan,'',regex=True) #covnert nan to empty string
            data = row.to_dict()
            
            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            if config['edit_distance'] !=0:
                if config['metaphone'] is 1:
                    query = { "bool" : { "must" : [{ "fuzzy": { "CENSUS_FIRST_NAME": { "value": data["CD_FIRST_NAME"], "fuzziness": config["edit_distance"], "max_expansions": 50, "prefix_length": 0, "transpositions": True, "rewrite": "constant_score" } } },
                            {"fuzzy": { "CENSUS_LAST_NAME": { "value": data["CD_LAST_NAME"], "fuzziness": config["edit_distance"], "max_expansions": 50, "prefix_length": 0, "transpositions": True, "rewrite": "constant_score" } }},
                            { "match" : { "CENSUS_WARD_NUM": data["CD_WARD_NUM"]} },
                            {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                            {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}}]}}

                else:
                    query = { "bool" : { "must" : [{ "fuzzy": { "CENSUS_FIRST_NAME": { "value": data["CD_FIRST_NAME"], "fuzziness": config["edit_distance"], "max_expansions": 50, "prefix_length": 0, "transpositions": True, "rewrite": "constant_score" } } },
                            {"fuzzy": { "CENSUS_LAST_NAME": { "value": data["CD_LAST_NAME"], "fuzziness": config["edit_distance"], "max_expansions": 50, "prefix_length": 0, "transpositions": True, "rewrite": "constant_score" } }},
                            { "match" : { "CENSUS_WARD_NUM": data["CD_WARD_NUM"]} }
                            ]}}
            

            if config['edit_distance'] is 0:
                if config['metaphone'] is 1:
                   query =  { "bool" : { "must" : [{ "match": { "CENSUS_FIRST_NAME": data["CD_FIRST_NAME"] } },
                            { "match": { "CENSUS_LAST_NAME": data["CD_LAST_NAME"] } },
                            { "match" : { "CENSUS_WARD_NUM": data["CD_WARD_NUM"]} },
                            {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                            {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}}]}}
            
                else:
                    query = { "bool" : { "must" : [{ "match": { "CENSUS_FIRST_NAME": data["CD_FIRST_NAME"] } },
                            { "match": { "CENSUS_LAST_NAME": data["CD_LAST_NAME"] } }, 
                            { "match" : { "CENSUS_WARD_NUM": data["CD_WARD_NUM"]}}
                            ]}}

            
            try:
                res = es.search(index=config["es-index"], body={ "from": 0, "size": 10000, "query":query})
            except exceptions.RequestError:
                print("Exception at row id: ", index)
                continue
            
            if res['hits']['total']['value']!= 0:
                for i in res['hits']['hits']:
                    i = i['_source']
                    content = ""
                    for j in config["output_census_cols"]:
                        content = content + str(i[j]) + "\t"

                    for j in config["output_city_directory_cols"]:
                        content = content + str(data[j]) + "\t"

                    writer.writerow(content.rstrip("\t").split("\t"))
                    match+=1
            else:
                fw2.write(str(data['CD_RECORD_ID'])+"\n")
                unmatch+=1
        
    print(count,match,unmatch)

if __name__=='__main__':
    st = time.time()
    match_addr()
    end = time.time()
    print(config["es-index"] +" "+ str(end-st))

Exception at row id:  923
Exception at row id:  1455
Exception at row id:  1707
Exception at row id:  1969
Exception at row id:  2084
Exception at row id:  2425
Exception at row id:  2556
Exception at row id:  2627
Exception at row id:  3162
Exception at row id:  3165
Exception at row id:  3216
Exception at row id:  3496
Exception at row id:  3501
Exception at row id:  3746
Exception at row id:  3774
Exception at row id:  3775
Exception at row id:  3981
Exception at row id:  4034
Exception at row id:  4063
Exception at row id:  4545
Exception at row id:  4551
Exception at row id:  4886
Exception at row id:  4955
Exception at row id:  5039
Exception at row id:  5069
Exception at row id:  5699
Exception at row id:  5827
Exception at row id:  5866
Exception at row id:  5926
Exception at row id:  5944
Exception at row id:  6153
Exception at row id:  6287
Exception at row id:  6333
Exception at row id:  6446
Exception at row id:  7103
Exception at row id:  7104
Exception at row id:  7105
Ex