# install requirements

In [None]:
%%capture

# Install txtai and elasticsearch python client
!pip install git+https://github.com/neuml/txtai elasticsearch

# Download and extract elasticsearch
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.1-linux-x86_64.tar.gz
!tar -xzf elasticsearch-7.8.1-linux-x86_64.tar.gz
# Hand the extracted tree to the daemon user; the server below is launched under a non-root uid.
!chown -R daemon:daemon elasticsearch-7.8.1

import os
from subprocess import Popen, PIPE, STDOUT

# If issues are encountered with this section, ES can be manually started as follows:
# ./elasticsearch-7.8.1/bin/elasticsearch

# Start and wait for server
# preexec_fn switches the child process to uid 1 before it executes the server binary
# (presumably the daemon user that owns the files after the chown above - confirm on the target image).
# NOTE(review): stdout/stderr go to a PIPE that nothing in this notebook reads - verify the
# server cannot stall on a full pipe during long runs.
server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Fixed 30 second pause; nothing here checks that the server is actually accepting connections.
!sleep 30

In [None]:
#!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-windows-x86_64.zip

#!unzip elasticsearch-7.9.3-windows-x86_64.zip
#!chown -R daemon:daemon elasticsearch-7.9.3
# start server
#import os
#from subprocess import Popen, PIPE, STDOUT
#es_server = Popen(['elasticsearch-7.9.3/bin/elasticsearch'], 
#                  stdout=PIPE, stderr=STDOUT,
#                  preexec_fn=lambda: os.setuid(1)  # as daemon
#                 )
# wait a bit then test
#!curl -X GET "http://localhost:8888"


install requirements

In [None]:
# install requirements
!pip install elasticsearch
!pip install metaphone
!pip install pyjarowinkler
!pip install python-Levenshtein
!pip install python-dateutil

Collecting metaphone
  Downloading https://files.pythonhosted.org/packages/d4/ae/c9e4d007e32a6469be212da11d0b8e104d643f6f247d771742caf6ac6bb8/Metaphone-0.6.tar.gz
Building wheels for collected packages: metaphone
  Building wheel for metaphone (setup.py) ... [?25l[?25hdone
  Created wheel for metaphone: filename=Metaphone-0.6-cp36-none-any.whl size=13907 sha256=38b60d60cb9c2735c62866d8a7d2d411d8ec10c0594c85b56d796b7fe8897a1c
  Stored in directory: /root/.cache/pip/wheels/4e/7c/f7/162d726fc83491ef23c7a0b989005024eb83a1408c96f32eaf
Successfully built metaphone
Installing collected packages: metaphone
Successfully installed metaphone-0.6
Collecting pyjarowinkler
  Downloading https://files.pythonhosted.org/packages/b9/58/b89073047b447e02b08d4f64fbb984e5a4dfef4134477350b256c625c779/pyjarowinkler-1.8-py2.py3-none-any.whl
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8
Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/pac

### import lib for data ingestion

In [None]:
# import lib for data ingestion
from elasticsearch import Elasticsearch
from argparse import ArgumentParser
import csv,time,logging, json
import pandas as pd
import numpy as np
from elasticsearch import helpers
from metaphone import doublemetaphone
from elasticsearch import exceptions

In [None]:
# (Re)start Elasticsearch only when this kernel is not already tracking a live server process.
# Launching unconditionally would spawn a second instance next to the one started by the
# install cell and overwrite the handle to it.
import time

if globals().get('server') is None or server.poll() is not None:
    server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
    # Give the freshly started server time to come up before the next cell connects.
    time.sleep(30)

### Replicate the work done by previous RAs 

##### data ingestion

In [None]:
# Route WARNING-and-above records to the ingestion log (appended across runs).
logging.basicConfig(
    filename="/content/bulk_insert.log",
    filemode='a',
    format='%(asctime)s %(name)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.WARNING,
)
logger = logging.getLogger('InsertTime')

# Client for the local server on port 9200; requests time out after 60 s and are retried on timeout.
es = Elasticsearch(
    hosts=["http://localhost:9200"],
    timeout=60,
    retry_on_timeout=True,
)

def ingest(config):
    """Bulk-index the census CSV named by ``config['census_filename']`` into Elasticsearch.

    For every CSV row: NaN values are replaced by '', the row is turned into a
    dict, the first/last name columns (``config['census_first_name']`` and
    ``config['census_last_name']``) are reduced to their longest token with
    ``name_clean``, and, when ``config['metaphone']`` equals 1, double-metaphone
    codes are stored under ``METAPHONE_NAMEFIRST`` / ``METAPHONE_NAMELAST``.
    Documents are sent through the module-level ``es`` client to the index
    ``config['es-index']`` in batches of ``config['ingest_size']``.

    Args:
        config: dict holding the keys named above. ``'es-id'`` is optional; when
            it names a column, that column's value is used as the document
            ``_id``, otherwise Elasticsearch assigns the id.

    Returns:
        int: number of CSV rows processed.
    """
    df = pd.read_csv(config['census_filename'])
    first_name_col = config['census_first_name']
    last_name_col = config['census_last_name']
    # Compare by value: `is 1` tests object identity and only works by accident for small ints.
    use_metaphone = config['metaphone'] == 1
    # The original tested the builtin `id` (always truthy), which made the
    # "no explicit id" branch unreachable; the configured id column is what decides.
    id_column = config.get('es-id')
    batch_size = config['ingest_size']

    bulk_data = []
    count = 0
    for itr, row in df.iterrows():
        count += 1
        row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
        data = row.to_dict()  # row as a plain dict keyed by column name

        # NOTE(review): LOCATION is only (re)built when the CSV already has a LOCATION
        # column - confirm that is intended rather than keying on LAT/LONG being present.
        if 'LOCATION' in data:
            data['LOCATION'] = {"lat": data["LAT"], "lon": data["LONG"]}

        # Strip stray backticks from string house numbers.
        if 'ADDNUMFROM' in data and isinstance(data['ADDNUMFROM'], str):
            data['ADDNUMFROM'] = data['ADDNUMFROM'].replace('`', '')

        data[first_name_col] = name_clean(data[first_name_col])
        data[last_name_col] = name_clean(data[last_name_col])

        if use_metaphone:
            # doublemetaphone returns a pair of codes; empty codes are dropped.
            data['METAPHONE_NAMEFIRST'] = [code for code in doublemetaphone(data[first_name_col]) if code]
            data['METAPHONE_NAMELAST'] = [code for code in doublemetaphone(data[last_name_col]) if code]

        meta = {"_index": config['es-index'], "_source": data}
        if id_column:
            meta["_id"] = data[id_column]
        bulk_data.append(meta)

        # Flush on full batches; the previous `itr % size == 0` test sent the very
        # first document as a batch of one.
        if len(bulk_data) >= batch_size:
            helpers.bulk(es, bulk_data)
            bulk_data = []
            print("INSERTING NOW", itr)

    # Send whatever is left over from the last partial batch.
    if bulk_data:
        helpers.bulk(es, bulk_data)
    return count

def name_clean(name):
  """Return the longest space-separated piece of ``name``; the earliest piece wins a tie."""
  pieces = name.split(' ')
  return max(pieces, key=len)

if __name__ == '__main__':
    # Command-line selection of the config file is disabled in the notebook;
    # the 1880 config is loaded directly instead.
    #parser = ArgumentParser()
    #parser.add_argument("-config", help="config file path", default="/content/config_1850.json")

    #args = parser.parse_args()

    #with open(args.config) as json_data_file:
    #  config = json.load(json_data_file)
    with open('/content/config_1880.json') as json_data_file:
        config = json.load(json_data_file)

    # Time the whole ingestion and record "<index name> <seconds>" in the insert log.
    st = time.time()
    ingest(config)
    end = time.time()
    logger.warning(config["es-index"] + " " + str(end - st))
 
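# Index mapping used (Elasticsearch console request for the `census` index; kept in the string below for reference only, it is not executed here)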
'''
PUT census
{
    "mappings" : {
      "properties" : {
        "ADDNUM" : {
          "type" : "long"
        },
        "ADDNUMFROM" : {
          "type" : "long"
        },
        "ADDNUMTO" : {
          "type" : "long"
        },
        "ADDR_TYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ADDRESSB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_AGEB" : {
          "type" : "long"
        },
        "CENSUS_BUILDING_I" : {
          "type" : "long"
        },
        "CENSUS_CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ED" : {
          "type" : "long"
        },
        "CENSUS_ENUMDISTB" : {
          "type" : "long"
        },
        "CENSUS_EXTGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_FID" : {
          "type" : "long"
        },
        "CENSUS_MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_MERGEID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMEFRSTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMELASTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NEIGHBOR_1" : {
          "type" : "long"
        },
        "CENSUS_NEIGHBOR_2" : {
          "type" : "long"
        },
        "CENSUS_NPERHHB" : {
          "type" : "long"
        },
        "CENSUS_OCCLABELB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_PAGENUMB" : {
          "type" : "long"
        },
        "CENSUS_RACEB" : {
          "type" : "long"
        },
        "CENSUS_RACENAMEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_REELB" : {
          "type" : "long"
        },
        "CENSUS_RELATEB" : {
          "type" : "long"
        },
        "CENSUS_RELATE_STR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_SEGGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_SEGMENT_ID" : {
          "type" : "long"
        },
        "CENSUS_SERIAL" : {
          "type" : "long"
        },
        "CENSUS_SERIALB" : {
          "type" : "long"
        },
        "CENSUS_SEXB" : {
          "type" : "long"
        },
        "CENSUS_STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREET" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREETB" : {
          "type" : "long"
        },
        "CENSUS_TYPEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_UNITTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_VOLUMEB" : {
          "type" : "long"
        },
        "CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "COUNTY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "LOCATION":{
             "type": "geo_point"
        },
        "LAT" : {
          "type" : "float"
        },
        "LONG" : {
          "type" : "float"
        },
        "MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "METAPHONE_NAMEFIRST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "METAPHONE_NAMELAST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "OBJECTID" : {
          "type" : "long"
        },
        "SIDE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STNAME" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPREDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPRETYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "WARD_NUM" : {
          "type" : "long"
        }
      }
    }
  }
'''

##### matching

Edit distance 0: match only on ward number and metaphone codes — 125k city-directory (CD) records matched to 873k census records.

63k CD records remain unmatched.

In [None]:
# edit dist 0: match only on ward no and metaphone
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Output locations for this run override whatever the config file specifies.
config['match_output_filename'] = "/content/test_1880mn_old.csv"
config['unmatch_output_filename'] = "/content/test-u_1880mn_old.csv"

# logging.basicConfig() does nothing once the root logger has handlers, and the
# ingestion cell already pointed it at bulk_insert.log. Drop the existing handlers
# first so the matching records really land in direct_match.log.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')

def get_matches():
    """Match every city-directory record against the census index and write the results.

    Reads the CSV named by ``config['cd_filename']`` and, for each row, runs one
    Elasticsearch bool query against ``config['es-index']``. The query shape is
    chosen from the configuration:

    * ``edit_distance != 0``: fuzzy first/last name + ward + ED, plus the
      metaphone codes when ``metaphone == 1``;
    * ``edit_distance == 0`` and ``metaphone == 1``: ward + metaphone codes;
    * ``edit_distance == 0`` otherwise: exact first/last name + ward + ED.

    Output:
        * ``config['match_output_filename']``: tab-separated, header first, one
          line per census hit (census columns followed by city-directory columns);
        * ``config['unmatch_output_filename']``: one ``CD_RECORD_ID`` per line for
          rows without any hit.

    Rows whose search raises ``RequestError`` are reported and skipped; they are
    counted neither as matched nor as unmatched. Both totals are logged and printed.
    """
    def fuzzy_clause(field, value):
        # Fuzzy term query allowing up to config["edit_distance"] edits.
        return {"fuzzy": {field: {"value": value,
                                  "fuzziness": config["edit_distance"],
                                  "max_expansions": 50,
                                  "prefix_length": 0,
                                  "transpositions": True,
                                  "rewrite": "constant_score"}}}

    df = pd.read_csv(config['cd_filename'])
    count_match, count_unmatch = 0, 0

    census_cols = config["output_census_cols"]
    cd_cols = config["output_city_directory_cols"]
    # Compare by value; `is 0` / `is 1` test object identity and are not a numeric check.
    use_edit_distance = config['edit_distance'] != 0
    use_metaphone = config['metaphone'] == 1

    # newline='' is what the csv module expects for files it writes to.
    with open(config['match_output_filename'], 'w', newline='') as fw, open(config['unmatch_output_filename'], 'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            ward_clause = {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}}
            metaphone_clauses = [
                {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
            ]

            if use_edit_distance:  # edit distance is used to find matches
                must = [
                    fuzzy_clause("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                    fuzzy_clause("CENSUS_LAST_NAME", data["CD_LAST_NAME"]),
                    ward_clause,
                    {"match": {"CENSUS_ED": data["CD_ED"]}},
                ]
                if use_metaphone:
                    must.extend(metaphone_clauses)
            elif use_metaphone:
                # The ED clause is deliberately left out here: add it back in once
                # the ED column data is fixed.
                must = [ward_clause] + metaphone_clauses
            else:
                must = [
                    {"match": {"CENSUS_FIRST_NAME": data["CD_FIRST_NAME"]}},
                    {"match": {"CENSUS_LAST_NAME": data["CD_LAST_NAME"]}},
                    ward_clause,
                    {"match": {"CENSUS_ED": data["CD_ED"]}},
                ]
            query = {"bool": {"must": must}}

            try:
                # At most 10000 hits are requested per record; further hits are not written.
                res = es.search(index=config["es-index"], body={"from": 0, "size": 10000, "query": query})
            except exceptions.RequestError as e:
                # Best effort: report the failing row and carry on with the next one.
                logger.warning("Search failed for row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                cd_values = [str(data[col]) for col in cd_cols]
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Hand the fields to the csv writer as a list: joining with tabs and
                    # splitting again dropped trailing empty fields and broke values
                    # that contain a tab.
                    writer.writerow([str(source[col]) for col in census_cols] + cd_values)
            else:
                fw2.write(str(data['CD_RECORD_ID']) + "\n")
                count_unmatch += 1

    logger.warning("Total city directory matched: " + str(count_match))
    logger.warning("Total city directory unmatched: " + str(count_unmatch))

    print(count_match, count_unmatch)

def name_clean(name):
  """Return the longest space-separated piece of ``name``; the earliest piece wins a tie."""
  pieces = name.split(' ')
  return max(pieces, key=len)

def export_data(data):
    """Write ``data`` as JSON to ``/content/matched_data.json``, replacing any previous file."""
    # The context manager closes (and therefore flushes) the file even if json.dump raises;
    # the inline open() used before left the handle to be closed by garbage collection.
    with open('/content/matched_data.json', 'w') as out_file:
        json.dump(data, out_file)

# Entry point for this cell: run the matching pass, which prints the matched / unmatched city-directory counts.
if __name__=='__main__':
    get_matches()

125819 63476


### change diff conditions to see if matches improve

##### edit distance 0, match on ward no, metaphone, first name and last name - 8k cd records matched

In [None]:
# edit distance 0, match on ward no, metaphone, first name and last name
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# logging.basicConfig() does nothing once the root logger has handlers, and earlier
# cells in this notebook already configured it. Drop the existing handlers first so
# the records of this run really land in direct_match.log.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')

def get_matches():
    """Match city-directory records on exact names, ward number and metaphone codes.

    Reads the CSV named by ``config['cd_filename']`` and, for each row, queries
    ``config['es-index']`` for census documents whose first name, last name and
    ward number match and whose metaphone codes overlap with the row's codes.

    Output:
        * ``config['match_output_filename']``: tab-separated, header first, one
          line per census hit (census columns followed by city-directory columns);
        * ``config['unmatch_output_filename']``: one ``CD_RECORD_ID`` per line for
          rows without any hit.

    Rows whose search raises ``RequestError`` are reported and skipped; they are
    counted neither as matched nor as unmatched. Both totals are logged and printed.

    Raises:
        ValueError: if the configuration is not ``edit_distance == 0`` with
            ``metaphone == 1``, the only combination this variant builds a query for.
    """
    # Compare by value (`is 0` / `is 1` test identity) and fail before the output
    # files are truncated; previously an unsupported config left `query` undefined.
    if not (config['edit_distance'] == 0 and config['metaphone'] == 1):
        raise ValueError("this matching variant requires edit_distance == 0 and metaphone == 1")

    df = pd.read_csv(config['cd_filename'])
    count_match, count_unmatch = 0, 0

    census_cols = config["output_census_cols"]
    cd_cols = config["output_city_directory_cols"]

    # newline='' is what the csv module expects for files it writes to.
    with open(config['match_output_filename'], 'w', newline='') as fw, open(config['unmatch_output_filename'], 'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            # The ward + ED + metaphone variant that used to sit here as a dead
            # string literal is not part of this experiment.
            query = {"bool": {"must": [
                {"match": {"CENSUS_FIRST_NAME": data["CD_FIRST_NAME"]}},
                {"match": {"CENSUS_LAST_NAME": data["CD_LAST_NAME"]}},
                {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}},
                {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
            ]}}

            try:
                # At most 10000 hits are requested per record; further hits are not written.
                res = es.search(index=config["es-index"], body={"from": 0, "size": 10000, "query": query})
            except exceptions.RequestError as e:
                # Best effort: report the failing row and carry on with the next one.
                logger.warning("Search failed for row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                cd_values = [str(data[col]) for col in cd_cols]
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Hand the fields to the csv writer as a list: joining with tabs and
                    # splitting again dropped trailing empty fields and broke values
                    # that contain a tab.
                    writer.writerow([str(source[col]) for col in census_cols] + cd_values)
            else:
                fw2.write(str(data['CD_RECORD_ID']) + "\n")
                count_unmatch += 1

    logger.warning("Total city directory matched: " + str(count_match))
    logger.warning("Total city directory unmatched: " + str(count_unmatch))

    print(count_match, count_unmatch)

def name_clean(name):
  """Return the longest space-separated piece of ``name``; the earliest piece wins a tie."""
  pieces = name.split(' ')
  return max(pieces, key=len)

def export_data(data):
    """Write ``data`` as JSON to ``/content/matched_data.json``, replacing any previous file."""
    # The context manager closes (and therefore flushes) the file even if json.dump raises;
    # the inline open() used before left the handle to be closed by garbage collection.
    with open('/content/matched_data.json', 'w') as out_file:
        json.dump(data, out_file)

# Entry point for this cell: run the matching pass, which prints the matched / unmatched city-directory counts.
if __name__=='__main__':
    get_matches()

  if self.run_code(code, result):


8975 180320


##### Match only on metaphone — 36k CD records matched (152k unmatched)

In [None]:
# match only on metaphone
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# logging.basicConfig() does nothing once the root logger has handlers, and earlier
# cells in this notebook already configured it. Drop the existing handlers first so
# the records of this run really land in direct_match.log.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')

# Override the edit distance loaded from the config file for this experiment.
config['edit_distance'] = 2

def get_matches():
    """Match city-directory records on metaphone codes only.

    Reads the CSV named by ``config['cd_filename']`` and, for each row, queries
    ``config['es-index']`` for census documents whose first- and last-name
    metaphone codes overlap with the row's codes. Ward, ED and the literal names
    are not part of the query.

    Output:
        * ``config['match_output_filename']``: tab-separated, header first, one
          line per census hit (census columns followed by city-directory columns);
        * ``config['unmatch_output_filename']``: one ``CD_RECORD_ID`` per line for
          rows without any hit.

    Rows whose search raises ``RequestError`` are reported and skipped; they are
    counted neither as matched nor as unmatched. Both totals are logged and printed.

    Raises:
        ValueError: if the configuration is not ``edit_distance != 0`` with
            ``metaphone == 1``, the only combination this variant builds a query for.
    """
    # Compare by value (`is 1` tests identity) and fail before the output files are
    # truncated; previously an unsupported config left `query` undefined.
    if not (config['edit_distance'] != 0 and config['metaphone'] == 1):
        raise ValueError("this matching variant requires edit_distance != 0 and metaphone == 1")

    df = pd.read_csv(config['cd_filename'])
    count_match, count_unmatch = 0, 0

    census_cols = config["output_census_cols"]
    cd_cols = config["output_city_directory_cols"]

    # newline='' is what the csv module expects for files it writes to.
    with open(config['match_output_filename'], 'w', newline='') as fw, open(config['unmatch_output_filename'], 'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            query = {"bool": {"must": [
                {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
            ]}}

            try:
                # At most 10000 hits are requested per record; further hits are not written.
                res = es.search(index=config["es-index"], body={"from": 0, "size": 10000, "query": query})
            except exceptions.RequestError as e:
                # Best effort: report the failing row and carry on with the next one.
                logger.warning("Search failed for row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                cd_values = [str(data[col]) for col in cd_cols]
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Hand the fields to the csv writer as a list: joining with tabs and
                    # splitting again dropped trailing empty fields and broke values
                    # that contain a tab.
                    writer.writerow([str(source[col]) for col in census_cols] + cd_values)
            else:
                fw2.write(str(data['CD_RECORD_ID']) + "\n")
                count_unmatch += 1

    logger.warning("Total city directory matched: " + str(count_match))
    logger.warning("Total city directory unmatched: " + str(count_unmatch))

    print(count_match, count_unmatch)

def name_clean(name):
  """Return the longest space-separated piece of ``name``; the earliest piece wins a tie."""
  pieces = name.split(' ')
  return max(pieces, key=len)

def export_data(data):
    """Write ``data`` as JSON to ``/content/matched_data.json``, replacing any previous file."""
    # The context manager closes (and therefore flushes) the file even if json.dump raises;
    # the inline open() used before left the handle to be closed by garbage collection.
    with open('/content/matched_data.json', 'w') as out_file:
        json.dump(data, out_file)

# Entry point for this cell: run the matching pass, which prints the matched / unmatched city-directory counts.
if __name__=='__main__':
    get_matches()

  if self.run_code(code, result):


36922 152373


##### edit distance 2, match on ward no, metaphone, first name and last name - 105k cd records matched to 354k census records and 83k cd records non match

In [None]:
# edit distance 2, match on ward no, metaphone, first name and last name
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# logging.basicConfig() does nothing once the root logger has handlers, and earlier
# cells in this notebook already configured it. Drop the existing handlers first so
# the records of this run really land in direct_match.log.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')

# Override the edit distance loaded from the config file for this experiment.
config['edit_distance'] = 2

def get_matches():
    """Match city-directory records on fuzzy names, ward number and metaphone codes.

    Reads the CSV named by ``config['cd_filename']`` and, for each row, queries
    ``config['es-index']`` for census documents whose first and last name are
    within ``config['edit_distance']`` edits of the row's names, whose ward number
    matches, and whose metaphone codes overlap with the row's codes. The ED
    clause is intentionally not part of this variant.

    Output:
        * ``config['match_output_filename']``: tab-separated, header first, one
          line per census hit (census columns followed by city-directory columns);
        * ``config['unmatch_output_filename']``: one ``CD_RECORD_ID`` per line for
          rows without any hit.

    Rows whose search raises ``RequestError`` are reported and skipped; they are
    counted neither as matched nor as unmatched. Both totals are logged and printed.

    Raises:
        ValueError: if the configuration is not ``edit_distance != 0`` with
            ``metaphone == 1``, the only combination this variant builds a query for.
    """
    def fuzzy_clause(field, value):
        # Fuzzy term query allowing up to config["edit_distance"] edits.
        return {"fuzzy": {field: {"value": value,
                                  "fuzziness": config["edit_distance"],
                                  "max_expansions": 50,
                                  "prefix_length": 0,
                                  "transpositions": True,
                                  "rewrite": "constant_score"}}}

    # Compare by value (`is 1` tests identity) and fail before the output files are
    # truncated; previously an unsupported config left `query` undefined.
    if not (config['edit_distance'] != 0 and config['metaphone'] == 1):
        raise ValueError("this matching variant requires edit_distance != 0 and metaphone == 1")

    df = pd.read_csv(config['cd_filename'])
    count_match, count_unmatch = 0, 0

    census_cols = config["output_census_cols"]
    cd_cols = config["output_city_directory_cols"]

    # newline='' is what the csv module expects for files it writes to.
    with open(config['match_output_filename'], 'w', newline='') as fw, open(config['unmatch_output_filename'], 'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            query = {"bool": {"must": [
                fuzzy_clause("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                fuzzy_clause("CENSUS_LAST_NAME", data["CD_LAST_NAME"]),
                {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}},
                {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
            ]}}

            try:
                # At most 10000 hits are requested per record; further hits are not written.
                res = es.search(index=config["es-index"], body={"from": 0, "size": 10000, "query": query})
            except exceptions.RequestError as e:
                # Best effort: report the failing row and carry on with the next one.
                logger.warning("Search failed for row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                cd_values = [str(data[col]) for col in cd_cols]
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Hand the fields to the csv writer as a list: joining with tabs and
                    # splitting again dropped trailing empty fields and broke values
                    # that contain a tab.
                    writer.writerow([str(source[col]) for col in census_cols] + cd_values)
            else:
                fw2.write(str(data['CD_RECORD_ID']) + "\n")
                count_unmatch += 1

    logger.warning("Total city directory matched: " + str(count_match))
    logger.warning("Total city directory unmatched: " + str(count_unmatch))

    print(count_match, count_unmatch)

def name_clean(name):
  """Return the longest space-separated piece of ``name``; the earliest piece wins a tie."""
  pieces = name.split(' ')
  return max(pieces, key=len)

def export_data(data):
    """Write ``data`` as JSON to ``/content/matched_data.json``, replacing any previous file."""
    # The context manager closes (and therefore flushes) the file even if json.dump raises;
    # the inline open() used before left the handle to be closed by garbage collection.
    with open('/content/matched_data.json', 'w') as out_file:
        json.dump(data, out_file)

# Entry point for this cell: run the matching pass, which prints the matched / unmatched city-directory counts.
if __name__=='__main__':
    get_matches()

105530 83765


##### edit distance 2, match on ward no, ed, metaphone, first name and last name - 79k cd records matched with 102k census data and 104k unmatched

In [None]:
# edit distance 2, match on ward no, metaphone, ed, first name and last name
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# logging.basicConfig() does nothing once the root logger has handlers, and earlier
# cells in this notebook already configured it. Drop the existing handlers first so
# the records of this run really land in direct_match.log.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')

# Override the edit distance loaded from the config file for this experiment.
config['edit_distance'] = 2

def get_matches():
    """Match every city-directory (CD) record against the census index.

    Reads the CD CSV named by ``config['cd_filename']`` and runs one
    Elasticsearch ``bool``/``must`` query per row: fuzzy first and last name,
    ward number, ED and the double-metaphone codes of both names.

    Outputs:
        * ``config['match_output_filename']``: tab-separated; a header line, then
          one line per census hit (census columns followed by CD columns).
        * ``config['unmatch_output_filename']``: the ``CD_RECORD_ID`` of every
          row that produced no hit, one per line.

    Uses the cell-level globals ``config`` and ``es``.

    Raises:
        ValueError: if the configuration is not ``edit_distance != 0`` together
            with ``metaphone == 1``. That is the only combination this cell
            builds a query for; previously it failed later with an unbound
            ``query`` variable.
    """
    # '== 1' instead of 'is 1': identity comparison with an int literal only
    # works by interpreter accident and is a SyntaxWarning on newer Pythons.
    if config['edit_distance'] == 0 or config['metaphone'] != 1:
        raise ValueError(
            "get_matches (fuzzy + metaphone variant) needs a non-zero "
            "config['edit_distance'] and config['metaphone'] == 1")

    df = pd.read_csv(config['cd_filename'])
    census_cols = list(config["output_census_cols"])
    cd_cols = list(config["output_city_directory_cols"])
    count_match, count_unmatch, count_error = 0, 0, 0

    def fuzzy(field, value):
        """Build one fuzzy clause using the configured edit distance."""
        return {"fuzzy": {field: {"value": value,
                                  "fuzziness": config["edit_distance"],
                                  "max_expansions": 50,
                                  "prefix_length": 0,
                                  "transpositions": True,
                                  "rewrite": "constant_score"}}}

    # newline='' is what the csv module asks for on files it writes to.
    with open(config['match_output_filename'],'w', newline='') as fw, open(config['unmatch_output_filename'],'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan,'',regex=True) # convert nan to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            # doublemetaphone may return empty codes; keep only the non-empty ones
            first_name_metaphone = [code for code in doublemetaphone(data["CD_FIRST_NAME"]) if code]
            last_name_metaphone = [code for code in doublemetaphone(data["CD_LAST_NAME"]) if code]

            query = {"bool": {"must": [
                fuzzy("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                fuzzy("CENSUS_LAST_NAME", data["CD_LAST_NAME"]),
                {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}},
                {"match": {"CENSUS_ED": data["CD_ED"]}},
                {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
            ]}}

            try:
                res = es.search(index=config["es-index"], body={ "from": 0, "size": 10000, "query":query})
            except exceptions.RequestError as e:
                # The row is skipped (neither matched nor unmatched), so record it;
                # otherwise the totals silently fail to add up to the input size.
                # NOTE(review): presumably raised when CD_WARD_NUM / CD_ED is empty
                # and is matched against a numeric field - confirm from the log.
                count_error += 1
                logging.warning("Search rejected for CD row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Write the fields as a list: joining on tabs and re-splitting
                    # dropped trailing empty columns and broke values containing tabs.
                    writer.writerow([str(source[col]) for col in census_cols]
                                    + [str(data[col]) for col in cd_cols])
            else:
                fw2.write(str(data['CD_RECORD_ID'])+"\n")
                count_unmatch += 1

    logging.warning("Total city directory matched: "+ str(count_match))
    logging.warning("Total city directory unmatched: "+ str(count_unmatch))
    logging.warning("Total city directory rows rejected by Elasticsearch: "+ str(count_error))

    print(count_match,count_unmatch)

def name_clean(name):
  """Pick the longest word of a space-separated name (first one on ties)."""
  words = name.split(' ')
  # sorted() is stable, also with reverse=True, so equal-length words keep
  # their original order and index 0 is the first longest word.
  by_length = sorted(words, key=len, reverse=True)
  return by_length[0]

def export_data(data, path='/content/matched_data.json'):
    """Write ``data`` to ``path`` as JSON.

    Args:
        data: Any JSON-serializable object.
        path: Output file; the default is the path that was previously
            hard-coded, so ``export_data(data)`` behaves as before.
    """
    # 'with' guarantees the handle is closed and the content flushed, which the
    # original inline open() did not.
    with open(path, 'w') as out_file:
        json.dump(data, out_file)

# Entry point of the cell: runs the edit-distance-2 matching pass when the
# module name is '__main__'.
if __name__=='__main__':
    get_matches()

135676
135759
135765
135767
135936
135937
135961
135974
135995
136000
136006
136012
136020
136031
136044
136068
136087
136136
136197
136279
136293
136404
136451
136456
136531
136537
136574
136581
136643
136663
136749
136784
136795
136836
136856
136872
136976
136996
137008
137033
137096
137157
137213
137230
137454
137459
137491
137495
137541
137543
137585
137590
137622
137635
137654
137662
137672
137707
137777
137810
137811
137814
137819
137822
137828
137905
137937
137970
137981
138002
138034
138122
138127
138154
138172
138206
138225
138269
138277
138316
138393
138409
138477
138581
138582
138584
138643
138659
138717
138867
138909
138923
138946
138988
139009
139016
139047
139071
139120
139122
139164
139260
139265
139350
139366
139369
139410
139429
139435
139438
139452
139464
139504
139568
139583
139616
139625
139631
139642
139699
139730
139755
139756
139762
139776
139778
139785
139848
139921
139926
139944
139962
140001
140037
140054
140105
140138
140198
140221
140243
140267
140306
140317

##### edit dist 0: match on ed, ward no and metaphone - 91k cd records match with 129k census records and 92k unmatched

In [None]:
# edit dist 0: match on ed, ward no and metaphone
# Load the run configuration; `config` is read as a global by get_matches() below.
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)


# Client for the local Elasticsearch node (60 s request timeout, retried on timeout).
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so if an earlier cell in the same kernel configured logging, the
# settings below are ignored.
logging.basicConfig(filename='/content/direct_match.log',
                            filemode='a',
                            format='%(created)f %(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('directMatch')
# The override below is disabled, so the edit distance used by this cell is
# whatever config_1880.json contains.
#config['edit_distance'] = 0

def get_matches():
    """Match every city-directory (CD) record against the census index.

    The query built for each CD row depends on two settings in ``config``:

    * ``edit_distance != 0``: fuzzy first/last name + ward + ED.
    * ``edit_distance == 0`` and ``metaphone == 1``: ward + ED only.
    * ``edit_distance == 0`` otherwise: exact ``match`` on first/last name
      + ward + ED.

    When ``metaphone == 1`` the double-metaphone codes of both names are
    required as well.

    Outputs:
        * ``config['match_output_filename']``: tab-separated; a header line, then
          one line per census hit (census columns followed by CD columns).
        * ``config['unmatch_output_filename']``: the ``CD_RECORD_ID`` of every
          row that produced no hit, one per line.

    Uses the cell-level globals ``config`` and ``es``.
    """
    df = pd.read_csv(config['cd_filename'])
    census_cols = list(config["output_census_cols"])
    cd_cols = list(config["output_city_directory_cols"])
    # Value comparisons instead of 'is 0' / 'is 1': identity on int literals is
    # an interpreter accident, and a single if/else guarantees a query is
    # always built (e.g. for an edit distance of 0.0).
    fuzzy_search = config['edit_distance'] != 0
    use_metaphone = config['metaphone'] == 1
    count_match, count_unmatch, count_error = 0, 0, 0

    def fuzzy(field, value):
        """Build one fuzzy clause using the configured edit distance."""
        return {"fuzzy": {field: {"value": value,
                                  "fuzziness": config["edit_distance"],
                                  "max_expansions": 50,
                                  "prefix_length": 0,
                                  "transpositions": True,
                                  "rewrite": "constant_score"}}}

    # newline='' is what the csv module asks for on files it writes to.
    with open(config['match_output_filename'],'w', newline='') as fw, open(config['unmatch_output_filename'],'w') as fw2:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerow(census_cols + cd_cols)

        for idx, row in df.iterrows():
            row = row.replace(np.nan,'',regex=True) # convert nan to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            # doublemetaphone may return empty codes; keep only the non-empty ones
            first_name_metaphone = [code for code in doublemetaphone(data["CD_FIRST_NAME"]) if code]
            last_name_metaphone = [code for code in doublemetaphone(data["CD_LAST_NAME"]) if code]

            ward_and_ed = [
                {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}},
                {"match": {"CENSUS_ED": data["CD_ED"]}},
            ]
            if fuzzy_search:
                must = [fuzzy("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                        fuzzy("CENSUS_LAST_NAME", data["CD_LAST_NAME"])] + ward_and_ed
            elif use_metaphone:
                must = ward_and_ed
            else:
                must = [{"match": {"CENSUS_FIRST_NAME": data["CD_FIRST_NAME"]}},
                        {"match": {"CENSUS_LAST_NAME": data["CD_LAST_NAME"]}}] + ward_and_ed
            if use_metaphone:
                must = must + [
                    {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                    {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
                ]
            query = {"bool": {"must": must}}

            try:
                res = es.search(index=config["es-index"], body={ "from": 0, "size": 10000, "query":query})
            except exceptions.RequestError as e:
                # The row is skipped (neither matched nor unmatched), so record it;
                # otherwise the totals silently fail to add up to the input size.
                # NOTE(review): presumably raised when CD_WARD_NUM / CD_ED is empty
                # and is matched against a numeric field - confirm from the log.
                count_error += 1
                logging.warning("Search rejected for CD row %s: %s", idx, e)
                print(idx)
                continue

            if res['hits']['total']['value'] != 0:
                count_match += 1
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Write the fields as a list: joining on tabs and re-splitting
                    # dropped trailing empty columns and broke values containing tabs.
                    writer.writerow([str(source[col]) for col in census_cols]
                                    + [str(data[col]) for col in cd_cols])
            else:
                fw2.write(str(data['CD_RECORD_ID'])+"\n")
                count_unmatch += 1

    logging.warning("Total city directory matched: "+ str(count_match))
    logging.warning("Total city directory unmatched: "+ str(count_unmatch))
    logging.warning("Total city directory rows rejected by Elasticsearch: "+ str(count_error))

    print(count_match,count_unmatch)

def name_clean(name):
  """Reduce a name to its longest space-delimited part (earliest part on ties)."""
  parts = name.split(' ')
  best_index = 0
  for position in range(1, len(parts)):
    # only a strictly longer part replaces the current choice
    if len(parts[position]) > len(parts[best_index]):
      best_index = position
  return parts[best_index]

def export_data(data, path='/content/matched_data.json'):
    """Dump ``data`` as JSON into ``path``.

    Args:
        data: Any JSON-serializable object.
        path: File to write. The default reproduces the previously hard-coded
            destination, keeping ``export_data(data)`` backward-compatible.
    """
    # Close the file explicitly via the context manager instead of leaking the
    # handle returned by an inline open().
    with open(path, 'w') as out_file:
        json.dump(data, out_file)

# Entry point of the cell: runs the matching pass when the module name is
# '__main__'.
if __name__=='__main__':
    get_matches()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
9340
9379
9394
9424
9426
9465
9490
9492
9513
9516
9520
9531
9536
9546
9568
9611
9612
9721
9758
9811
9882
9889
9922
9991
10113
10129
10152
10173
10183
10192
10248
10279
10280
10289
10336
10350
10353
10497
10513
10558
10563
10624
10633
10683
10739
10759
10760
10781
10827
10890
10907
10931
10946
11005
11039
11136
11138
11166
11180
11246
11291
11307
11342
11356
11359
11360
11400
11406
11408
11410
11555
11672
11709
11729
11744
11762
11764
11845
11859
11887
11943
11948
11949
11969
12021
12041
12052
12061
12097
12148
12186
12331
12475
12496
12520
12596
12600
12630
12648
12670
12713
12725
12772
12783
12790
12839
12840
13081
13134
13142
13145
13229
13346
13363
13396
13430
13462
13532
13535
13558
13566
13592
13625
13664
13718
13752
13773
13774
13791
13920
14001
14023
14094
14121
14170
14203
14256
14279
14294
14333
14337
14341
14344
14415
14471
14561
14598
14794
14823
14835
15058
15086
15129
15155
15173
15174
15192
15212
15245
15307

### changes to  1880 data ingestion and match process
1. Name cleaning - remove the few special characters identified so far; there may be more that went unrecognized.

2. During matching, multiple criteria are used in sequence:
        first match on all conditions: first name, last name, ward no, ed and metaphones
        
        for remaining unmatched records match with ward no, ed and metaphones only and remove match condition for first name and last name

#### data ingestion

In [None]:
import re
# Client for the local Elasticsearch node (60 s request timeout, retried on timeout).
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers. Earlier cells configure '/content/direct_match.log', so when they
# ran first in the same kernel the ingest timings end up in that file instead
# of bulk_insert.log.
logging.basicConfig(filename="/content/bulk_insert.log",
                            filemode='a',
                            format='%(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('InsertTime')

def ingest(config):
    """Index every row of the census CSV into Elasticsearch in bulk batches.

    Each row is converted to a dict, its first/last name columns are passed
    through ``name_clean`` and, when ``config['metaphone'] == 1``, the
    double-metaphone codes of both names are added as ``METAPHONE_NAMEFIRST``
    and ``METAPHONE_NAMELAST``.

    Args:
        config: Settings dict. Reads ``census_filename``, ``census_first_name``,
            ``census_last_name``, ``metaphone``, ``es-index``, ``ingest_size``
            and, optionally, ``es-id`` (the column used as document ``_id``).

    Returns:
        int: Number of CSV rows processed.

    Uses the cell-level global ``es``.
    """
    df = pd.read_csv(config['census_filename'])
    first_col = config['census_first_name']
    last_col = config['census_last_name']
    # '== 1' instead of 'is 1': identity on an int literal is an interpreter accident.
    use_metaphone = config['metaphone'] == 1
    # The original tested 'id is not False', i.e. the builtin id(), which is
    # always true. Decide on the configured id column instead, so a config
    # without 'es-id' lets Elasticsearch generate the ids.
    id_col = config.get('es-id')

    bulk_data = []
    count = 0
    for itr, row in df.iterrows():
        count += 1
        row = row.replace(np.nan,'',regex=True) # convert nan to empty string
        data = row.to_dict() # dataframe row -> dict, values keep their types

        # NOTE(review): this only fills LOCATION when the CSV already has a
        # LOCATION column; a check on LAT/LONG may have been intended - confirm.
        if 'LOCATION' in data:
            data['LOCATION'] = {"lat":data["LAT"],"lon":data["LONG"]}

        # strip stray backticks from string house numbers
        if 'ADDNUMFROM' in data and isinstance(data['ADDNUMFROM'], str):
            data['ADDNUMFROM'] = data['ADDNUMFROM'].replace('`','')

        data[first_col] = name_clean(data[first_col])
        data[last_col] = name_clean(data[last_col])

        if use_metaphone: # default metaphone is 1
            # doublemetaphone may return empty codes; keep only the non-empty ones
            data['METAPHONE_NAMEFIRST'] = [code for code in doublemetaphone(data[first_col]) if code]
            data['METAPHONE_NAMELAST'] = [code for code in doublemetaphone(data[last_col]) if code]

        meta = {"_index": config['es-index']}
        if id_col:
            meta["_id"] = data[id_col]
        meta["_source"] = data

        bulk_data.append(meta)
        # Flush whenever the row index is a multiple of the batch size (this
        # includes index 0, so the first bulk call carries a single document).
        if itr%config['ingest_size'] == 0:
            helpers.bulk(es, bulk_data)
            bulk_data = []
            print("INSERTING NOW", itr)

    # send whatever is left after the last full batch
    helpers.bulk(es, bulk_data)
    return count

def name_clean(name):
  """Keep the longest space-separated token of ``name`` and strip everything
  that is not an ASCII letter or digit from it.

  The token is chosen before the stripping, so punctuation still counts
  towards its length; the first token wins on ties.
  """
  tokens = name.split(' ')
  longest = tokens[0]
  for token in tokens[1:]:
    if len(token) > len(longest):
      longest = token
  return re.sub('[^A-Za-z0-9]+', '', longest)

# Entry point of the ingestion cell: load the 1880 configuration, index the
# census file and log how long it took under the index name.
if __name__=='__main__':
    # A command-line variant was sketched here and is kept for reference:
    #parser = ArgumentParser()
    #parser.add_argument("-config", help="config file path", default="/content/config_1850.json")
    #args = parser.parse_args()
    #with open(args.config) as json_data_file:
    #  config = json.load(json_data_file)
    with open('/content/config_1880.json') as json_data_file:
        config = json.load(json_data_file)

    st = time.time()
    ingest(config)
    end = time.time()
    logger.warning(config["es-index"] +" "+ str(end-st))
 
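# Mapping used for the census index (kept for reference only: the string below is not sent to Elasticsearch by this cell)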
'''
PUT census
{
    "mappings" : {
      "properties" : {
        "ADDNUM" : {
          "type" : "long"
        },
        "ADDNUMFROM" : {
          "type" : "long"
        },
        "ADDNUMTO" : {
          "type" : "long"
        },
        "ADDR_TYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ADDRESSB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_AGEB" : {
          "type" : "long"
        },
        "CENSUS_BUILDING_I" : {
          "type" : "long"
        },
        "CENSUS_CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ED" : {
          "type" : "long"
        },
        "CENSUS_ENUMDISTB" : {
          "type" : "long"
        },
        "CENSUS_EXTGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_FID" : {
          "type" : "long"
        },
        "CENSUS_MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_MERGEID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMEFRSTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMELASTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NEIGHBOR_1" : {
          "type" : "long"
        },
        "CENSUS_NEIGHBOR_2" : {
          "type" : "long"
        },
        "CENSUS_NPERHHB" : {
          "type" : "long"
        },
        "CENSUS_OCCLABELB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_PAGENUMB" : {
          "type" : "long"
        },
        "CENSUS_RACEB" : {
          "type" : "long"
        },
        "CENSUS_RACENAMEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_REELB" : {
          "type" : "long"
        },
        "CENSUS_RELATEB" : {
          "type" : "long"
        },
        "CENSUS_RELATE_STR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_SEGGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_SEGMENT_ID" : {
          "type" : "long"
        },
        "CENSUS_SERIAL" : {
          "type" : "long"
        },
        "CENSUS_SERIALB" : {
          "type" : "long"
        },
        "CENSUS_SEXB" : {
          "type" : "long"
        },
        "CENSUS_STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREET" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREETB" : {
          "type" : "long"
        },
        "CENSUS_TYPEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_UNITTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_VOLUMEB" : {
          "type" : "long"
        },
        "CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "COUNTY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "LOCATION":{
             "type": "geo_point"
        },
        "LAT" : {
          "type" : "float"
        },
        "LONG" : {
          "type" : "float"
        },
        "MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "METAPHONE_NAMEFIRST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "METAPHONE_NAMELAST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "OBJECTID" : {
          "type" : "long"
        },
        "SIDE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STNAME" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPREDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPRETYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "WARD_NUM" : {
          "type" : "long"
        }
      }
    }
  }
'''

  if self.run_code(code, result):


INSERTING NOW 0
INSERTING NOW 10000
INSERTING NOW 20000
INSERTING NOW 30000
INSERTING NOW 40000
INSERTING NOW 50000
INSERTING NOW 60000
INSERTING NOW 70000
INSERTING NOW 80000
INSERTING NOW 90000
INSERTING NOW 100000
INSERTING NOW 110000
INSERTING NOW 120000
INSERTING NOW 130000
INSERTING NOW 140000
INSERTING NOW 150000
INSERTING NOW 160000
INSERTING NOW 170000
INSERTING NOW 180000
INSERTING NOW 190000
INSERTING NOW 200000
INSERTING NOW 210000
INSERTING NOW 220000
INSERTING NOW 230000
INSERTING NOW 240000
INSERTING NOW 250000
INSERTING NOW 260000
INSERTING NOW 270000
INSERTING NOW 280000
INSERTING NOW 290000
INSERTING NOW 300000
INSERTING NOW 310000
INSERTING NOW 320000
INSERTING NOW 330000
INSERTING NOW 340000
INSERTING NOW 350000
INSERTING NOW 360000
INSERTING NOW 370000
INSERTING NOW 380000
INSERTING NOW 390000
INSERTING NOW 400000
INSERTING NOW 410000
INSERTING NOW 420000
INSERTING NOW 430000
INSERTING NOW 440000
INSERTING NOW 450000
INSERTING NOW 460000
INSERTING NOW 470000
INSERT

'\nPUT census\n{\n    "mappings" : {\n      "properties" : {\n        "ADDNUM" : {\n          "type" : "long"\n        },\n        "ADDNUMFROM" : {\n          "type" : "long"\n        },\n        "ADDNUMTO" : {\n          "type" : "long"\n        },\n        "ADDR_TYPE" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_ADDRESSB" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_AGEB" : {\n          "type" : "long"\n        },\n        "CENSUS_BUILDING_I" : {\n          "type" : "long"\n        },\n        "CENSUS_CITY" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256

#### matching 

In [None]:
# Load the run configuration; `config` is read as a global by match_addr() below.
with open('/content/config_1880.json') as json_data_file:
    config = json.load(json_data_file)

# Client for the local Elasticsearch node (60 s request timeout, retried on timeout).
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Per-criterion ID files: match_addr() opens all four, but only output1
# (all conditions) and output3 (no name match) are written to.
config['match_output1_filename'] = "/content/test_match_allcond.csv"
config['match_output2_filename'] = "/content/test_match_noward.csv"
config['match_output3_filename'] = "/content/test_match_nonamematch.csv"
config['match_output4_filename'] = "/content/test_match_onlymetaphone.csv"
# Header name(s) of the leading column(s) that record which criterion matched.
config['identifier_col'] = ['identifier_col']

def match_addr():
    """Match city-directory (CD) records against the census index in two passes.

    For every CD row a strict query is tried first (fuzzy first/last name and
    ward; plus ED and double-metaphone codes when ``config['metaphone'] == 1``).
    If it returns nothing and metaphone is enabled, a relaxed query without the
    name clauses (ward + ED + metaphone codes) is tried.

    Outputs:
        * ``config['match_output_filename']``: tab-separated; a header line, then
          one line per census hit, prefixed with ``allconditions`` or
          ``nonamematch`` to tell which query matched.
        * ``config['match_output1_filename']`` / ``config['match_output3_filename']``:
          the ``CD_RECORD_ID`` once per hit of the strict / relaxed query.
        * ``config['unmatch_output_filename']``: ``CD_RECORD_ID`` of rows
          without any hit.
        * ``config['match_output2_filename']`` and ``config['match_output4_filename']``
          are created empty (their queries are not run).

    Prints ``rows processed, rows matched, rows unmatched``. Uses the
    cell-level globals ``config`` and ``es``.

    Raises:
        ValueError: if ``config['edit_distance'] == 0``; only fuzzy queries are
            built here, and the function used to fail later on an unbound
            ``query`` variable in that case.
    """
    if config['edit_distance'] == 0:
        raise ValueError("match_addr only builds fuzzy queries; "
                         "config['edit_distance'] must be non-zero")
    # '== 1' instead of 'is 1': identity on an int literal is an interpreter accident.
    use_metaphone = config['metaphone'] == 1

    df = pd.read_csv(config['cd_filename'])
    id_cols = list(config['identifier_col'])
    census_cols = list(config["output_census_cols"])
    cd_cols = list(config["output_city_directory_cols"])
    count, match, unmatch = 0, 0, 0

    def fuzzy(field, value):
        """Build one fuzzy clause using the configured edit distance."""
        return {"fuzzy": {field: {"value": value,
                                  "fuzziness": config["edit_distance"],
                                  "max_expansions": 50,
                                  "prefix_length": 0,
                                  "transpositions": True,
                                  "rewrite": "constant_score"}}}

    def search(must):
        """Run one bool/must query against the configured index."""
        return es.search(index=config["es-index"],
                         body={ "from": 0, "size": 10000, "query": {"bool": {"must": must}}})

    # fw2 / fw4 are opened only so the (unused) output files still get created.
    with open(config['match_output_filename'],'w', newline='') as fw, open(config['match_output1_filename'],'w') as fw1, open(config['match_output2_filename'],'w') as fw2, open(config['match_output3_filename'],'w') as fw3, open(config['match_output4_filename'],'w') as fw4, open(config['unmatch_output_filename'],'w') as fw5:
        writer = csv.writer(fw, delimiter="\t",quotechar='"')
        writer.writerow(id_cols + census_cols + cd_cols)

        for index, row in df.iterrows():
            count += 1  # was never incremented, so the summary always printed 0
            row = row.replace(np.nan,'',regex=True) # convert nan to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            # doublemetaphone may return empty codes; keep only the non-empty ones
            first_name_metaphone = [code for code in doublemetaphone(data["CD_FIRST_NAME"]) if code]
            last_name_metaphone = [code for code in doublemetaphone(data["CD_LAST_NAME"]) if code]

            ward_clause = {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}}
            strict = [fuzzy("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                      fuzzy("CENSUS_LAST_NAME", data["CD_LAST_NAME"]),
                      ward_clause]
            relaxed = None
            if use_metaphone:
                ed_and_metaphone = [
                    {"match": {"CENSUS_ED": data["CD_ED"]}},
                    {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}},
                    {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}},
                ]
                strict = strict + ed_and_metaphone
                relaxed = [ward_clause] + ed_and_metaphone
            # Without metaphone there is no relaxed query; the original crashed on
            # an undefined `query3` in that case.

            try:
                res = search(strict)
                label, id_file = 'allconditions', fw1
                # Only fall back when the strict query found nothing; the second
                # round trip was previously paid for every row.
                if res['hits']['total']['value'] == 0 and relaxed is not None:
                    res = search(relaxed)
                    label, id_file = 'nonamematch', fw3
            except exceptions.RequestError:
                print("Exception at row id: ", index)
                continue

            if res['hits']['total']['value'] == 0:
                fw5.write(str(data['CD_RECORD_ID'])+"\n")
                unmatch += 1
                continue

            match += 1
            for hit in res['hits']['hits']:
                source = hit['_source']
                # Write the fields as a list: joining on tabs and re-splitting
                # dropped trailing empty columns and broke values containing tabs.
                writer.writerow([label] * len(id_cols)
                                + [str(source[col]) for col in census_cols]
                                + [str(data[col]) for col in cd_cols])
                id_file.write(str(data['CD_RECORD_ID'])+"\n")

    print(count,match,unmatch)

def name_clean(name):
  """Return the longest space-separated word of ``name`` with every
  non-alphanumeric (ASCII) character removed.

  The word is selected first and cleaned afterwards; on equal lengths the
  earliest word is kept.
  """
  # sorted() is stable even with reverse=True, so index 0 is the first longest word
  candidate = sorted(name.split(' '), key=len, reverse=True)[0]
  cleaned = re.sub('[^A-Za-z0-9]+', '', candidate)
  return cleaned
  

# Entry point of the cell: runs the two-pass matching when the module name is
# '__main__'.
if __name__=='__main__':
    match_addr()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Exception at row id:  9379
Exception at row id:  9394
Exception at row id:  9424
Exception at row id:  9426
Exception at row id:  9465
Exception at row id:  9490
Exception at row id:  9492
Exception at row id:  9513
Exception at row id:  9516
Exception at row id:  9520
Exception at row id:  9531
Exception at row id:  9536
Exception at row id:  9546
Exception at row id:  9568
Exception at row id:  9611
Exception at row id:  9612
Exception at row id:  9721
Exception at row id:  9758
Exception at row id:  9811
Exception at row id:  9882
Exception at row id:  9889
Exception at row id:  9922
Exception at row id:  9991
Exception at row id:  10113
Exception at row id:  10129
Exception at row id:  10152
Exception at row id:  10173
Exception at row id:  10183
Exception at row id:  10192
Exception at row id:  10248
Exception at row id:  10279
Exception at row id:  10280
Exception at row id:  10289
Exception at row id:  10336
Except

### Remove duplicates and re-save the matched files, then combine them into one consolidated Elasticsearch match file

In [None]:
# find the count of matches in each config file
import pandas as pd

# The ID files hold one CD_RECORD_ID per line. With the default comma separator
# pandas saw two fields on some lines ("expected 1 fields, saw 2" - presumably
# IDs that contain a comma) and error_bad_lines=False silently dropped them, so
# the counts came out too low. A tab separator keeps each line as one field.
# NOTE: error_bad_lines is kept for the pandas version this notebook was run
# with; newer pandas releases spell it on_bad_lines='skip'.
test_match_allcond = pd.read_csv("/content/test_match_allcond.csv", sep="\t", error_bad_lines=False, header = None) # match with all conditions
#test_match_noward = pd.read_csv("/content/test_match_noward.csv", error_bad_lines=False, header = None) # match after removing ward no
test_match_nonamematch = pd.read_csv("/content/test_match_nonamematch.csv", sep="\t", error_bad_lines=False, header = None) # match with ward no and metaphones
#test_match_onlymetaphone = pd.read_csv("/content/test_match_onlymetaphone.csv", error_bad_lines=False, header = None) # match with only metaphones
test_nomatch = pd.read_csv("/content/test-u.csv", sep="\t", error_bad_lines=False, header = None) # non matched cd records
test_match = pd.read_csv("/content/test.csv", error_bad_lines=False, delimiter='\t') # matched information

# remove duplicates (re-assignment instead of inplace=True; same resulting frames)
test_match_allcond = test_match_allcond.drop_duplicates()
#test_match_noward.drop_duplicates(inplace=True)
test_match_nonamematch = test_match_nonamematch.drop_duplicates()
#test_match_onlymetaphone.drop_duplicates(inplace=True)
test_nomatch = test_nomatch.drop_duplicates()
test_match = test_match.drop_duplicates()

print(len(test_match_allcond))
#print(len(test_match_noward)) # no ward
print(len(test_match_nonamematch)) # uses ward no, ed and metaphones
#print(len(test_match_onlymetaphone)) # no ward, first and last names
print(len(test_nomatch))
print(len(test_match))

b'Skipping line 1103: expected 1 fields, saw 2\nSkipping line 1477: expected 1 fields, saw 2\nSkipping line 1660: expected 1 fields, saw 2\nSkipping line 5518: expected 1 fields, saw 2\nSkipping line 17260: expected 1 fields, saw 2\nSkipping line 17954: expected 1 fields, saw 2\nSkipping line 18701: expected 1 fields, saw 2\nSkipping line 19197: expected 1 fields, saw 2\nSkipping line 19227: expected 1 fields, saw 2\nSkipping line 20222: expected 1 fields, saw 2\nSkipping line 20223: expected 1 fields, saw 2\nSkipping line 20224: expected 1 fields, saw 2\nSkipping line 20570: expected 1 fields, saw 2\nSkipping line 21929: expected 1 fields, saw 2\nSkipping line 21930: expected 1 fields, saw 2\nSkipping line 24258: expected 1 fields, saw 2\nSkipping line 26902: expected 1 fields, saw 2\nSkipping line 29027: expected 1 fields, saw 2\nSkipping line 30860: expected 1 fields, saw 2\nSkipping line 34702: expected 1 fields, saw 2\nSkipping line 34703: expected 1 fields, saw 2\nSkipping line 3

81115
11825
90888
120020


In [None]:
# Save all the de-duplicated frames again.
# The three id frames overwrite the files they were read from; the matched table
# goes to a new path (test_matched.csv) and, since to_csv defaults to comma
# separation, is no longer tab-delimited like the test.csv it was loaded from.
# NOTE(review): the frames loaded with header=None carry the integer column label 0,
# which to_csv writes as a header line; re-reading these files with header=None would
# then treat that line as a data row - confirm this is intended.

test_match_allcond.to_csv("/content/test_match_allcond.csv", index= False)
#test_match_noward.to_csv("/content/test_match_noward.csv", index= False)
test_match_nonamematch.to_csv("/content/test_match_nonamematch.csv", index= False)
#test_match_onlymetaphone.to_csv("/content/test_match_onlymetaphone.csv", index= False)
test_nomatch.to_csv("/content/test-u.csv", index= False)
test_match.to_csv("/content/test_matched.csv", index= False)

# 1850 mn elastic search


In [None]:
# Launch Elasticsearch as a background child process with stdout/stderr merged into one pipe.
# preexec_fn switches the child to uid 1 before exec - presumably the `daemon` user that
# the install cell chown'ed the Elasticsearch directory to (verify on the target image).
server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Fixed wait so the server has time to come up before any client call is made.
!sleep 30

### replicate work done by previous RAs

##### data ingestion

In [None]:
# data ingestion

# Client for the local Elasticsearch node; requests use a 60 s timeout and are retried on timeout.
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Append log records to /content/bulk_insert.log. The threshold is WARNING, which is why the
# ingest duration further down is emitted with logger.warning rather than logger.info.
logging.basicConfig(filename="/content/bulk_insert.log",
                            filemode='a',
                            format='%(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('InsertTime')

def ingest(config):
    """Bulk-index the census CSV named in ``config`` into Elasticsearch.

    Each CSV row is normalised (NaN -> '', optional LOCATION geo-point built
    from LAT/LONG, back-ticks removed from ADDNUMFROM, first/last name passed
    through ``name_clean``, optional double-metaphone codes) and sent through
    ``helpers.bulk`` using the module-level ``es`` client.

    Config keys read:
        census_filename, census_first_name, census_last_name, metaphone,
        es-index, ingest_size, and optionally es-id (name of the column whose
        value becomes the document ``_id``; when absent or empty,
        Elasticsearch assigns ids itself).

    Returns:
        int: number of rows read from the CSV.
    """
    df = pd.read_csv(config['census_filename'])
    first_name_col = config['census_first_name']
    last_name_col = config['census_last_name']
    # `==` instead of `is`: identity against an int literal only works through
    # CPython's small-int cache and silently fails for True or 1.0.
    use_metaphone = config['metaphone'] == 1
    # The previous check was `if id is not False`, which tests the builtin `id`
    # and is therefore always true; the intended switch is the optional config key.
    id_col = config.get('es-id')

    bulk_data = []
    count = 0
    for itr, row in df.iterrows():
        count += 1
        row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
        data = row.to_dict()  # row as a plain dict keyed by column name
        if 'LOCATION' in data:
            data['LOCATION'] = {"lat": data["LAT"], "lon": data["LONG"]}

        if 'ADDNUMFROM' in data and type(data['ADDNUMFROM']) is str:
            data['ADDNUMFROM'] = data['ADDNUMFROM'].replace('`', '')

        data[first_name_col] = name_clean(data[first_name_col])
        data[last_name_col] = name_clean(data[last_name_col])

        if use_metaphone:
            # doublemetaphone returns a pair of codes; empty ones are dropped
            data['METAPHONE_NAMEFIRST'] = [i for i in doublemetaphone(data[first_name_col]) if i]
            data['METAPHONE_NAMELAST'] = [i for i in doublemetaphone(data[last_name_col]) if i]

        if id_col:
            meta = {
                "_index": config['es-index'],
                "_id": data[id_col],
                "_source": data
            }
        else:
            meta = {
                "_index": config['es-index'],
                "_source": data
            }

        bulk_data.append(meta)
        # Flush whenever the row index is a multiple of ingest_size; the very
        # first flush (index 0) therefore carries a single document.
        if itr % config['ingest_size'] == 0:
            helpers.bulk(es, bulk_data)
            bulk_data = []
            print("INSERTING NOW", itr)

    # send whatever is left after the last full batch
    helpers.bulk(es, bulk_data)
    return count

def name_clean(name):
  """Return the longest space-separated token of ``name`` (the first one on ties).

  Used to drop middle names / initials before indexing.
  """
  tokens = name.split(' ')
  best = tokens[0]
  for token in tokens[1:]:
    # strict '>' so the earliest token wins a tie, like max(..., key=len)
    if len(token) > len(best):
      best = token
  return best

# Entry point of the ingestion cell: load the JSON config, ingest, log the elapsed seconds.
if __name__ == '__main__':
    # Command-line selection of the config file, disabled while running in the notebook:
    #parser = ArgumentParser()
    #parser.add_argument("-config", help="config file path", default="/content/config_1850.json")
    #args = parser.parse_args()
    #with open(args.config) as json_data_file:
    #  config = json.load(json_data_file)
    with open('/content/config_1850.json') as json_data_file:
        config = json.load(json_data_file)

    st = time.time()
    ingest(config)
    end = time.time()
    # Logged at WARNING level: "<index name> <elapsed seconds>"
    logger.warning(config["es-index"] + " " + str(end - st))
 
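# Mapping used for the index (kept as a bare string for reference only; this cell does not send it to Elasticsearch)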
'''
PUT census
{
    "mappings" : {
      "properties" : {
        "ADDNUM" : {
          "type" : "long"
        },
        "ADDNUMFROM" : {
          "type" : "long"
        },
        "ADDNUMTO" : {
          "type" : "long"
        },
        "ADDR_TYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ADDRESSB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_AGEB" : {
          "type" : "long"
        },
        "CENSUS_BUILDING_I" : {
          "type" : "long"
        },
        "CENSUS_CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ED" : {
          "type" : "long"
        },
        "CENSUS_ENUMDISTB" : {
          "type" : "long"
        },
        "CENSUS_EXTGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_FID" : {
          "type" : "long"
        },
        "CENSUS_MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_MERGEID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMEFRSTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMELASTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NEIGHBOR_1" : {
          "type" : "long"
        },
        "CENSUS_NEIGHBOR_2" : {
          "type" : "long"
        },
        "CENSUS_NPERHHB" : {
          "type" : "long"
        },
        "CENSUS_OCCLABELB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_PAGENUMB" : {
          "type" : "long"
        },
        "CENSUS_RACEB" : {
          "type" : "long"
        },
        "CENSUS_RACENAMEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_REELB" : {
          "type" : "long"
        },
        "CENSUS_RELATEB" : {
          "type" : "long"
        },
        "CENSUS_RELATE_STR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_SEGGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_SEGMENT_ID" : {
          "type" : "long"
        },
        "CENSUS_SERIAL" : {
          "type" : "long"
        },
        "CENSUS_SERIALB" : {
          "type" : "long"
        },
        "CENSUS_SEXB" : {
          "type" : "long"
        },
        "CENSUS_STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREET" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREETB" : {
          "type" : "long"
        },
        "CENSUS_TYPEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_UNITTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_VOLUMEB" : {
          "type" : "long"
        },
        "CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "COUNTY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "LOCATION":{
             "type": "geo_point"
        },
        "LAT" : {
          "type" : "float"
        },
        "LONG" : {
          "type" : "float"
        },
        "MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "METAPHONE_NAMEFIRST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "METAPHONE_NAMELAST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "OBJECTID" : {
          "type" : "long"
        },
        "SIDE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STNAME" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPREDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPRETYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "WARD_NUM" : {
          "type" : "long"
        }
      }
    }
  }
'''

##### matching - 25k cd matched to 60k census records

In [None]:
# Reload the run configuration from disk (any in-memory changes to `config` are discarded).
with open('/content/config_1850.json') as json_data_file:
    config = json.load(json_data_file)

# Client for the local Elasticsearch node; 60 s timeout, retried on timeout.
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Set (or override) the output paths so this replication run writes to its own "_old" files.
config['match_output_filename'] = "/content/test_1850mn_old.csv"
config['unmatch_output_filename'] = "/content/test-u_1850mn_old.csv"

def match_addr():
    """Match every city-directory (CD) row against the census index in Elasticsearch.

    Works on the module-level ``config`` and ``es``. For each CD row a
    ``bool``/``must`` query is built from first name, last name and ward
    number - ``fuzzy`` name clauses when ``config['edit_distance']`` is
    non-zero, plain ``match`` clauses otherwise - plus double-metaphone
    ``terms`` clauses when ``config['metaphone'] == 1``.

    Files written:
        config['match_output_filename']   tab-separated; a header line, then one
                                          row per hit (census columns followed
                                          by the CD columns).
        config['unmatch_output_filename'] one CD_RECORD_ID per line for CD rows
                                          without any hit.

    Rows whose search raises ``exceptions.RequestError`` are reported on stdout
    and skipped. At the end ``rows processed, matched, unmatched`` is printed.
    """
    df = pd.read_csv(config['cd_filename'])
    census_cols = list(config["output_census_cols"])
    cd_cols = list(config["output_city_directory_cols"])
    edit_distance = config['edit_distance']
    # `==` / `!=` instead of `is`: identity tests against int literals left
    # `query` unbound for values such as 0.0 and ignored metaphone=True.
    use_metaphone = config['metaphone'] == 1

    def name_clause(field, value):
        """Build the fuzzy (edit_distance != 0) or exact clause for one name field."""
        if edit_distance != 0:
            return {"fuzzy": {field: {"value": value, "fuzziness": edit_distance,
                                      "max_expansions": 50, "prefix_length": 0,
                                      "transpositions": True, "rewrite": "constant_score"}}}
        return {"match": {field: value}}

    count, match, unmatch = 0, 0, 0
    # newline='' is what the csv module asks for so the writer controls line endings itself
    with open(config['match_output_filename'], 'w', newline='') as fw, \
            open(config['unmatch_output_filename'], 'w') as fw2:
        writer = csv.writer(fw, delimiter="\t", quotechar='"')
        writer.writerow(census_cols + cd_cols)

        for index, row in df.iterrows():
            count += 1  # was never incremented, so the summary always printed 0
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            # doublemetaphone returns a pair of codes; empty ones are dropped
            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            must = [
                name_clause("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"]),
                name_clause("CENSUS_LAST_NAME", data["CD_LAST_NAME"]),
                {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}},
            ]
            if use_metaphone:
                must.append({"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}})
                must.append({"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}})
            query = {"bool": {"must": must}}

            try:
                res = es.search(index=config["es-index"], body={"from": 0, "size": 10000, "query": query})
            except exceptions.RequestError:
                print("Exception at row id: ", index)
                continue

            if res['hits']['total']['value'] != 0:
                match += 1
                for hit in res['hits']['hits']:
                    source = hit['_source']
                    # Hand the fields to the csv writer as a list. The previous
                    # tab-join + rstrip + split dropped trailing empty fields
                    # and mis-split values that contain a tab.
                    writer.writerow([str(source[j]) for j in census_cols]
                                    + [str(data[j]) for j in cd_cols])
            else:
                fw2.write(str(data['CD_RECORD_ID']) + "\n")
                unmatch += 1

    print(count, match, unmatch)

# Run the replication matching pass. Inside a notebook kernel __name__ is
# '__main__', so this executes every time the cell is run.
if __name__=='__main__':
    match_addr()

### 1850 mn changes:
1. Name cleaning - remove the few special characters identified so far. There may be more that went unrecognized.

2. During matching, multiple criteria are used:
    * First, match on all conditions: first name, last name, ward no. and metaphones.
    * For the remaining unmatched records, match on ward no. and metaphones only (the first-name and last-name conditions are removed).
    * For records still unmatched, match after removing only the ward no. condition.
    * For the remaining unmatched records, match on metaphones only, after removing the conditions for ward no., first name and last name.





##### data ingestion

In [None]:
import re
# Client for the local Elasticsearch node; requests use a 60 s timeout and are retried on timeout.
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Append log records to /content/bulk_insert.log. The threshold is WARNING, which is why the
# ingest duration further down is emitted with logger.warning rather than logger.info.
logging.basicConfig(filename="/content/bulk_insert.log",
                            filemode='a',
                            format='%(asctime)s %(name)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.WARNING)
logger = logging.getLogger('InsertTime')

def ingest(config):
    """Bulk-index the census CSV named in ``config`` into Elasticsearch.

    Each CSV row is normalised (NaN -> '', optional LOCATION geo-point built
    from LAT/LONG, back-ticks removed from ADDNUMFROM, first/last name passed
    through ``name_clean``, optional double-metaphone codes) and sent through
    ``helpers.bulk`` using the module-level ``es`` client.

    Config keys read:
        census_filename, census_first_name, census_last_name, metaphone,
        es-index, ingest_size, and optionally es-id (name of the column whose
        value becomes the document ``_id``; when absent or empty,
        Elasticsearch assigns ids itself).

    Returns:
        int: number of rows read from the CSV.
    """
    df = pd.read_csv(config['census_filename'])
    first_name_col = config['census_first_name']
    last_name_col = config['census_last_name']
    # `==` instead of `is`: identity against an int literal only works through
    # CPython's small-int cache and silently fails for True or 1.0.
    use_metaphone = config['metaphone'] == 1
    # The previous check was `if id is not False`, which tests the builtin `id`
    # and is therefore always true; the intended switch is the optional config key.
    id_col = config.get('es-id')

    bulk_data = []
    count = 0
    for itr, row in df.iterrows():
        count += 1
        row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
        data = row.to_dict()  # row as a plain dict keyed by column name
        if 'LOCATION' in data:
            data['LOCATION'] = {"lat": data["LAT"], "lon": data["LONG"]}

        if 'ADDNUMFROM' in data and type(data['ADDNUMFROM']) is str:
            data['ADDNUMFROM'] = data['ADDNUMFROM'].replace('`', '')

        data[first_name_col] = name_clean(data[first_name_col])
        data[last_name_col] = name_clean(data[last_name_col])

        if use_metaphone:
            # doublemetaphone returns a pair of codes; empty ones are dropped
            data['METAPHONE_NAMEFIRST'] = [i for i in doublemetaphone(data[first_name_col]) if i]
            data['METAPHONE_NAMELAST'] = [i for i in doublemetaphone(data[last_name_col]) if i]

        if id_col:
            meta = {
                "_index": config['es-index'],
                "_id": data[id_col],
                "_source": data
            }
        else:
            meta = {
                "_index": config['es-index'],
                "_source": data
            }

        bulk_data.append(meta)
        # Flush whenever the row index is a multiple of ingest_size; the very
        # first flush (index 0) therefore carries a single document.
        if itr % config['ingest_size'] == 0:
            helpers.bulk(es, bulk_data)
            bulk_data = []
            print("INSERTING NOW", itr)

    # send whatever is left after the last full batch
    helpers.bulk(es, bulk_data)
    return count

def name_clean(name):
  """Reduce a name to its longest space-separated token with non-alphanumerics removed.

  The name is split on single spaces and only the longest piece is kept,
  which is how middle names / initials are discarded (the first piece wins
  when several tie for longest). Every character outside A-Z, a-z and 0-9
  is then stripped from that piece.
  """
  pieces = name.split(' ')
  longest = pieces[0]
  for piece in pieces[1:]:
    # strict '>' keeps the earliest piece on a tie, like max(..., key=len)
    if len(piece) > len(longest):
      longest = piece
  return re.sub('[^A-Za-z0-9]+', '', longest)

# Entry point of the ingestion cell: load the JSON config, ingest, log the elapsed seconds.
if __name__ == '__main__':
    # Command-line selection of the config file, disabled while running in the notebook:
    #parser = ArgumentParser()
    #parser.add_argument("-config", help="config file path", default="/content/config_1850.json")
    #args = parser.parse_args()
    #with open(args.config) as json_data_file:
    #  config = json.load(json_data_file)
    with open('/content/config_1850.json') as json_data_file:
        config = json.load(json_data_file)

    st = time.time()
    ingest(config)
    end = time.time()
    # Logged at WARNING level: "<index name> <elapsed seconds>"
    logger.warning(config["es-index"] + " " + str(end - st))
 
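# Mapping used for the index (kept as a bare string for reference only; this cell does not send it to Elasticsearch)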
'''
PUT census
{
    "mappings" : {
      "properties" : {
        "ADDNUM" : {
          "type" : "long"
        },
        "ADDNUMFROM" : {
          "type" : "long"
        },
        "ADDNUMTO" : {
          "type" : "long"
        },
        "ADDR_TYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ADDRESSB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_AGEB" : {
          "type" : "long"
        },
        "CENSUS_BUILDING_I" : {
          "type" : "long"
        },
        "CENSUS_CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_ED" : {
          "type" : "long"
        },
        "CENSUS_ENUMDISTB" : {
          "type" : "long"
        },
        "CENSUS_EXTGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_FID" : {
          "type" : "long"
        },
        "CENSUS_MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_MERGEID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMEFRSTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NAMELASTB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_NEIGHBOR_1" : {
          "type" : "long"
        },
        "CENSUS_NEIGHBOR_2" : {
          "type" : "long"
        },
        "CENSUS_NPERHHB" : {
          "type" : "long"
        },
        "CENSUS_OCCLABELB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_PAGENUMB" : {
          "type" : "long"
        },
        "CENSUS_RACEB" : {
          "type" : "long"
        },
        "CENSUS_RACENAMEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_REELB" : {
          "type" : "long"
        },
        "CENSUS_RELATEB" : {
          "type" : "long"
        },
        "CENSUS_RELATE_STR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_SEGGROUP_I" : {
          "type" : "long"
        },
        "CENSUS_SEGMENT_ID" : {
          "type" : "long"
        },
        "CENSUS_SERIAL" : {
          "type" : "long"
        },
        "CENSUS_SERIALB" : {
          "type" : "long"
        },
        "CENSUS_SEXB" : {
          "type" : "long"
        },
        "CENSUS_STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREET" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_STREETB" : {
          "type" : "long"
        },
        "CENSUS_TYPEB" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_UNITTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CENSUS_VOLUMEB" : {
          "type" : "long"
        },
        "CITY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "COUNTY" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "LOCATION":{
             "type": "geo_point"
        },
        "LAT" : {
          "type" : "float"
        },
        "LONG" : {
          "type" : "float"
        },
        "MATCH_ADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "METAPHONE_NAMEFIRST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "METAPHONE_NAMELAST": {
          "type": "text",
          "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
          }
        },
        "OBJECTID" : {
          "type" : "long"
        },
        "SIDE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STADDR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STATE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STNAME" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPREDIR" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STPRETYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "STTYPE" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "WARD_NUM" : {
          "type" : "long"
        }
      }
    }
  }
'''

INSERTING NOW 0
INSERTING NOW 5000
INSERTING NOW 10000
INSERTING NOW 15000
INSERTING NOW 20000
INSERTING NOW 25000
INSERTING NOW 30000
INSERTING NOW 35000
INSERTING NOW 40000
INSERTING NOW 45000
INSERTING NOW 50000
INSERTING NOW 55000
INSERTING NOW 60000
INSERTING NOW 65000
INSERTING NOW 70000
INSERTING NOW 75000
INSERTING NOW 80000
INSERTING NOW 85000
INSERTING NOW 90000
INSERTING NOW 95000
INSERTING NOW 100000
INSERTING NOW 105000
INSERTING NOW 110000
INSERTING NOW 115000
INSERTING NOW 120000
INSERTING NOW 125000
INSERTING NOW 130000
INSERTING NOW 135000
INSERTING NOW 140000
INSERTING NOW 145000
INSERTING NOW 150000
INSERTING NOW 155000
INSERTING NOW 160000
INSERTING NOW 165000
INSERTING NOW 170000
INSERTING NOW 175000
INSERTING NOW 180000
INSERTING NOW 185000
INSERTING NOW 190000
INSERTING NOW 195000
INSERTING NOW 200000
INSERTING NOW 205000
INSERTING NOW 210000
INSERTING NOW 215000
INSERTING NOW 220000
INSERTING NOW 225000
INSERTING NOW 230000
INSERTING NOW 235000
INSERTING NOW 240

'\nPUT census\n{\n    "mappings" : {\n      "properties" : {\n        "ADDNUM" : {\n          "type" : "long"\n        },\n        "ADDNUMFROM" : {\n          "type" : "long"\n        },\n        "ADDNUMTO" : {\n          "type" : "long"\n        },\n        "ADDR_TYPE" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_ADDRESSB" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256\n            }\n          }\n        },\n        "CENSUS_AGEB" : {\n          "type" : "long"\n        },\n        "CENSUS_BUILDING_I" : {\n          "type" : "long"\n        },\n        "CENSUS_CITY" : {\n          "type" : "text",\n          "fields" : {\n            "keyword" : {\n              "type" : "keyword",\n              "ignore_above" : 256

##### matching

In [None]:
# Reload the run configuration from disk; overrides applied to `config` in earlier cells are lost.
# NOTE(review): match_addr below also opens config['match_output_filename'] and
# config['unmatch_output_filename'], which are not set in this cell - they must come
# from config_1850.json. Verify before running.
with open('/content/config_1850.json') as json_data_file:
    config = json.load(json_data_file)

# Client for the local Elasticsearch node; 60 s timeout, retried on timeout.
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# One id file per matching pass (naming follows the pass that produced the match):
config['match_output1_filename'] = "/content/test_match_allcond.csv"  # all conditions
config['match_output2_filename'] = "/content/test_match_noward.csv"  # ward no removed
config['match_output3_filename'] = "/content/test_match_nonamematch.csv"  # ward no + metaphones, no name match
config['match_output4_filename'] = "/content/test_match_onlymetaphone.csv"  # metaphones only
# One-element list holding the literal header text 'identifier_col'; it is prepended
# to the census and city-directory column names when the output header is built.
config['identifier_col'] = ['identifier_col']

def _fuzzy_clause(field, value):
    """Build an Elasticsearch ``fuzzy`` clause matching `value` against `field`.

    The permitted edit distance is taken from ``config["edit_distance"]``.
    """
    return {"fuzzy": {field: {"value": value,
                              "fuzziness": config["edit_distance"],
                              "max_expansions": 50,
                              "prefix_length": 0,
                              "transpositions": True,
                              "rewrite": "constant_score"}}}


def _build_match_tiers(data, first_name_metaphone, last_name_metaphone):
    """Return the ``(label, query)`` pairs to try for one record, in priority order.

    With ``config['metaphone'] == 1`` four tiers are produced:
    ``allconditions`` (fuzzy names + ward + metaphone), ``nonamematch``
    (ward + metaphone), ``noward`` (fuzzy names + metaphone) and
    ``onlymetaphone``. Otherwise only the fuzzy-names + ward query is
    produced, labelled ``allconditions``.
    """
    fuzzy_first = _fuzzy_clause("CENSUS_FIRST_NAME", data["CD_FIRST_NAME"])
    fuzzy_last = _fuzzy_clause("CENSUS_LAST_NAME", data["CD_LAST_NAME"])
    ward = {"match": {"CENSUS_WARD_NUM": data["CD_WARD_NUM"]}}

    def must(*clauses):
        return {"bool": {"must": list(clauses)}}

    # `== 1`, not `is 1`: identity comparison on ints is implementation-dependent.
    if config['metaphone'] != 1:
        return [("allconditions", must(fuzzy_first, fuzzy_last, ward))]

    metaphone_last = {"terms": {"METAPHONE_NAMELAST.keyword": last_name_metaphone}}
    metaphone_first = {"terms": {"METAPHONE_NAMEFIRST.keyword": first_name_metaphone}}
    return [
        ("allconditions", must(fuzzy_first, fuzzy_last, ward, metaphone_last, metaphone_first)),
        ("nonamematch", must(ward, metaphone_last, metaphone_first)),
        ("noward", must(fuzzy_first, fuzzy_last, metaphone_last, metaphone_first)),
        ("onlymetaphone", must(metaphone_last, metaphone_first)),
    ]


def match_addr():
    """Match every city-directory record against the census index in Elasticsearch.

    Reads the city-directory CSV named by ``config['cd_filename']`` and, for
    each row, tries the query tiers from ``_build_match_tiers`` in order,
    stopping at the first tier that returns hits. For every hit one
    tab-separated row is written to ``config['match_output_filename']``
    (tier label, census columns, city-directory columns) and the record's
    ``CD_RECORD_ID`` is appended to that tier's id file. Records with no hits
    in any tier have their id written to ``config['unmatch_output_filename']``.
    Rows whose search raises ``RequestError`` are reported and skipped.

    Finally prints ``rows processed, rows matched, rows unmatched``.

    Raises:
        ValueError: if ``config['edit_distance']`` is 0, for which no query
            is defined (previously this surfaced as a NameError mid-run).
    """
    if config['edit_distance'] == 0:
        raise ValueError("config['edit_distance'] must be non-zero: no query is defined for 0")

    df = pd.read_csv(config['cd_filename'])
    count, match, unmatch = 0, 0, 0

    # newline='' is what the csv module expects for files handed to csv.writer.
    with open(config['match_output_filename'], 'w', newline='') as fw, \
            open(config['match_output1_filename'], 'w') as fw1, \
            open(config['match_output2_filename'], 'w') as fw2, \
            open(config['match_output3_filename'], 'w') as fw3, \
            open(config['match_output4_filename'], 'w') as fw4, \
            open(config['unmatch_output_filename'], 'w') as fw5:
        writer = csv.writer(fw, delimiter="\t", quotechar='"')
        writer.writerow(config['identifier_col']
                        + config["output_census_cols"]
                        + config["output_city_directory_cols"])

        # Tier label -> file collecting the ids of records matched at that tier.
        id_files = {"allconditions": fw1, "noward": fw2,
                    "nonamematch": fw3, "onlymetaphone": fw4}

        for index, row in df.iterrows():
            count += 1
            row = row.replace(np.nan, '', regex=True)  # convert NaN to empty string
            data = row.to_dict()

            data["CD_FIRST_NAME"] = name_clean(data["CD_FIRST_NAME"])
            data["CD_LAST_NAME"] = name_clean(data["CD_LAST_NAME"])

            # doublemetaphone yields empty strings for missing codes; drop them.
            first_name_metaphone = [i for i in doublemetaphone(data["CD_FIRST_NAME"]) if i]
            last_name_metaphone = [i for i in doublemetaphone(data["CD_LAST_NAME"]) if i]

            matched_label, hits = None, []
            try:
                # Only query looser tiers when the stricter ones found nothing.
                for label, query in _build_match_tiers(data, first_name_metaphone, last_name_metaphone):
                    res = es.search(index=config["es-index"],
                                    body={"from": 0, "size": 10000, "query": query})
                    if res['hits']['total']['value'] != 0:
                        matched_label, hits = label, res['hits']['hits']
                        break
            except exceptions.RequestError:
                print("Exception at row id: ", index)
                continue

            if matched_label is None:
                fw5.write(str(data['CD_RECORD_ID']) + "\n")
                unmatch += 1
                continue

            match += 1
            # One label cell per identifier column, then census and directory cells.
            # Rows are written as lists so empty trailing values keep their column.
            label_cells = [matched_label] * len(config["identifier_col"])
            cd_cells = [str(data[col]) for col in config["output_city_directory_cols"]]
            id_file = id_files[matched_label]
            for hit in hits:
                source = hit['_source']
                census_cells = [str(source[col]) for col in config["output_census_cols"]]
                writer.writerow(label_cells + census_cells + cd_cells)
                id_file.write(str(data['CD_RECORD_ID']) + "\n")

    print(count, match, unmatch)

def name_clean(name):
    """Reduce a name to its longest space-separated token, alphanumerics only.

    The first token wins when several share the maximum length; every
    character outside A-Z, a-z and 0-9 is then stripped from it.
    """
    longest_token = ''
    for token in name.split(' '):
        # Strictly longer, so the earliest of equally long tokens is kept.
        if len(token) > len(longest_token):
            longest_token = token
    return re.sub('[^A-Za-z0-9]+', '', longest_token)
  

# Entry point: run the full matching pass when this code is executed as the
# main module.
if __name__ == "__main__":
    match_addr()

0 42961 14786


### Remove duplicates, check the number of rows for each match condition, and re-save the files

In [None]:
# find the count of matches in each config file
import pandas as pd
# Load each output of the matching step and drop duplicate rows right away.
# The per-condition files and the unmatched file are read without a header row.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines`; confirm the installed version before changing it.

# matched under all conditions
test_match_allcond = pd.read_csv(
    "/content/test_match_allcond.csv", error_bad_lines=False, header=None
).drop_duplicates()

# matched once the ward number condition was removed
test_match_noward = pd.read_csv(
    "/content/test_match_noward.csv", error_bad_lines=False, header=None
).drop_duplicates()

# matched on ward number and metaphones
test_match_nonamematch = pd.read_csv(
    "/content/test_match_nonamematch.csv", error_bad_lines=False, header=None
).drop_duplicates()

# matched on metaphones only
test_match_onlymetaphone = pd.read_csv(
    "/content/test_match_onlymetaphone.csv", error_bad_lines=False, header=None
).drop_duplicates()

# city-directory records with no match
test_nomatch = pd.read_csv(
    "/content/test-u.csv", error_bad_lines=False, header=None
).drop_duplicates()

# tab-separated matched rows, read with the file's own header
test_match = pd.read_csv(
    "/content/test.csv", error_bad_lines=False, delimiter='\t'
).drop_duplicates()

In [None]:
# Row counts after de-duplication, printed one per line in this order.
for frame in (
    test_match_allcond,        # all conditions
    test_match_noward,         # ward number condition removed
    test_match_nonamematch,    # ward number and metaphones
    test_match_onlymetaphone,  # metaphones only
    test_nomatch,              # unmatched city-directory records
    test_match,                # matched rows
):
    print(len(frame))

25492
6402
7152
3833
14737
135302


In [None]:
# Write the de-duplicated frames back to disk without the index column.
# All files are overwritten in place except the matched rows, which go to a
# new file (test_matched.csv).
# NOTE(review): to_csv writes a header row by default, so the files that were
# read with header=None come back with an extra header line; re-running the
# read cell on them would presumably count that line as data. Confirm what
# downstream readers expect before passing header=False.
for frame, path in (
    (test_match_allcond, "/content/test_match_allcond.csv"),
    (test_match_noward, "/content/test_match_noward.csv"),
    (test_match_nonamematch, "/content/test_match_nonamematch.csv"),
    (test_match_onlymetaphone, "/content/test_match_onlymetaphone.csv"),
    (test_nomatch, "/content/test-u.csv"),
    (test_match, "/content/test_matched.csv"),
):
    frame.to_csv(path, index=False)