In [57]:
import json
from es_connection import EsManagement
import os
import time

import warnings
warnings.filterwarnings('ignore')


In [58]:
# import codecs
# BLOCKSIZE = 1048576 # or some other, desired size in bytes
# with codecs.open('data/addresses_geocode_results.csv', "r") as sourceFile:
#     with codecs.open('data/addresses_geocode_results2.csv', "w", "utf-8") as targetFile:
#         while True:
#             contents = sourceFile.read(BLOCKSIZE)
#             if not contents:
#                 break
#             targetFile.write(contents)

In [59]:
with open('elasticsearch-config/address_mapping.json', encoding='utf-8') as f:
    address_mapping = json.load(f)

index_name = "temp"

es_connection = EsManagement()
es_connection.clear_index(index_name=index_name)
es_connection.create_index(index_name=index_name, mapping=address_mapping)

{'acknowledged': True}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'temp'}


In [4]:
# print(
#   json.dumps(
#     es_connection.es.indices.get_mapping(index=index_name), 
#     indent=1)
# )

<h1> INDEXING </h1>

In [60]:
from elasticsearch import helpers, Elasticsearch
import csv

es = Elasticsearch()

def generateBulkPayload(csv_reader):
    for row in csv_reader:
        # handle your parsing here
 
        # overwriting the `created` attribute
        row.update(dict(coordinates=eval(row.get('coordinates'))))
#         row.update(dict(address=str(row.get('address').encode('utf-8'))))

        yield row

with open('data/addresses.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index=index_name)

<h1> SEARCH </h1>

In [61]:
result = es.search(index=index_name, body={"query":{"match_all":{}}})

In [62]:
result

{'took': 7,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'temp',
    '_type': '_doc',
    '_id': 'QuIsx3wBA11KtBU71-BM',
    '_score': 1.0,
    '_source': {'address': 'تهران،خ. استاد حسن بنا شمالی، بعد از چهارراه پرتوی، خ. جانسپار.کوچه ی دیدار.',
     'building_no': '26',
     'unit': '4'}},
   {'_index': 'temp',
    '_type': '_doc',
    '_id': 'Q-Isx3wBA11KtBU71-BM',
    '_score': 1.0,
    '_source': {'address': 'تهران،مجیدیه شمالی، خ. منصوری، بن. بیدهندی',
     'building_no': '10',
     'unit': '5'}},
   {'_index': 'temp',
    '_type': '_doc',
    '_id': 'ROIsx3wBA11KtBU71-BM',
    '_score': 1.0,
    '_source': {'address': 'تهران،مجیدیه، خ. عراقی، خ. شاهنده',
     'building_no': '24',
     'unit': '2'}},
   {'_index': 'temp',
    '_type': '_doc',
    '_id': 'ReIsx3wBA11KtBU71-BM',
    '_score': 1.0,
    '_source': {'address': 'تهران،کرمان، خ.

In [63]:
def parse_search_output(s_output):
    hits = s_output["hits"]
    result = []
    if hits["total"]["value"] > 0:
        hits = hits["hits"]
        for hit in hits:
            result.append(hit["_source"])
    return result

In [75]:
def customized_search(address_query='', building_no_query='', unit_query='', address_weight=1, building_no_weight=1, unit_weight=1):
    global es_connection, index_name
    
    es = es_connection.es
    
#     query = {
#         "size": 10, 
#         "query": {
#             "bool": {
#                 "should":[
#                      {"wildcard": {
#                         "address": {
#                             "value": '*' + address_query + '*',
#                             "boost": address_weight,
#                             "rewrite": "constant_score"}}},
                    
                    
#                     {"match": {
#                         "building_no": {
#                             "query": building_no_query,
#                             "boost" : building_no_weight}}},
#                     {"match": {"unit": {
#                         "query" : unit_query,
#                         "boost" : unit_weight}}}
#                 ]
#             }
#         },
#         "collapse": {
#             "field": "address.keyword"
#         }
#     }
    
#     query = {
#         "size": 10, 
#         "query": {
#             "bool": {
#                 "should":[
#                     {"multi_match": {
#                         "query": address_query,
#                         "type": "bool_prefix",
#                         "fields": ["address",
#                                    "address._2gram",
#                                    "address._3gram"],
#                         "boost" : address_weight}},
#                     {"match": {
#                         "building_no": {
#                             "query": building_no_query,
#                             "boost" : building_no_weight}}},
#                     {"match": {"unit": {
#                         "query" : unit_query,
#                         "boost" : unit_weight}}}
#                 ]
#             }
#         },
#         "collapse": {
#             "field": "address.keyword"
#         }
#     }

#     "match_bool_prefix"
    query = {
        "size": 10,
        "query": {
            "bool": {
                "should":[
                    {"match": {
                        "address": {
                        "query": address_query,
                        "fuzziness": "AUTO",
                        "boost" : address_weight}}},
                    {"match_phrase_prefix": {
                        "address": {
                        "query": address_query}}}
    #                     {"multi_match": {
    #                         "query": address_query,
    #                         "type": "bool_prefix",
    #                         "fields": ["address",
    #                                    "address._2gram",
    #                                    "address._3gram"],
    #                         "boost" : address_weight}},
    #                     {"match": {
    #                         "building_no": {
    #                             "query": building_no_query,
    #                             "boost" : building_no_weight}}},
    #                     {"match": {"unit": {
    #                         "query" : unit_query,
    #                         "boost" : unit_weight}}}
                ]
            }
        },
        "collapse": {
            "field": "address.keyword"
        }
    }

    
#     query = {
#         "size": 10, 
#         "query": {
#             "bool": {
#                 "must": [
#                         {
#                             "match_phrase_prefix": {
#                                 "address": {
#                                     "query": address_query
#                                 }
#                             }
#                         }]
#             }
            
#         },
#         "collapse": {
#             "field": "address.keyword"
#         }
#     }
    result = es.search(index=index_name, body=query)
    return result

In [77]:
result = customized_search("ستاری بالاتر مجتمع کوروش کوچه اسدی ")
print(result)

{'took': 46, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': None, 'hits': [{'_index': 'temp', '_type': '_doc', '_id': 'HuIsx3wBA11KtBU7vDD4', '_score': 114.489685, '_source': {'address': 'ستاری بالاتر مجتمع کوروش کوچه اسدی', 'building_no': '24', 'unit': '2'}, 'fields': {'address.keyword': ['ستاری بالاتر مجتمع کوروش کوچه اسدی']}}, {'_index': 'temp', '_type': '_doc', '_id': 'WuMsx3wBA11KtBU77Ztw', '_score': 15.438201, '_source': {'address': 'خیابان شریعتی،بالاتر از چهارراه شهید قدوسی،کوچه کوروش', 'building_no': '25', 'unit': 'همکف'}, 'fields': {'address.keyword': ['خیابان شریعتی،بالاتر از چهارراه شهید قدوسی،کوچه کوروش']}}, {'_index': 'temp', '_type': '_doc', '_id': 'Y-Isx3wBA11KtBU7xV9a', '_score': 15.175352, '_source': {'address': 'تهران،خ. ولیعصر، بالتر از میدان ونک ، بالاتر از میرداماد، بلوار هرمز ستاری', 'building_no': '77', 'unit': '3'}, 'fields': {'address.keyword': ['تهران

In [66]:
def neighbor_search(coordinates):
    global es_connection, index_name
    
    es = es_connection.es

    query = {
        "size": 5,
        "query": {
            "elastiknn_nearest_neighbors": { 
                "field": "coordinates",
                "vec": {                            
                "values": coordinates,
                },
                "model": "exact",                   
                "similarity": "l2",
            }
        }
    }
    
    result = es.search(index=index_name, body=query)
    return result

In [67]:
neighbor_search([35.7385784, 51.3136545])

{'took': 38,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 100, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'temp',
    '_type': '_doc',
    '_id': 'ftz4anwBI6QRxh8O9yl-',
    '_score': 1.0,
    '_source': {'address': 'ستاری بالاتر مجتمع کوروش کوچه اسدی',
     'building_no': '24',
     'unit': '2',
     'coordinates': [35.7385784, 51.3136545]}},
   {'_index': 'temp',
    '_type': '_doc',
    '_id': 'o9z4anwBI6QRxh8O9ymA',
    '_score': 0.9881734,
    '_source': {'address': 'تهران،راه آهن، خ. وحدت اسلامی، بن. حسین زاده',
     'building_no': '36',
     'unit': '8',
     'coordinates': [35.7475301, 51.3057145]}},
   {'_index': 'temp',
    '_type': '_doc',
    '_id': 'odz4anwBI6QRxh8O9ymA',
    '_score': 0.98571646,
    '_source': {'address': 'تهران،آریا شهر، اباذر، خ. پیامبر غربی، خ. سلمان فارسی',
     'building_no': '2',
     'unit': '12',
     'coordinates': [35.7365732, 51.3280063]}},
   {'_index

In [59]:
import math

def calculate_distance_between_two_latlong_coordinates(coordinate1, coordinate2):
    lat1, lon1 = coordinate1
    lat2, lon2 = coordinate2

    R = 6373.0

    lat1 = math.radians(lat1)

    lon1 = math.radians(lon1)

    lat2 = math.radians(lat2)

    lon2 = math.radians(lon2)


    dlon = lon2 - lon1

    dlat = lat2 - lat1


    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2


    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c


    print(distance)

In [61]:
calculate_distance_between_two_latlong_coordinates([35.7365732, 51.3280063], [35.7475301, 51.3057145])

2.352765768118365


In [50]:
pow((35.7385784 - 35.7475301),2)

8.013293288995043e-05

In [52]:
pow((8.013293288995043e-05+6.304359999996566e-05), 1/2)

0.011965639677422852

In [51]:
pow((51.3136545 - 51.3057145),2)

6.304359999996566e-05

In [13]:
result = customized_search(" ستاری")
print(result)

{'took': 55, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}


In [None]:
def geocode(address):
    matched_addresses = customized_search(address)
    nighborrs = []
    for addr in matched_addresses:
        

In [19]:
import pandas as pd

read_file = pd.read_csv (r'data\addresses_geocode_results.csv')

In [20]:
read_file

Unnamed: 0,address,building_no,unit,coordinates
0,ستاری بالاتر مجتمع کوروش کوچه اسدی,24,2,"[35.7385784, 51.3136545]"
1,میدان کاج به طرف سرو غربی میدان قیصر امین پور ...,39,10,"[35.7822366, 51.37480799999999]"
2,پونک/بلوار میرزابابایی/خیابان ایران زمین شمالی...,30,3,"[35.7604901, 51.3324125]"
3,تهران،فرودگاه مهرآباد,7,8,"[35.68998819999999, 51.311241]"
4,تهران،خ. باهنر، نرسیده به سه راه یاسر، خ. صادق...,2,3,"[35.8196097, 51.4507191]"
...,...,...,...,...
95,تهران،استاد معین، بل استاد معین، نرسیده به خ. ...,32,4,"[35.6934182, 51.3448207]"
96,تهران،کارون شمالی، خ. کارون، خ. بوستان سعدی,24,3,"[35.6922333, 51.36879099999999]"
97,تهران،قیام، خ. ابمنگول، خ. قاسم گلی، خ. اخوان ...,31,4,"[35.6717418, 51.441097]"
98,تهران،سرو آزاد,5,8,"[35.7362417, 51.1912221]"


In [27]:
es_connection.es.get(index=index_name, id="apb6Y3wBWJ96Qa2i00Yk")

{'_index': 'temp',
 '_type': '_doc',
 '_id': 'apb6Y3wBWJ96Qa2i00Yk',
 '_version': 1,
 '_seq_no': 3,
 '_primary_term': 1,
 'found': True,
 '_source': {'address': 'تهران،فرودگاه مهرآباد',
  'building_no': '7',
  'unit': '8',
  'coordinates': [35.68998819999999, 51.311241]}}

In [52]:
start_time = time.time()
es_connection.load_csv_into_index(index_name=index_name, 
                             path=os.path.join("data", "addresses_geocode_results.csv"), generateBulkPayload(reader))
print("Elapsed time:", time.time() - start_time)

SyntaxError: positional argument follows keyword argument (<ipython-input-52-21b6bb4d6ff2>, line 3)