In [3]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

import pandas as pd
import json
import re
import datetime

In [4]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [128]:
def JSON_converter(header_lst, val_lst):
    output = {}
    for header, val in zip(header_lst, val_lst):
        output[header] = val
     
    #return output
    return json.dumps(output, indent = 2)

def JSON_generator(input_df):
    header_lst = input_df.columns.tolist()
    header_lst.remove('contract_id')
    header_lst.remove('file_name')
    for idx, row_val in input_df.iterrows():
        file_name = row_val[-1]
        file_path = 'data/Txt files/'+file_name
        contract_id = row_val[0]
        JSON_data = row_val[1:-1].tolist()
        try:
            with open(file_path, 'r' , encoding='utf-8') as f:
                text_data = f.read().replace('\n', '').replace(u'\xa0',' ')
            header_lst.append('text_content') 
            JSON_data.append(text_data)
            
            yield contract_id, JSON_converter(header_lst, JSON_data)
            
        except:
            print('cannot process txt_file {}'.format(file_name))

def process_dataSource(input_df):
    
    output_df = input_df.copy()
    output_df['Contract Duration (Days)'] = output_df['Contract Duration (Days)'].fillna('0')
    output_df['Contract Duration (Days)'] = output_df['Contract Duration (Days)'].apply(lambda x: int(re.sub('[^0-9]', '', x)))
    output_df['contract_id'] = output_df['contract_id'].astype(int)
    
    output_df['Effective Date'] = pd.to_datetime(output_df['Effective Date'], utc=True)
    output_df['Expiration Date'] = pd.to_datetime(output_df['Expiration Date'], utc=True)
    output_df['Effective Date'].fillna(datetime.datetime(2100,1,1), inplace = True)
    output_df['Expiration Date'].fillna(datetime.datetime(2100,1,1), inplace = True)
    #output_df['Effective Date'] = output_df['Effective Date'].strftime('%Y-%m-%d')
    #output_df['Expiration Date'] = output_df['Expiration Date'].strftime('%Y-%m-%d')                                                                      
    output_df.fillna('', inplace = True)
    
    return output_df

ContractMetaData = pd.read_csv('data/300_metadata.csv')
processed_data = process_dataSource(ContractMetaData)
contract_g = JSON_generator(processed_data)

In [52]:
#text_template = {
#                "type": "text",
#                "analyzer": "standard",
#                "fields": {
#                    "keyword": {"type": "keyword"},
#                    "ngrams": {"type": "text", "analyzer": "ngram_analyzer"},
#                }}

In [108]:
text_template = {
                "type": "text",
                "analyzer": "standard",
                }
    
date_template = {"type": "date"}
    
numeric_template = {"type": "integer"}

def ESupdate_mapping(es, index_name, header_lst, remove_lst):
    for property_ in header_lst:
        if property_ in remove_lst:
            continue
            
        update_mapping = {}
        property_json = {}
        if property_ == 'Contract Duration (Days)':
            property_json[property_] = numeric_template
        elif property_ in ['Effective Date', 'Expiration Date']:
            property_json[property_] = date_template
        else:
            property_json[property_] = text_template
            
        update_mapping["properties"] = property_json
        updates = json.dumps(update_mapping, indent = 2)
        es.indices.put_mapping(updates, index_name)

index_name = "test-index2"
es.index(index=index_name , document={'author': 'lucas'})
test_lst = processed_data.columns.tolist() + ['text_content']
remove_lst = ['contract_id', 'file_name']
ESupdate_mapping(es, index_name , test_lst, remove_lst)

  es.indices.put_mapping(updates, index_name)


In [154]:
test = 'data/Txt files/21697_ADVISORY_AGREEMENT (3).doc_20210524_143234.txt'

with open(test, 'r' , encoding='utf-8') as f:
    text_data = f.read().replace('\n', '').replace(u'\xa0',' ')

In [127]:
for id_, data in contract_g:
    res = es.index(index=index_name, id = id_, body=data)

  res = es.index(index=index_name, id = id_, body=data)


In [129]:
es.search(index=index_name, query={"match_all": {}})

{'took': 4171,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 301, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'test-index2',
    '_type': '_doc',
    '_id': 'uRSHkH0B75hjgPSjKM7m',
    '_score': 1.0,
    '_source': {'author': 'lucas'}},
   {'_index': 'test-index2',
    '_type': '_doc',
    '_id': '8730',
    '_score': 1.0,
    '_ignored': ['text_content.keyword'],
    '_source': {'Alternate Dispute Resolution Terms': 'NO',
     'Confidentiality Terms': 'YES',
     'Contract Duration (Days)': 0,
     'Data Incident Response For Data Breaches Term': 'NO',
     'Effective Date': '2019-11-08T00:00:00+00:00',
     'Expiration Date': '2100-01-01T00:00:00',
     'Governing Law': 'the United States of America',
     'Indemnity Terms': 'YES',
     'Jurisdiction': 'Texas',
     'Limitation Of Liability Terms': 'NO',
     'Non Disclosure Terms': 'YES',
     'Non Solicit Terms': 'NO',
     'Party 1 Address':

In [147]:
es.indices.get_mapping(index_name)

  es.indices.get_mapping(index_name)


{'test-index2': {'mappings': {'properties': {'Alternate Dispute Resolution Terms': {'type': 'text',
     'analyzer': 'standard'},
    'Confidentiality Terms': {'type': 'text', 'analyzer': 'standard'},
    'Contract Duration (Days)': {'type': 'integer'},
    'Data Incident Response For Data Breaches Term': {'type': 'text',
     'analyzer': 'standard'},
    'Effective Date': {'type': 'date'},
    'Expiration Date': {'type': 'date'},
    'Governing Law': {'type': 'text', 'analyzer': 'standard'},
    'Indemnity Terms': {'type': 'text', 'analyzer': 'standard'},
    'Jurisdiction': {'type': 'text', 'analyzer': 'standard'},
    'Limitation Of Liability Terms': {'type': 'text', 'analyzer': 'standard'},
    'Non Disclosure Terms': {'type': 'text', 'analyzer': 'standard'},
    'Non Solicit Terms': {'type': 'text', 'analyzer': 'standard'},
    'Party 1 Address': {'type': 'text', 'analyzer': 'standard'},
    'Party 1 Name': {'type': 'text', 'analyzer': 'standard'},
    'Party 1 Notice Address': {'

In [35]:
help(es.indices)

Help on IndicesClient in module elasticsearch.client.indices object:

class IndicesClient(elasticsearch.client.utils.NamespacedClient)
 |  IndicesClient(client)
 |  
 |  Method resolution order:
 |      IndicesClient
 |      elasticsearch.client.utils.NamespacedClient
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  add_block(self, index, block, params=None, headers=None)
 |      Adds a block to an index.
 |      
 |      `<https://www.elastic.co/guide/en/elasticsearch/reference/7.15/index-modules-blocks.html>`_
 |      
 |      :arg index: A comma separated list of indices to add a block to
 |      :arg block: The block to add (one of read, write, read_only or
 |          metadata)
 |      :arg allow_no_indices: Whether to ignore if a wildcard indices
 |          expression resolves into no concrete indices. (This includes `_all`
 |          string or when no indices have been specified)
 |      :arg expand_wildcards: Whether to expand wildcard expression to
 |         

In [148]:
res = es.search(index="test-index2", query={"match": {'text_content': 'Clark R. MooreName: Clark R'}})

In [150]:
res = es.search(index="test-index2", query={"range": {"Expiration Date": {"gte": "2020-01-01T00:00:00","lte": "now"}}})

In [151]:
search_res = res['hits']['hits']
print(len(search_res))

for item in search_res:
    print("id: {}, score: {}".format(item['_id'], item['_score']))
    print(item['_source']['Expiration Date'])

6
id: 8734, score: 1.0
2021-10-01T00:00:00+00:00
id: 8748, score: 1.0
2021-03-31T00:00:00+00:00
id: 8789, score: 1.0
2020-12-31T00:00:00+00:00
id: 8798, score: 1.0
2020-12-31T00:00:00+00:00
id: 8807, score: 1.0
2020-12-31T00:00:00+00:00
id: 8820, score: 1.0
2020-12-31T00:00:00+00:00


In [139]:
es.get(index="test-index2", id=8730)

{'_index': 'test-index2',
 '_type': '_doc',
 '_id': '8730',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 1,
 '_ignored': ['text_content.keyword'],
 'found': True,
 '_source': {'Alternate Dispute Resolution Terms': 'NO',
  'Confidentiality Terms': 'YES',
  'Contract Duration (Days)': 0,
  'Data Incident Response For Data Breaches Term': 'NO',
  'Effective Date': '2019-11-08T00:00:00+00:00',
  'Expiration Date': '2100-01-01T00:00:00',
  'Governing Law': 'the United States of America',
  'Indemnity Terms': 'YES',
  'Jurisdiction': 'Texas',
  'Limitation Of Liability Terms': 'NO',
  'Non Disclosure Terms': 'YES',
  'Non Solicit Terms': 'NO',
  'Party 1 Address': 'business at 575 N. Dairy Ashford, Energy Center II, Suite 210, Houston, Texas 77079',
  'Party 1 Name': 'PEDEVCO Corp.',
  'Party 1 Notice Address': 'and state courts located in Texas',
  'Party 1 Notice Country': '',
  'Party 1 Notice Email': '',
  'Party 1 Notice State': '',
  'Party 1 Notice Zip': '',
  'Party 1 State': 'Te

In [145]:
es.transport.close()