In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

import pandas as pd
import json
import re
import datetime

In [2]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [3]:
def JSON_converter(header_lst, val_lst):
    output = {}
    for header, val in zip(header_lst, val_lst):
        output[header] = val
     
    #return output
    return json.dumps(output, indent = 2)

def JSON_generator(input_df):
    header_lst = input_df.columns.tolist()
    header_lst.remove('contract_id')
    header_lst.remove('file_name')
    for idx, row_val in input_df.iterrows():
        file_name = row_val[-1]
        file_path = 'data/Txt files/'+file_name
        contract_id = row_val[0]
        JSON_data = row_val[1:-1].tolist()
        try:
            with open(file_path, 'r' , encoding='utf-8') as f:
                text_data = f.read().replace('\n', '').replace(u'\xa0',' ')
            header_lst.append('text_content') 
            JSON_data.append(text_data)
            
            yield contract_id, JSON_converter(header_lst, JSON_data)
            
        except:
            print('cannot process txt_file {}'.format(file_name))

def process_dataSource(input_df):
    
    output_df = input_df.copy()
    output_df['Contract Duration (Days)'] = output_df['Contract Duration (Days)'].fillna('0')
    output_df['Contract Duration (Days)'] = output_df['Contract Duration (Days)'].apply(lambda x: int(re.sub('[^0-9]', '', x)))
    output_df['contract_id'] = output_df['contract_id'].astype(int)
    
    output_df['Effective Date'] = pd.to_datetime(output_df['Effective Date'], utc=True)
    output_df['Expiration Date'] = pd.to_datetime(output_df['Expiration Date'], utc=True)
    output_df['Effective Date'].fillna(datetime.datetime(2100,1,1), inplace = True)
    output_df['Expiration Date'].fillna(datetime.datetime(2100,1,1), inplace = True)
    #output_df['Effective Date'] = output_df['Effective Date'].strftime('%Y-%m-%d')
    #output_df['Expiration Date'] = output_df['Expiration Date'].strftime('%Y-%m-%d')                                                                      
    output_df.fillna('', inplace = True)
    
    return output_df

ContractMetaData = pd.read_csv('data/300_metadata.csv')
processed_data = process_dataSource(ContractMetaData)
contract_g = JSON_generator(processed_data)

In [9]:
ContractMetaData.head(2)

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [7]:
processed_data.head(2)

Unnamed: 0,contract_id,Alternate Dispute Resolution Terms,Confidentiality Terms,Contract Duration (Days),Data Incident Response For Data Breaches Term,Effective Date,Expiration Date,Governing Law,Indemnity Terms,Jurisdiction,...,Signer 1,Signer 1 Title,Signer 2,Signer 2 Title,Termination Days,Termination Notice,Termination Rights Terms,Waiver Of Jury Trial Terms,Warranty Terms,file_name
0,8730,NO,YES,0,NO,2019-11-08 00:00:00+00:00,2100-01-01 00:00:00,the United States of America,YES,Texas,...,Clark R. Moore,EVP,Viktor Tkachev,General Counsel,1015.0,effective fifteen (15) days after Advisor s,YES,NO,NO,21697_ADVISORY_AGREEMENT (3).doc_20210524_1432...
1,8731,NO,YES,0,NO,2100-01-01 00:00:00,2100-01-01 00:00:00,Delaware,NO,Delaware,...,Timothy AGRIUM,CEO,Timothy B. Cabeootesti,,30.0,"end of a written 30 day notice/cure period, if",YES,NO,YES,21698_Agrium Nutrien Master Subscription Servi...


In [None]:
#text_template = {
#                "type": "text",
#                "analyzer": "standard",
#                "fields": {
#                    "keyword": {"type": "keyword"},
#                    "ngrams": {"type": "text", "analyzer": "ngram_analyzer"},
#                }}

In [None]:
text_template = {
                "type": "text",
                "analyzer": "standard",
                }
    
date_template = {"type": "date"}
    
numeric_template = {"type": "integer"}

def ESupdate_mapping(es, index_name, header_lst, remove_lst):
    for property_ in header_lst:
        if property_ in remove_lst:
            continue
            
        update_mapping = {}
        property_json = {}
        if property_ == 'Contract Duration (Days)':
            property_json[property_] = numeric_template
        elif property_ in ['Effective Date', 'Expiration Date']:
            property_json[property_] = date_template
        else:
            property_json[property_] = text_template
            
        update_mapping["properties"] = property_json
        updates = json.dumps(update_mapping, indent = 2)
        es.indices.put_mapping(updates, index_name)

index_name = "test-index2"
es.index(index=index_name , document={'author': 'lucas'})
test_lst = processed_data.columns.tolist() + ['text_content']
remove_lst = ['contract_id', 'file_name']
ESupdate_mapping(es, index_name , test_lst, remove_lst)

In [None]:
test = 'data/Txt files/21697_ADVISORY_AGREEMENT (3).doc_20210524_143234.txt'

with open(test, 'r' , encoding='utf-8') as f:
    text_data = f.read().replace('\n', '').replace(u'\xa0',' ')

In [None]:
for id_, data in contract_g:
    #print(id_,data)
    res = es.index(index=index_name, id = id_, body=data)

In [None]:
res

In [None]:
es.search(index=index_name, query={"match_all": {}})

In [None]:
es.indices.get_mapping(index_name)

In [None]:
help(es.indices)

In [None]:
res = es.search(index="test-index2", query={"match": {'text_content': 'Clark R. MooreName: Clark R'}})

In [None]:
res = es.search(index="test-index2", query={"range": {"Expiration Date": {"gte": "2020-01-01T00:00:00","lte": "now"}}})

In [None]:
search_res = res['hits']['hits']
print(len(search_res))

for item in search_res:
    print("id: {}, score: {}".format(item['_id'], item['_score']))
    print(item['_source']['Expiration Date'])

In [None]:
es.get(index="test-index2", id=8730)

In [None]:
es.transport.close()