# **Load Data into ES - 3**
3rd Try - Load entire page's text as one doc, and test retrieving results from ES search results

In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np
from PyPDF2 import PdfFileReader

In [2]:
# ------------------------- Create an ES Client -------------------------
from elasticsearch import Elasticsearch
es_client = Elasticsearch(
    "localhost:9200",
    http_auth=["elastic", "changeme"]) 

# ---------------------- Create an ES Index Client ----------------------
from elasticsearch.client import IndicesClient
es_index_client = IndicesClient(es_client)
type(es_index_client)

# ------------------- Define the Settings & Mappings --------------------
configurations = {
    "settings": {
        "index": {
            "number_of_replicas": 1},
        "analysis": {
            "filter": {
                "ngram_filter": {
                  "type": "edge_ngram",
                  "min_gram": 2,
                  "max_gram": 50}
            },
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                      "lowercase",
                      "ngram_filter"]
                }  
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {
                "type": "text"},
            "label": {
                "type": "long"},
            "filename": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"}
                }
            },
            "page": {
                "type": "long"},
            "text": {
                "type": "text",
                "analyzer": "standard",
                "fields": {
                    "keyword": {
                        "type": "keyword"},
                    "ngrams": {
                        "type": "text",
                        "analyzer": "ngram_analyzer"}
                }
            }
        }
    }
}
# ------------------------- Create an ES Index -------------------------
es_index_client.create(index="esg_report_3_by_page", settings=configurations["settings"], mappings=configurations["mappings"])

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'esg_report_3_by_page'}

In [None]:
# ------------------------- Bulk Load PDF pages into ES as individual Docs (pos only for testing) -------------------------
index_name = "esg_report_3_by_page"
pos_PDF_path = "Crawler & Processing/2.Develop - Crawler Folder/pos_reports_test/"
pos_PDF_file_list = os.listdir("Crawler & Processing/2.Develop - Crawler Folder/pos_reports_test")

action_list = []  

for fileName in pos_PDF_file_list:
    try:
        if os.path.isfile(os.path.join(pos_PDF_path, fileName)): ##### checking if it is a file, if yes, extract text and map info into JSON
            
            pdf_ID = int(fileName.split(".")[0]) ##### Get PDF id before ".csv"
            
            with open(pos_PDF_path+fileName, 'rb') as f:
                pdf = PdfFileReader(f)   ##### f is class: '_io.BufferedReader'
                for pn in range(1, pdf.getNumPages()): ##### Get all pages of a PDF
                    page = pdf.getPage(pn)   ###### Retrieves a page by number from this PDF file. Returns: a PageObject instance.
                    text = page.extractText()   ##### Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. 
                    doc_id_str = str(pdf_ID)+"."+str(pn)
                    
                    action = {"index": {"_index": index_name, "_id": doc_id_str}}
                    doc = { 
                        "id": doc_id_str,
                        "label": 1,
                        "filename": fileName,
                        "page": pn,
                        "text": text,
                        # 'keywords': text.split(" "),
                    }   
                    action_list.append(json.dumps(action))  
                    action_list.append(json.dumps(doc)) 
        
    except:
        continue
    
# ------------------------- Feed "action_list" into a JSON File -------------------------
with open("esg_report_3_by_page.json", "w") as write_file:
    write_file.write("\n".join(action_list))
# ------------------------- Feed the JSON File to ES - Bulk Upload!!! -------------------------
es_client.bulk(body="\n".join(action_list))

In [4]:
search_query = {
    "query": {
    "terms": {
      "id": [8.2]
    }
  }
}
result = es_client.search(index="esg_report_3_by_page", body=search_query)

# print(json.dumps(result, indent=1))

##### Check the data types of all items in "results":
print('---"Result" is a {}, including items:'.format(type(result)))
for key in result.keys():
    print('"{}" is of type:'.format(key), type(result[key]))
print('------------ End of Level-1 ------------\n')

print('----- "Result["_shards"]" is a {}, including items:'.format(type(result["_shards"])))
for key in result["_shards"].keys():
    print('"{}" is of type:'.format(key), type(result["_shards"][key]))

print('----- "Result["hits"]" is a {}, including items:'.format(type(result["hits"])))
for key in result["hits"].keys():
    print('"{}" is of type:'.format(key), type(result["hits"][key]))
print('------------ End of Level-2 ------------\n')

print('----- "Result["hits"]["total"]" is a {}, including items:'.format(type(result["hits"]["total"])))
for key in result["hits"]["total"].keys():
    print('"{}" is of type:'.format(key), type(result["hits"]["total"][key]))
    
print('----- "Result["hits"]["hits"]" is a {}, including items:'.format(type(result["hits"]["hits"])))
# print(json.dumps(result["hits"]["hits"], indent=1))
print('This list has {} {}, which is the main body of the data, as seen below:'.format(len(result["hits"]["hits"]), 
                                                                                             type(result["hits"]["hits"][0])))
for l in result["hits"]["hits"]:
    print(json.dumps(l, indent=1), "------------ End of list")
print('------------ End of Level-3 ------------')





---"Result" is a <class 'dict'>, including items:
"took" is of type: <class 'int'>
"timed_out" is of type: <class 'bool'>
"_shards" is of type: <class 'dict'>
"hits" is of type: <class 'dict'>
------------ End of Level-1 ------------

----- "Result["_shards"]" is a <class 'dict'>, including items:
"total" is of type: <class 'int'>
"successful" is of type: <class 'int'>
"skipped" is of type: <class 'int'>
"failed" is of type: <class 'int'>
----- "Result["hits"]" is a <class 'dict'>, including items:
"total" is of type: <class 'dict'>
"max_score" is of type: <class 'float'>
"hits" is of type: <class 'list'>
------------ End of Level-2 ------------

----- "Result["hits"]["total"]" is a <class 'dict'>, including items:
"value" is of type: <class 'int'>
"relation" is of type: <class 'str'>
----- "Result["hits"]["hits"]" is a <class 'list'>, including items:
This list has 1 <class 'dict'>, which is the main body of the data, as seen below:
{
 "_index": "esg_report_3_by_page",
 "_type": "_doc

## **ES Search Results: Data Type & Accessibility in Python**

* **Result:**   (Dict)
    * "took"   (int)
    * "timed_out"   (Bool)
    * **"_shards":**   (Dict)
        * "total"   (int)
        * "successful"   (int)
        * "skipped"   (int)
        * "failed"   (int)
    * **"hits":**   (Dict)
        * "total"   (Dict)
            * "value"   ( int)
            * "relation"   (str)
        * "max_score"   (float)
        * **"hits":**   (list of 1 Dict )
            * "_index": "esg_report_3_by_page"
            * "_type": "_doc"
            * "_id": "8.2"
            * "_score": 1.0
            * **"_source":** { (Dict)
                * "id": "8.2"
                * "label": 1
                * "filename": "8.pdf"
                * "page": 2
                * "text": "........"
                * "keywords": ["...", "...", ...]

</br>
* 【"Pretty" print of ES results】: """" j = json.dumps  (result, indent=1) """

In [126]:
search_query = {
    "query": {
    "terms": {
      "id": [8.2]
    }
  }
}
result = es_client.search(index="esg_report_3_by_page", body=search_query)



In [142]:
print(result["hits"], "\n")
print(result["hits"]["hits"], "\n")
#result["hits"]["hits"] is a list of 1 Dict, there must be a "[0]" to index the item out
res_data = result["hits"]["hits"][0]["_source"]
print(res_data, "\n")
print(type(res_data), "\n")

print(list(res_data.keys()))
print(list(res_data.values()))

df = pd.DataFrame(res_data, index=[0])
df.head()

{'total': {'value': 1, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'esg_report_3_by_page', '_type': '_doc', '_id': '8.2', '_score': 1.0, '_source': {'id': '8.2', 'label': 1, 'filename': '8.pdf', 'page': 2, 'text': 'TALKING ABOUT SUSTAINABILITY \nMEANS FOR US TALKING \n ABOUT OUR VALUES.\n  BEING RESPECTFUL, \n BEING RESPONSIBLE,\n CREATING VALUE.\n'}}]} 

[{'_index': 'esg_report_3_by_page', '_type': '_doc', '_id': '8.2', '_score': 1.0, '_source': {'id': '8.2', 'label': 1, 'filename': '8.pdf', 'page': 2, 'text': 'TALKING ABOUT SUSTAINABILITY \nMEANS FOR US TALKING \n ABOUT OUR VALUES.\n  BEING RESPECTFUL, \n BEING RESPONSIBLE,\n CREATING VALUE.\n'}}] 

{'id': '8.2', 'label': 1, 'filename': '8.pdf', 'page': 2, 'text': 'TALKING ABOUT SUSTAINABILITY \nMEANS FOR US TALKING \n ABOUT OUR VALUES.\n  BEING RESPECTFUL, \n BEING RESPONSIBLE,\n CREATING VALUE.\n'} 

<class 'dict'> 

['id', 'label', 'filename', 'page', 'text']
['8.2', 1, '8.pdf', 2, 'TALKING ABOUT SUSTAINABILITY \nMEAN

Unnamed: 0,id,label,filename,page,text
0,8.2,1,8.pdf,2,TALKING ABOUT SUSTAINABILITY \nMEANS FOR US TA...


In [143]:
search_query = {
    "query": {
    "terms": {
      "id": [8.1, 8.2, 8.3, 9.1, 9.2, 9.3, 11.1, 11.2, 11.3]
    }
  }
}
result = es_client.search(index="esg_report_3_by_page", body=search_query)



In [162]:

#result["hits"]["hits"] is a list of 9 Dict, needs to use "for" loop
res_data = result["hits"]["hits"] 
# print(type(res_data), "\n")
# print(json.dumps(res_data, indent=1), "\n")

# print(list(res_data[0].keys()))
df = pd.DataFrame(columns = list(res_data[0].keys()))

for i in res_data:
    # print(type(i), i, "\n")
    df = pd.DataFrame(i, index=[0])

df

['_index', '_type', '_id', '_score', '_source']


Unnamed: 0,_index,_type,_id,_score,_source
