# **Load Data into ES - 3**
3rd Try

In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np
from PyPDF2 import PdfFileReader

In [2]:
# ------------------------- Create an ES Client -------------------------
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

# Client for the Elasticsearch node at localhost:9200, authenticated with basic-auth credentials.
es_client = Elasticsearch("localhost:9200", http_auth=["elastic", "changeme"])

# ---------------------- Create an ES Index Client ----------------------
# Index-level operations (such as creating an index) go through this wrapper around the client.
es_index_client = IndicesClient(es_client)
type(es_index_client)  # inspection leftover; produces no output in the middle of a cell


# ------------------- Define the Settings & Mappings --------------------
def _searchable_text_mapping():
    """Return the mapping used by both the "text" and the "keywords" fields.

    The field is analyzed with the standard analyzer and additionally exposes an
    exact-match "keyword" sub-field and an "ngrams" sub-field analyzed with
    "ngram_analyzer".
    """
    return {
        "type": "text",
        "analyzer": "standard",
        "fields": {
            "keyword": {"type": "keyword"},
            "ngrams": {"type": "text", "analyzer": "ngram_analyzer"},
        },
    }


# "ngram_analyzer": standard tokenizer -> lowercase -> edge n-grams of 2 to 50 characters.
index_settings = {
    "index": {"number_of_replicas": 1},
    "analysis": {
        "filter": {
            "ngram_filter": {"type": "edge_ngram", "min_gram": 2, "max_gram": 50},
        },
        "analyzer": {
            "ngram_analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["lowercase", "ngram_filter"],
            },
        },
    },
}

# One document per PDF page; the field order matches the documents built by the bulk-load cell.
index_mappings = {
    "properties": {
        "id": {"type": "text"},
        "label": {"type": "long"},
        "filename": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword"}},
        },
        "page": {"type": "long"},
        "text": _searchable_text_mapping(),
        "keywords": _searchable_text_mapping(),
    },
}

configurations = {"settings": index_settings, "mappings": index_mappings}

# ------------------------- Create an ES Index -------------------------
# NOTE(review): the bulk-load cell writes to "esg_report_2_by_page", not to the index created
# here -- confirm which index name is intended.
es_index_client.create(
    index="esg_report_3_by_page",
    settings=configurations["settings"],
    mappings=configurations["mappings"],
)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'esg_report_2_by_page'}

In [16]:
# ------------------------- Bulk Load PDF pages into ES as individual Docs (pos only for testing) -------------------------
# Every page of every PDF found in `pos_PDF_path` becomes one ES document whose id is
# "<pdf id>.<page number>", with 1-based page numbers. Files that cannot be processed are
# reported and skipped so that one bad file does not abort the whole load.
# NOTE(review): the index-creation cell creates "esg_report_3_by_page", but this cell loads into
# "esg_report_2_by_page" -- confirm which index name is intended.
index_name = "esg_report_2_by_page"
pos_PDF_path = "Crawler & Processing/2.Develop - Crawler Folder/pos_reports_test/"
pos_PDF_file_list = os.listdir(pos_PDF_path)

action_list = []  # alternating bulk "action" line / "document" line, each a JSON string

for fileName in pos_PDF_file_list:
    pdf_file_path = os.path.join(pos_PDF_path, fileName)
    if not os.path.isfile(pdf_file_path):  ##### only regular files are processed (skips sub-directories)
        continue

    try:
        pdf_ID = int(fileName.split(".")[0])  ##### Get PDF id before ".pdf"
    except ValueError:
        print(f"Skipping {fileName!r}: file name does not start with a numeric PDF id")
        continue

    try:
        with open(pdf_file_path, 'rb') as f:
            pdf = PdfFileReader(f)   ##### f is class: '_io.BufferedReader'
            # getPage() is 0-based, so start at 0 to include the first page;
            # the stored page number / document id stay 1-based.
            for page_index in range(pdf.getNumPages()):
                page = pdf.getPage(page_index)   ##### Returns a PageObject instance.
                text = page.extractText()   ##### Text in the order of the drawing commands in the content stream.
                pn = page_index + 1
                doc_id_str = f"{pdf_ID}.{pn}"

                action = {"index": {"_index": index_name, "_id": doc_id_str}}
                doc = {
                    "id": doc_id_str,
                    "label": 1,
                    "filename": fileName,
                    "page": pn,
                    "text": text,
                    'keywords': text.split(" "),
                }
                action_list.append(json.dumps(action))
                action_list.append(json.dumps(doc))
    except Exception as exc:
        # Best effort: report the unreadable/corrupt PDF and carry on with the next file.
        # Pages already extracted from this file stay in `action_list`.
        print(f"Skipping {fileName!r}: {type(exc).__name__}: {exc}")
        continue

# ------------------------- Feed "action_list" into a JSON File -------------------------
# The bulk format is newline-delimited JSON and every line, including the last one,
# must be terminated by "\n".
bulk_body = "".join(line + "\n" for line in action_list)
with open("esg_report_2_by_page.json", "w") as write_file:
    write_file.write(bulk_body)
# ------------------------- Feed the JSON File to ES - Bulk Upload!!! -------------------------
if action_list:
    bulk_response = es_client.bulk(body=bulk_body)
    if bulk_response["errors"]:
        print("WARNING: at least one bulk item failed -- inspect bulk_response['items']")
else:
    # Nothing was extracted; an empty bulk request would only produce an error.
    bulk_response = None
    print("No PDF pages were extracted; nothing to upload.")
bulk_response



{'took': 261,
 'errors': False,
 'items': [{'index': {'_index': 'esg_report_2_by_page',
    '_type': '_doc',
    '_id': '9.1',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'esg_report_2_by_page',
    '_type': '_doc',
    '_id': '9.2',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'esg_report_2_by_page',
    '_type': '_doc',
    '_id': '9.3',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'esg_report_2_by_page',
    '_type': '_doc',
    '_id': '9.4',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no':