# **Load Data into ES - 1**
1st try to bulk load data into ElasticSearch Index: using manually labeled and trimmed CSV files with each row being a nested field in the "content" property

In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np


In [2]:
# ------------------------- Create an ES Client -------------------------
from elasticsearch import Elasticsearch
es_client = Elasticsearch(
    "localhost:9200",
    http_auth=["elastic", "changeme"], 
) 
# ---------------------- Create an ES Index Client ----------------------
from elasticsearch.client import IndicesClient
es_index_client = IndicesClient(es_client)
type(es_index_client)

elasticsearch.client.indices.IndicesClient

In [29]:
# ------------------- Define the Settings & Mappings --------------------
configurations = {
    "settings": {
        "index": {
            "number_of_replicas": 1},
        "analysis": {
            "filter": {
                "ngram_filter": {
                  "type": "edge_ngram",
                  "min_gram": 2,
                  "max_gram": 50}
            },
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                      "lowercase",
                      "ngram_filter"]
                }  
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {
                "type": "long"},
            "label": {
                "type": "long"},
            "company": {
                "type": "text"},
            "industry": {
                "type": "text"},
            "country": {
                "type": "text"},
            "date": {
                "type": "text"},
            "content": {
                "type": "nested",
                "properties": {
                    "page": {
                        "type": "long"},
                    "priority": {
                        "type": "float"},
                    "words": {
                        "type": "text",
                        "analyzer": "standard",
                        "fields": {
                            "keyword": {
                                "type": "keyword"},
                            "ngrams": {
                                "type": "text",
                                "analyzer": "ngram_analyzer"}
                        }
                    }
                }
            }
        }
    }
}
# ------------------------- Create an ES Index -------------------------

es_index_client.create(index="esg_report_1", settings=configurations["settings"], mappings=configurations["mappings"])

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'esg_report_1'}

In [None]:
# ------------------- DELETE Existing Index --------------------
# in Kibana:    DELETE esg_report_1 

In [13]:
# ------------------- List all raw CSV files Crawled and Processed using ING scripts --------------------
# List all Positive CSVs
pos_CSV_list = os.listdir("Crawler & Processing/2.Develop - Crawler Folder/pos_csv_test")
pos_CSV_list.sort()
print("Total number of positive CSV files:", len(pos_CSV_list))
print("Type of pos_file_list items:", type(pos_CSV_list[0]))
print(pos_CSV_list)
pos_CSV_list.pop(0)
pos_CSV_list.pop(1)
print(pos_CSV_list)
# print("Show the pos_CSV_list:", pos_CSV_list)

# List all Negative CSVs
    # neg_CSV_list = os.listdir("Crawler & Processing/2.Develop - Crawler Folder/neg_csv")
    # neg_CSV_list.sort()
    # print("Total number of negative CSV files:", len(neg_CSV_list))
    # print("Type of pos_file_list items:", type(neg_CSV_list[0]))
# print("Show the neg_CSV_list:", neg_CSV_list)

Total number of positive CSV files: 5
Type of pos_file_list items: <class 'str'>
['.DS_Store', '11.csv', '3-row version', '8.csv', '9.csv']
['11.csv', '8.csv', '9.csv']


In [14]:
# ------------------- Append Company-specific Data --------------------
# Load companies.xlsx for company info
com_details = pd.read_excel("Crawler & Processing/2.Develop - Crawler Folder/companies.xlsx")

# Display df info
com_details.head()

Unnamed: 0,Unique ID,Issuer - subsidiary,Issuer industry,Country of risk,Date
0,3,Cagayan Electric Power & Light Co Inc,Energy,Philippines,2001-05-30
1,16478,Nanjing Financial City Construction & Developm...,Financials,China,2021-04-30
2,16481,Suzhou Tech City Development Group Co Ltd,Financials,China,2021-04-30
3,16479,Landesbank Baden-Wuerttemberg,Financials,Germany,2021-04-30
4,16480,City of Lunds Sweden,Government,Sweden,2021-04-30


In [30]:
# ------------------------- Create 3 Docs for Positive CSVs -------------------------
index_name = "esg_report_1"
action_list = []  

for fileName in pos_CSV_list:
    pdf_ID = int(fileName.split(".")[0])
    matched_row = com_details[com_details["Unique ID"]==pdf_ID].iloc[0] 
    
    action = {"index": {"_index": index_name, "_id": pdf_ID}}  
    doc = { 
        "id": pdf_ID, 
        "label": 1,
        "company": matched_row["Issuer - subsidiary"],
        "industry": matched_row["Issuer industry"],
        "country": matched_row["Country of risk"],
        "date": str(matched_row["Date"]).split("-")[0],
        "content":[]
    }
    action_list.append(json.dumps(action))  
    action_list.append(json.dumps(doc)) 

# ------------------------- Feed the JSON File to ES - Bulk Upload!!! -------------------------
with open("esg_report_1.json", "w") as write_file:
    write_file.write("\n".join(action_list))
    
es_client.bulk(body="\n".join(action_list))



{'took': 2,
 'errors': False,
 'items': [{'index': {'_index': 'esg_report_1',
    '_type': '_doc',
    '_id': '11',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'esg_report_1',
    '_type': '_doc',
    '_id': '8',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'esg_report_1',
    '_type': '_doc',
    '_id': '9',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}}]}

In [18]:
with open("Crawler & Processing/2.Develop - Crawler Folder/pos_csv_test/9.csv", "r") as f:
    csv_file = csv.DictReader(f, fieldnames=columns, delimiter=",", quotechar='"')
    
    next(csv_file)
    
    for row in csv_file:
        print(row["page"])

0
1
1


In [31]:
columns = ["page", "priority", "content"]
            
for fileName in pos_CSV_list:
    pdf_ID = int(fileName.split(".")[0])

    with open("Crawler & Processing/2.Develop - Crawler Folder/pos_csv_test/" + fileName, "r") as f:
        csv_file = csv.DictReader(f, fieldnames=columns, delimiter=",", quotechar='"')
        
        next(csv_file)

        for row in csv_file:                 #################【invalid literal for int() with base 10: ''】
            update_query = {
                "script": {
                    "source": "ctx._source.content.add(params.content)",
                    "params": {
                        "content": {
                            "page" : int(row["page"]),
                            "priority" : float(row["priority"]),
                            "words" : row["content"]
                        }
                    }
                } 
            }
            
            es_client.update(index="esg_report_1", id=pdf_ID, body=update_query)  
    
    # df = pd.read_csv("Crawler & Processing/2.Develop - Crawler Folder/pos_csv_test/" + fileName)
    # df.dropna(axis=0, inplace=True)
    # display(df.shape)

    # for i in df.index:
    #     print(df.iloc[i, 0])
    #     update_query = {
    #         "script": {
    #             "source": "ctx._source.content.add(params.content)",
    #             "params": {
    #                 "content": {
    #                     "page" : int(df.iloc[i, 0]),
    #                     "priority" : float(df.iloc[i, 1]),
    #                     "words" : df.iloc[i, 2]
    #                 }
    #             }
    #         } 
    #     }
        # es_client.update(index="esg_report_1", id=pdf_ID, body=update_query)   


  es_client.update(index="esg_report_1", id=pdf_ID, body=update_query)
