# **Load Data into ES - Test-1**

In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np


In [1]:
# ------------------------- Create an ES Client -------------------------
from elasticsearch import Elasticsearch
es_client = Elasticsearch(
    "localhost:9200",
    http_auth=["elastic", "changeme"], 
) 
# ------------------------- Create an ES Index Client -------------------------
from elasticsearch.client import IndicesClient
es_index_client = IndicesClient(es_client)
type(es_index_client)

elasticsearch.client.indices.IndicesClient

In [5]:
# ------------------------- Define the Settings & Mappings - Ver.1 -------------------------
configurations = {
  "settings": {
      "index": {
          "number_of_replicas": 1},
      "analysis": {
          "filter": {
              "ngram_filter": {
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 50}
          },
          "analyzer": {
              "ngram_analyzer": {
                  "type": "custom",
                  "tokenizer": "standard",
                  "filter": [
                    "lowercase",
                    "ngram_filter"]
              }
          }
      }
  },
  "mappings": {
        "properties": {
            "id": {
                "type": "long"},
            "label": {
                "type": "long"},
            "company": {
                "type": "text"},
            "industry": {
                "type": "text"},
            "country": {
                "type": "text"},
            "content": {
                "type": "nested",
                "properties": {
                    "page": {
                        "type": "long"},
                    "priority": {
                        "type": "float"},
                    "sentence": {
                        "type": "text",
                        "analyzer": "standard",
                        "fields": {
                            "keyword": {
                                "type": "keyword"},
                            "ngrams": {
                                "type": "text",
                                "analyzer": "ngram_analyzer"}
                        }
                    }
                }
            }
        }
    }
}

# ------------------------- Create an ES Index -------------------------
es_index_client.create(index="esg_report_1", body=configurations)

  es_index_client.create(index="esg_report_1", body=configurations)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'esg_report_1'}

In [None]:
# Reset Index with new configurations
# in Kibana:    DELETE esg_report_1 

In [144]:
# List all trimmed CSV files from the Crawler
pos_file_list = os.listdir("Crawler & Processing/2.Develop - Crawler Folder/preprocessed/trimmed/pos")
pos_file_list.sort()
print("Total number of positive CSV files:", len(pos_file_list))
print("Type of pos_file_list items:", type(pos_file_list[0]))
# print("Show the pos_file_list:", pos_file_list)

neg_file_list = os.listdir("Crawler & Processing/2.Develop - Crawler Folder/preprocessed/trimmed/neg")
neg_file_list.sort()
print("Total number of negative CSV files:", len(neg_file_list))
print("Type of pos_file_list items:", type(neg_file_list[0]))
# print("Show the pos_file_list:", neg_file_list)

Total number of positive CSV files: 196
Type of pos_file_list items: <class 'str'>
Total number of negative CSV files: 177
Type of pos_file_list items: <class 'str'>


In [135]:
####### For Positive reports only
column_names = ["id", "label", "company", "industry", "country", "date"]

df = pd.DataFrame(columns = column_names)

for file in file_list:
    
    # Get the file name without ".csv"
    pdf_ID = int(file.split(".")[0])
    # Positive label defined as 1
    pos_label = 1
    # Get the company info
    matched_row = com_details[com_details["Unique ID"]==pdf_ID].iloc[0] # Find matched row and get the first row
    company_name = matched_row["Issuer - subsidiary"]
    industry_name = matched_row["Issuer industry"]
    country_name = matched_row["Country of risk"]
    date_str = str(matched_row["Date"]).split(" ")[0]
    
    df = df.append({"id": pdf_ID, 
                    "label": pos_label, 
                    "company": company_name, 
                    "industry": industry_name, 
                    "country": country_name,
                    "date": date_str}, ignore_index=True)

df


Unnamed: 0,id,label,company,industry,country,date
0,11,1,Aurubis AG,Materials,Germany,2020-06-10
1,8,1,Maire Tecnimont SpA,Industrials,Italy,2019-12-04
2,9,1,Faber-Castell AG,Consumer discretionary,Germany,2020-02-12


In [None]:
for fileName in file_list:
    print(fileName)


In [35]:
columns = ["page", "priority", "content"]
index_name = "esg_report_1"

for fileName in file_list:
    action_list = []  
    
    action = {"index": {"_index": index_name, "_id": int(row["id"])}}  
    
    with open("Crawler & Processing/2.Develop - Crawler Folder/preprocessed/trimmed/pos_test/" + fileName, "r") as f:
        csv_file = csv.DictReader(f, fieldnames=columns, delimiter=",", quotechar='"') 
        
        next(csv_file)  

        
        
        for row in csv_file:
            
            doc = {
                    "id": int(row["id"]), 
                    "name": row["name"],
                    "price": float(row["price"]),
                    "brand": row["brand"],
                    "attributes": [
                                    {"attribute_name": "cpu", "attribute_value": row["cpu"]},
                                    {"attribute_name": "memory", "attribute_value": row["memory"]},
                                    {"attribute_name": "storage", "attribute_value": row["storage"],},
                                    ],
                    }
            action_list.append(json.dumps(action))  
            action_list.append(json.dumps(doc)) 
            print(row["page"])
    
    
    
    

page
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
15
15
16
16
16
16
16
16
16
16
page
20
20
20
20
20
20
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
23
23
23
23
23
23
23
23
23
23
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
25
26
26
26
26
26
26
26
26
26
26
26
26
26
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
page
23
23
23
23
23
23
23
23
23
23
23
23
23
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
24
2

In [None]:
for i in numFiles:
    file = open(os.path.join(pathName, i), "rU")
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        for column in row:
            print(column)
            if column=="SPECIFIC VALUE":

In [None]:
# ------------------------- Read Data and Bulk Process Docs -------------------------
colunms = ["id", "name", "price", "brand", "cpu", "memory", "storage"]
index_name = "laptops-demo"

with open("csv_files/laptops_demo.csv", "r") as f:
    reader = csv.DictReader(f, fieldnames=colunms, delimiter=",", quotechar='"') 
    
    next(reader)  

    action_list = []  
    
    for row in reader:
        action = {"index": {"_index": index_name, "_id": int(row["id"])}}  
        doc = {
                "id": int(row["id"]), 
                "name": row["name"],
                "price": float(row["price"]),
                "brand": row["brand"],
                "attributes": [
                                {"attribute_name": "cpu", "attribute_value": row["cpu"]},
                                {"attribute_name": "memory", "attribute_value": row["memory"]},
                                {"attribute_name": "storage", "attribute_value": row["storage"],},
                                ],
                }
        action_list.append(json.dumps(action))  
        action_list.append(json.dumps(doc)) 