# **4-Analyze & Query**
4th try: semantic search over ESG report pages, using Elasticsearch and Universal Sentence Encoder embeddings.


In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np
from PyPDF2 import PdfFileReader
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

In [2]:
# ------------------------- Create an ES Client -------------------------
# Credentials are read from the environment when set. The fallbacks are the
# values this cell used before, so an unconfigured local run behaves the same.
es_client = Elasticsearch(
    "localhost:9200",
    http_auth=[
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD", "changeme"),
    ],
)

# ---------------------- Create an ES Index Client ----------------------
es_index_client = IndicesClient(es_client)

# ------------------- Define the Settings & Mappings --------------------
configurations = {
    "settings": {
        "index": {
            "number_of_replicas": 1},
        "analysis": {
            "filter": {
                # Edge n-grams of 2 to 50 characters.
                "ngram_filter": {
                  "type": "edge_ngram",
                  "min_gram": 2,
                  "max_gram": 50}
            },
            "analyzer": {
                # Standard tokenizer, then lowercase, then the edge n-gram filter above.
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                      "lowercase",
                      "ngram_filter"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {
                "type": "text"},
            "label": {
                "type": "long"},
            "filename": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"}
                }
            },
            "page": {
                "type": "long"},
            # Page text: standard analysis on the main field, plus an exact
            # `keyword` sub-field and an `ngrams` sub-field using ngram_analyzer.
            "text": {
                "type": "text",
                "analyzer": "standard",
                "fields": {
                    "keyword": {
                        "type": "keyword"},
                    "ngrams": {
                        "type": "text",
                        "analyzer": "ngram_analyzer"}
                }
            }
        }
    }
}
# ------------------------- Create an ES Index -------------------------
# Create the index only when it is missing, so re-running this cell on a
# cluster that already has it does not raise. `create_response` stays None
# (and nothing is displayed) when the index was already there.
create_response = None
if not es_index_client.exists(index="esg_report_4_by_page"):
    create_response = es_index_client.create(
        index="esg_report_4_by_page",
        settings=configurations["settings"],
        mappings=configurations["mappings"],
    )
create_response

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'esg_report_4_by_page'}

In [None]:
# ------------------------- Bulk Load PDF pages into ES as individual Docs (pos only for testing) -------------------------
# Every page of every PDF in `pos_PDF_path` becomes one Elasticsearch document
# whose id is "<pdf id>.<page index>".
index_name = "esg_report_4_by_page"
pos_PDF_path = "Crawler & Processing/2.Develop - Crawler Folder/pos_reports_test/"
pos_PDF_file_list = os.listdir(pos_PDF_path)

action_list = []    # alternating bulk lines: action metadata, then the document itself
skipped_files = []  # (file name, error) for each file that raised while being processed

for fileName in pos_PDF_file_list:
    file_path = os.path.join(pos_PDF_path, fileName)
    if not os.path.isfile(file_path):  ##### only regular files are processed
        continue

    try:
        pdf_ID = int(fileName.split(".")[0])  ##### numeric PDF id = part of the name before the first "."

        with open(file_path, "rb") as f:
            pdf = PdfFileReader(f)
            # getPage() is zero-based, so the range starts at 0. Starting at 1
            # silently dropped the first page of every PDF. `pn` stays the
            # zero-based index, so ids of already-indexed pages are unchanged.
            for pn in range(pdf.getNumPages()):
                text = pdf.getPage(pn).extractText()
                doc_id_str = str(pdf_ID) + "." + str(pn)

                action = {"index": {"_index": index_name, "_id": doc_id_str}}
                doc = {
                    "id": doc_id_str,
                    "label": 1,
                    "filename": fileName,
                    "page": pn,
                    "text": text,
                }
                # Serialize both lines before appending so the list never ends
                # up with an action line that has no document line after it.
                action_list.extend([json.dumps(action), json.dumps(doc)])
    except Exception as exc:
        # Still best-effort (pages extracted before the failure are kept), but
        # the failure is recorded instead of being swallowed silently.
        skipped_files.append((fileName, repr(exc)))
        continue

if skipped_files:
    print("Could not fully process {} file(s):".format(len(skipped_files)))
    for skipped_name, reason in skipped_files:
        print("  {}: {}".format(skipped_name, reason))

bulk_body = "\n".join(action_list)
# ------------------------- Feed "action_list" into a JSON File -------------------------
with open("esg_report_4_by_page.json", "w") as write_file:
    write_file.write(bulk_body)
# ------------------------- Feed the JSON File to ES - Bulk Upload!!! -------------------------
if action_list:
    bulk_response = es_client.bulk(body=bulk_body)
    print("Bulk indexed {} page(s); errors reported: {}".format(
        len(action_list) // 2, bulk_response["errors"]))
else:
    # An empty bulk request is rejected by Elasticsearch, so skip the call.
    print("No pages were extracted; nothing was sent to Elasticsearch.")

In [None]:
# Install tensorflow_text
# !pip install tensorflow_text #[DONE]

# Install tensorflow-hub
# !pip install "tensorflow>=2.0.0" #[DONE]
# !pip install --upgrade tensorflow-hub#[DONE]

In [4]:
#【1】Import the essential libraries：
# import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

In [5]:
#【2】Download the Universal Sentence Encoder Model (about 1 GB)：
graph = tf.Graph()

# Build the model ops inside `graph` and keep ONE long-lived session bound to it.
# The previous version opened a session in a `with` statement, immediately
# rebound `session` to a second one, and the first was closed unused when the
# `with` block ended.
with graph.as_default():
    print("Downloading pre-trained embeddings from tensorflow hub…")
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    text_ph = tf.placeholder(tf.string)   # input placeholder fed by embed_text()
    embeddings = embed(text_ph)           # output tensor evaluated by embed_text()
    print("Done.")
    print("Creating tensorflow session…")

    # Deliberately not closed here: later cells call session.run() through embed_text().
    session = tf.Session(graph=graph)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    print("Done.")

2022-01-19 19:26:55.459782: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading pre-trained embeddings from tensorflow hub…
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Done.
Creating tensorflow session…
Done.


In [6]:
#【3】Define a function to use the model to convert the text to vector：
def embed_text(text):
    """Evaluate the encoder on `text` and return the result as plain Python lists.

    `text` is fed to the notebook-level `text_ph` placeholder and the
    `embeddings` tensor is evaluated in the notebook-level `session`.
    Each row of the result is converted with `.tolist()`.
    Callers in this notebook pass a list of strings and index the result.
    """
    feed = {text_ph: text}
    encoded_rows = session.run(embeddings, feed_dict=feed)

    as_lists = []
    for row in encoded_rows:
        as_lists.append(row.tolist())
    return as_lists

In [7]:
#【4】Define a source text and call the above function to convert the text to vector：
# The text is one single string; it is split per sentence here only for readability
# (adjacent string literals are concatenated by Python).
source_text = (
    "We show our efforts to help the green economy, creating business and value by using recycled plastic waste to produce polymers. "
    "We perform detailed analysis to evaluate the significance of working activities that influence the environment. "
    "Our Environmental policy is also defined in the engineering phase which is an opportunity to propose technological modifications which can result in energy saving and cleaner emissions, leading to environmental and economic benefits for the customer, stakeholders and the whole community. "
    "We have reduced our green house gas emissions. "
    "We are committed to promote decarbonization and better use of energy, continuously implements energy efficiency initiatives. "
    "Water consumption has been reduced and water has been recycled with innovative technologies. "
    "We have undertaken careful and comprehensive collection, transportation and final treatment of waste. "
    "Our digitalization of documents assists a paper-less approach which helps to reduce paper waste. "
    "We are in full environmental compliance. "
    "Negative impact on the environment has been reduced. "
    "During each audit we inspect environmental permits, waste management, and effluent treatment plants. "
    "Our suppliers and we have obtained environmental certifications. "
    "Our electricity mainly come from solar panels and wind power. "
    "We have reduced carbon (CO2 ) emissions."
)

# embed_text() takes a batch and returns one vector per input; only one text is sent.
embedded_batch = embed_text([source_text])
text_vector = embedded_batch[0]

# Report the input, the vector length and the first five components.
for template, value in (
    ("Text to be embedded: {}", source_text),
    ("Embedding size: {}", len(text_vector)),
    ("Obtained Embedding[{},…]\n", text_vector[:5]),
):
    print(template.format(value))

# The steps above show how the Universal Sentence Encoder model is used to obtain text embeddings.

Text to be embedded: We show our efforts to help the green economy, creating business and value by using recycled plastic waste to produce polymers. We perform detailed analysis to evaluate the significance of working activities that influence the environment. Our Environmental policy is also defined in the engineering phase which is an opportunity to propose technological modifications which can result in energy saving and cleaner emissions, leading to environmental and economic benefits for the customer, stakeholders and the whole community. We have reduced our green house gas emissions. We are committed to promote decarbonization and better use of energy, continuously implements energy efficiency initiatives. Water consumption has been reduced and water has been recycled with innovative technologies. We have undertaken careful and comprehensive collection, transportation and final treatment of waste. Our digitalization of documents assists a paper-less approach which helps to reduce

In [8]:
########################### Create the ES index that will hold the text vectors ###########################
# The Elasticsearch client and index client created earlier in the notebook
# (`es_client`, `es_index_client`) are reused here rather than rebuilt.

# ------------------- Define the Settings & Mappings --------------------
embedding_settings = {
    "number_of_shards": 2,
    "number_of_replicas": 1,
}
embedding_properties = {
    # Id of the source page document.
    "emb_id": {"type": "text"},
    # The sentence-encoder vector, declared with 512 dimensions.
    "emb_text_vector": {"type": "dense_vector", "dims": 512},
}
configurations = {
    "settings": embedding_settings,
    "mappings": {
        "dynamic": "true",
        "_source": {"enabled": "true"},
        "properties": embedding_properties,
    },
}

# ------------------------- Create an ES Index -------------------------
# `configurations` has exactly the keys "settings" and "mappings", so unpacking it
# passes the same keyword arguments as naming them one by one.
es_index_client.create(index="use_embedding", **configurations)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'use_embedding'}

In [None]:
################### Retrieving selected docs from index="esg_report_4_by_page" & put text into index="use_embedding"
# Page ids are "<pdf id>.<page index>" strings, so they are listed as strings.
# A float literal cannot express an id such as "9.10" (it would collapse to 9.1).
selected_page_ids = [
    "8.87", "8.92", "8.94", "8.95", "8.96", "8.97",
    "9.2", "9.3", "9.23",
    "11.1", "11.12", "11.22",
]
search_query = {
    "size": 1000,
    "query": {
        "terms": {
            "id": selected_page_ids
        }
    }
}
# `doc_type` is no longer passed: mapping types are deprecated in Elasticsearch 7.
result = es_client.search(index="esg_report_4_by_page", body=search_query)
# (The hit count and one sample text were previously cross-checked against Kibana.)
res_data = result["hits"]["hits"]

action_list = []  # alternating bulk lines: action metadata, then the document itself
# The loop variables no longer use the names `dict` and `id`, which shadowed
# the builtins for every cell executed afterwards.
# `res_text` / `res_text_vector` keep their names because a later cell prints `res_text`.
for hit in res_data:
    doc_id = hit["_id"]
    res_text = hit["_source"]["text"]
    res_text_vector = embed_text([res_text])[0]

    action = {"index": {"_index": "use_embedding", "_id": doc_id}}
    doc = {
        "emb_id": doc_id,
        "emb_text_vector": res_text_vector
    }
    action_list.extend([json.dumps(action), json.dumps(doc)])

bulk_body = "\n".join(action_list)
# ------------------------- Feed "action_list" into a JSON File -------------------------
with open("embedding_vector_test.json", "w") as write_file:
    write_file.write(bulk_body)
# ------------------------- Feed the JSON File to ES - Bulk Upload!!! -------------------------
if action_list:
    bulk_response = es_client.bulk(body=bulk_body)
    print("Bulk indexed {} embedding(s); errors reported: {}".format(
        len(action_list) // 2, bulk_response["errors"]))
else:
    # An empty bulk request is rejected by Elasticsearch, so skip the call.
    print("The search returned no pages; nothing was sent to Elasticsearch.")


In [None]:
# ################# Check the vectors of a specific doc to see if bulk process is correct ------->【DONE】
# search_query = {
#     "size": 1000,
#     "query": {
#     "terms": {
#       "id": [8.87]
#     }
#   }
# }
# result = es_client.search(index="esg_report_4_by_page", doc_type="_doc", body=search_query)
# res_text = result["hits"]["hits"][0]["_source"]["text"]
# res_text_vector = embed_text([res_text])[0]
# print(res_text_vector)

In [49]:

# `res_text` is not assigned in this cell; it has to exist in the kernel already.
# NOTE(review): presumably it is the last page text left over from the retrieval
# loop in an earlier cell — confirm the intended run order.
text_banner = "---------- Text from the research result is: ----------\n"
vector_banner = "---------- UCE-vectors of the text is: ----------\n"

print(text_banner, res_text)
res_text_vector = embed_text([res_text])[0]   # one text in, so take the single vector out
print(vector_banner, res_text_vector)

---------- Text from the research result is: ----------
 9595
95 WATER MANAGEMENT 
 The sources of water for sanitation 
and civil purposes (canteen, toilets, 
ˆushing wc–) used by the compa
-nies located in Maire Tecnimont 
Headquarters in Milan 
are provided via public 
supply network and via 
the rainwater collection 
systems on the roof.
The facilities of the 
complex use, without 
chemical changes, 
groundwater to feed 
Heating Ventilation Air 
Conditioning (HVAC) systems, both 
for heating and air conditioning. 
Waste water discharge in the ur
-ban sewerage network for which 
no discharge authorization is 
needed in accordance with the 
current local law. 
Over the last three years, total wa
-ter withdrawn from municipal wa
-ter supplies or other public or pri
-vate water utilities were: 53,234 m
3 in 2017, 41,145 m
3 in 2018 and 
47,544 m
3 in 2019. The same quan
-tity of water was discharged to 
sewers over the last three years.
In 2017, 1,404,060 m
3 of water 
were withdrawn (



In [10]:
# Reference text (a positive ESG report) that stored pages are compared against.
# It is one single string, split per sentence only for readability
# (adjacent string literals are concatenated by Python).
criteria_text = (
    "We show our efforts to help the green economy, creating business and value by using recycled plastic waste to produce polymers. "
    "We perform detailed analysis to evaluate the significance of working activities that influence the environment. "
    "Our Environmental policy is also defined in the engineering phase which is an opportunity to propose technological modifications which can result in energy saving and cleaner emissions, leading to environmental and economic benefits for the customer, stakeholders and the whole community. "
    "We have reduced our green house gas emissions. "
    "We are committed to promote decarbonization and better use of energy, continuously implements energy efficiency initiatives. "
    "Water consumption has been reduced and water has been recycled with innovative technologies. "
    "We have undertaken careful and comprehensive collection, transportation and final treatment of waste. "
    "Our digitalization of documents assists a paper-less approach which helps to reduce paper waste. "
    "We are in full environmental compliance. "
    "Negative impact on the environment has been reduced. "
    "During each audit we inspect environmental permits, waste management, and effluent treatment plants. "
    "Our suppliers and we have obtained environmental certifications. "
    "Our electricity mainly come from solar panels and wind power. "
    "We have reduced carbon (CO2 ) emissions."
)

# Vectorize it: embed_text() works on a batch, so send one text and take the one vector back.
criteria_batch = embed_text([criteria_text])
criteria_text_vec = criteria_batch[0]

In [None]:
# Score every document in "use_embedding" by the cosine similarity between its
# stored vector and the criteria vector, and fetch the 10 best matches.
#
# The vector field is passed to cosineSimilarity() by NAME. The older
# doc['emb_text_vector'] form is deprecated for vector functions since
# Elasticsearch 7.6 and is no longer accepted in 8.x.
#
# NOTE(review): script_score rejects negative scores, which is why the Elasticsearch
# documentation adds 1.0 to the cosine. That offset is NOT added here, so the scores
# stay directly comparable with the ones already recorded in this notebook —
# confirm whether negative similarities can occur for this data.
similarity_script = {
    "source": "cosineSimilarity(params.query_vector, 'emb_text_vector')",
    "params": {
        "query_vector": criteria_text_vec
    }
}
script_query = {
    "script_score": {
        "query": {
            "match_all": {}
        },
        "script": similarity_script
    }
}

cos_similarity = es_client.search(
    index="use_embedding",
    body={
        "size": 10,
        "query": script_query,
        # Only the page id is needed back; the 512-float vector is left out of the response.
        "_source": {"includes": ["emb_id"]},
    },
)


In [15]:
# Show the raw search response as JSON, indented by one space per nesting level.
cos_similarity_json = json.dumps(cos_similarity, indent=1)
print(cos_similarity_json)

{
 "took": 3,
 "timed_out": false,
 "_shards": {
  "total": 2,
  "successful": 2,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 12,
   "relation": "eq"
  },
  "max_score": 0.7986017,
  "hits": [
   {
    "_index": "use_embedding",
    "_type": "_doc",
    "_id": "11.12",
    "_score": 0.7986017,
    "_source": {
     "emb_id": "11.12"
    }
   },
   {
    "_index": "use_embedding",
    "_type": "_doc",
    "_id": "8.95",
    "_score": 0.7877541,
    "_source": {
     "emb_id": "8.95"
    }
   },
   {
    "_index": "use_embedding",
    "_type": "_doc",
    "_id": "9.2",
    "_score": 0.77635473,
    "_source": {
     "emb_id": "9.2"
    }
   },
   {
    "_index": "use_embedding",
    "_type": "_doc",
    "_id": "8.94",
    "_score": 0.7557802,
    "_source": {
     "emb_id": "8.94"
    }
   },
   {
    "_index": "use_embedding",
    "_type": "_doc",
    "_id": "8.96",
    "_score": 0.6847196,
    "_source": {
     "emb_id": "8.96"
    }
   },
   {
    "_index": "

In [25]:
# Tabulate the ranked hits: one row per hit with the page id and its similarity score.
# All rows are collected first and the frame is built once. DataFrame.append in a
# loop re-copies the frame on every iteration and is not available in pandas 2.x.
# A comprehension is used so no loop variable leaks into the kernel (the previous
# loop variable shadowed the builtin `dict`).
score_rows = [
    {"_id": hit["_source"]["emb_id"], "use_score": hit["_score"]}
    for hit in cos_similarity["hits"]["hits"]
]
# Passing `columns` keeps the same two columns even when there are no hits.
df = pd.DataFrame(score_rows, columns=["_id", "use_score"])

df
    

Unnamed: 0,_id,use_score
0,11.12,0.798602
1,8.95,0.787754
2,9.2,0.776355
3,8.94,0.75578
4,8.96,0.68472
5,11.1,0.666894
6,9.23,0.640782
7,8.97,0.544457
8,8.87,0.542631
9,8.92,0.52362
