In [1]:
# Import relevant libraries
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [28]:
# Load the monarchs corpus (a UTF-8 encoded CSV) into a DataFrame
monarchs_csv_path = '../corpus/srilankanMonarchs.csv'
monarchs_df = pd.read_csv(monarchs_csv_path, encoding="utf-8")

## Data preprocessing

In [29]:
# Count the missing values in each column
monarchs_df.isna().sum()

name             0
detail         103
spouse         166
kingdom          0
dynasty          0
reign_start      0
reign_end        0
predecessor    105
successor      107
dtype: int64

In [30]:
def preprocess_data(df):
    """Clean the monarchs frame and turn the reign timeline into signed integers.

    Steps, applied to a copy (the caller's frame is not modified):
      1. Null values are replaced with empty strings.
      2. Zero-width joiners (U+200D) are removed from every string value.
      3. `reign_start` / `reign_end` are converted to integers: years marked
         as B.C. become negative, years marked as A.D. become positive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the `reign_start` and `reign_end` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Empty timeline values stay as empty strings.

    Raises
    ------
    ValueError
        If a timeline value is not a plain integer once the era marker has
        been removed.
    """

    # Replace null with empty strings
    df = df.fillna("")

    # Remove zero-width joiners so the era markers below match the cell text
    df = df.replace("\u200d",'', regex=True)

    def preprocess_date(x):
        # Values that are already numeric (for example when this function is
        # re-run on processed data) need no parsing.
        if not isinstance(x, str):
            return x

        # If x(timeline) is B.C then consider it to be a negative value else a positive value.
        x = x.replace("ක්රි.පූ.", "-").replace("ක්රි.ව.", "")

        # Drop all whitespace so that e.g. "- 543" parses as -543
        x = "".join(x.split())
        if x == "":
            return x

        # int() instead of eval(): the text comes from a data file and must
        # never be executed as code; int() also accepts zero-padded years.
        return int(x)

    # Convert timeline to positive and negative integers
    df['reign_start'] = df['reign_start'].apply(preprocess_date)
    df['reign_end'] = df['reign_end'].apply(preprocess_date)

    return df


In [31]:
# Preprocess the monarchs data: fill nulls, strip zero-width joiners and
# convert reign_start / reign_end to numbers (see preprocess_data above).
# NOTE: this rebinds monarchs_df, so the raw frame is only available again
# after re-running the read_csv cell.
monarchs_df = preprocess_data(monarchs_df)

## Connect to the local Elasticsearch server and upload the data

In [32]:
# Create the Elasticsearch client for the local server (600 second timeout)
ENDPOINT = 'http://localhost:9200/'
es = Elasticsearch(hosts=ENDPOINT, timeout=600)

In [33]:
# Check the connection: ping() returns True when the server at ENDPOINT responds
es.ping()

True

In [34]:
# Convert the frame into a list of per-row dicts (one dict per monarch)
monarchs_dict = monarchs_df.to_dict(orient='records')

In [35]:

# Convert the records into Elasticsearch bulk actions
def generator(df, index_name='monarchs'):
    """Yield one bulk action per monarch record.

    Parameters
    ----------
    df : iterable of dict
        The records to index (despite the name, this is iterated as a
        sequence of dicts, e.g. the output of `DataFrame.to_dict('records')`).
    index_name : str, optional
        Target index for every action. Defaults to 'monarchs'.

    Yields
    ------
    dict
        An action with `_index`, `_type`, `_id` (the record's position in
        `df`) and `_source`. `_source` always carries the nine known fields;
        a field missing from a record is sent as an empty string, and keys
        other than the known fields are dropped.
    """
    fields = (
        'name', 'detail', 'spouse', 'kingdom', 'dynasty',
        'reign_start', 'reign_end', 'predecessor', 'successor',
    )
    for c, line in enumerate(df):
        yield {
            '_index': index_name,
            '_type': '_doc',
            '_id': c,
            '_source': {field: line.get(field, '') for field in fields},
        }

In [38]:
# Define the analysis settings and the field mappings for the elastic search index

def _sin_custom_analyzer(token_filters):
    """Custom analyzer definition: ICU tokenizer, punctuation removal, then the given token filters."""
    return {
        "type": "custom",
        "tokenizer": "icu_tokenizer",
        "char_filter": ["punctuation_filter"],
        "filter": token_filters,
    }


def _sin_text_field(with_keyword=False):
    """Text field mapping using the analyzers above; optionally adds a `keyword` sub-field."""
    field = {
        'type': 'text',
        "analyzer": "sin_analyzer",
        "search_analyzer": "sin_search_analyzer",
    }
    if with_keyword:
        field["fields"] = {
            "keyword": {"type": "keyword", "ignore_above": 256}
        }
    return field


Settings = {
    "settings": {
        "index": {"number_of_shards": 1, "number_of_replicas": 1},
        "analysis": {
            "analyzer": {
                # Index-time analyzer: same chain as the search analyzer plus edge n-grams
                "sin_analyzer": _sin_custom_analyzer(
                    ["sin_stopwords", "sin_synonyms", "sin_stemmer", "edge_ngram_filter"]
                ),
                # Search-time analyzer: no n-grams
                "sin_search_analyzer": _sin_custom_analyzer(
                    ["sin_stopwords", "sin_synonyms", "sin_stemmer"]
                ),
            },
            "char_filter": {
                # Each listed punctuation character is mapped to nothing
                "punctuation_filter": {
                    "type": "mapping",
                    "mappings": [".=>", ":=>", "|=>", "-=>", "_=>", "'=>", "/=>", ",=>"],
                },
            },
            "filter": {
                "edge_ngram_filter": {
                    "type": "edge_ngram",
                    "min_gram": "3",
                    "max_gram": "20",
                    "side": "front",
                },
                "sin_stemmer": {"type": "hunspell", "locale": "si_LK"},
                "sin_stopwords": {
                    "type": "stop",
                    "stopwords_path": "analyzers/stopwords.txt",
                },
                "sin_synonyms": {
                    "type": "synonym",
                    "synonyms_path": "analyzers/synonym.txt",
                },
            },
        },
    },
    "mappings": {
        'properties': {
            'name': _sin_text_field(),
            'detail': _sin_text_field(),
            'spouse': _sin_text_field(),
            'kingdom': _sin_text_field(with_keyword=True),
            'dynasty': _sin_text_field(with_keyword=True),
            'reign_start': {'type': 'short'},
            'reign_end': {'type': 'short'},
            'predecessor': _sin_text_field(with_keyword=True),
            'successor': _sin_text_field(with_keyword=True),
        },
    },
}


In [39]:
# Create the index, unless it already exists, so this cell can be re-run.
# NOTE: an existing index keeps its old settings and mappings; delete it
# first if Settings has changed.
indexName = 'monarchs'
if es.indices.exists(index=indexName):
    print(f"Index '{indexName}' already exists; skipping creation")
else:
    print(es.indices.create(index=indexName, body=Settings))

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'monarchs'}

In [40]:
# Upload the data in bulk and report how many documents were indexed
try:
    indexed_count, _ = helpers.bulk(es, generator(monarchs_dict))
except helpers.BulkIndexError as bulk_error:
    # Some documents were rejected: show how many, plus the first few reasons
    print(f"{len(bulk_error.errors)} document(s) failed to index")
    for failure in bulk_error.errors[:5]:
        print(failure)
except Exception as e:
    # Best effort, as before: report the problem instead of stopping the notebook
    print(e)
else:
    print('working')
    print(f"Indexed {indexed_count} of {len(monarchs_dict)} documents")

working
