In [25]:
from elasticsearch import Elasticsearch, helpers
from pprint import pp
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
import os
# Load the start-local .env file, then read the Elasticsearch connection
# settings from the process environment. Either value is None when the
# corresponding variable is not set.
load_dotenv("elastic-start-local/.env")
ES_LOCAL_URL = os.getenv("ES_LOCAL_URL")
ES_LOCAL_API_KEY = os.getenv("ES_LOCAL_API_KEY")

def ppr(resp):
    """Pretty-print the ``raw`` attribute of a response object.

    Args:
        resp: Any object exposing a ``raw`` attribute (here, the value
            returned by an Elasticsearch client call).

    Returns:
        None. The formatted body is written to standard output.
    """
    body = resp.raw
    pp(body)

In [2]:
# Client for the local Elasticsearch node, authenticated with the API key
# read from the environment.
es = Elasticsearch(
    hosts=[ES_LOCAL_URL],
    api_key=ES_LOCAL_API_KEY,
)

In [33]:
# Recreate the index from scratch so this cell can be re-run at any time.
# ignore_unavailable=True turns the delete into a no-op when the index does
# not exist yet (first run / fresh cluster) instead of raising NotFoundError.
es.indices.delete(index='html_posts', ignore_unavailable=True)

# Body is analysed with a custom analyzer that strips HTML markup
# (html_strip char filter) before the standard tokenizer runs.
# CommentCount and CreationDate get explicit types; the date format is
# pinned to "yyyy-MM-dd HH:mm:ss.SSS".
es.indices.create(
    index='html_posts',
    settings={
        "analysis": {
            "analyzer": {
                "html_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": [
                        "html_strip"
                    ]
                }
            }
        }
    },
    mappings={
        "properties": {
            "Body": {
                "type": "text",
                "analyzer": "html_analyzer"
            },
            "CommentCount": {
                "type": "integer"
            },
            "CreationDate": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss.SSS"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'html_posts'})

In [34]:
def generator(json_chunk):
    """Turn row dicts into bulk-index actions for the ``html_posts`` index.

    Args:
        json_chunk: Iterable of dict-like rows. ``Id`` is used as the
            document ``_id`` (``None`` when the key is absent); ``Body``,
            ``CommentCount`` and ``CreationDate`` are copied into
            ``_source``, each falling back to ``""`` when the key is absent.

    Yields:
        One action dict per row, with ``_index``, ``_id`` and ``_source``.
    """
    source_fields = ("Body", "CommentCount", "CreationDate")
    for row in json_chunk:
        document = {field: row.get(field, "") for field in source_fields}
        yield {"_index": "html_posts", "_id": row.get('Id'), "_source": document}


def from_chunk_to_es(chunk):
    """Build the bulk-action generator for one DataFrame chunk.

    Args:
        chunk: Object with a pandas-style ``to_dict("records")`` method
            (here, one chunk produced by ``pd.read_csv(..., chunksize=...)``).

    Returns:
        The generator produced by ``generator`` over the chunk's row dicts.
    """
    return generator(chunk.to_dict(orient="records"))

In [35]:

# Stream the CSV in 1000-row chunks and bulk-index each chunk.
# usecols restricts parsing to the four columns that are actually indexed,
# so the remaining CSV columns are never materialised.
columns = ['Id', 'Body', 'CommentCount', "CreationDate"]
for chunk in tqdm(pd.read_csv('StackOverflowMini_dbo_Posts.csv', usecols=columns, chunksize=1000)):
    gen = from_chunk_to_es(chunk)
    # `res` keeps the helpers.bulk result of the most recent chunk.
    res = helpers.bulk(es, gen)

0it [00:00, ?it/s]

1566it [03:39,  7.14it/s]


In [48]:
# effects of analyzer
# Run the same match query on Body for a plain phrase and for a bare HTML
# tag, printing the top hit of each, to compare how html_analyzer treats them.
# NOTE(review): "<p>" is presumably reduced to no tokens by html_strip, so the
# second search is expected to return no hits — confirm against the output.
for text in ("I want to", "<p>"):
    resp = es.search(
        index="html_posts",
        size=1,
        query={"match": {"Body": {"query": text}}},
    )
    ppr(resp)

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
          'max_score': 5.2730217,
          'hits': [{'_index': 'html_posts',
                    '_id': '1859507',
                    '_score': 5.2730217,
                    '_source': {'Body': '<p>I want to display a number in a '
                                        'report, however I only want to show '
                                        'any decimal points if they are '
                                        'present and the I only want to show 1 '
                                        'decimal space.</p>\n'
                                        '\n'
                                        '<p>e.g. if the number is 12 then I '
                                        'want to show 12</p>\n'
                                        '\n'
                                        '<p>If the number is 12.1 then I want