In [1]:
def index_map(type_name):
    """Build the Elasticsearch index-creation body (settings + mappings).

    The settings define two custom analyzers:

    * ``autocomplete`` - tokenizes with an ``edge_ngram`` tokenizer
      (4 to 10 characters, letters only) and lowercases the tokens; used at
      index time by the n-gram fields.
    * ``autocomplete_search`` - uses the built-in ``lowercase`` tokenizer; used
      at search time by the n-gram fields.

    Args:
        type_name: Key under ``"mappings"`` beneath which the field
            ``"properties"`` are nested.

    Returns:
        dict: A new dictionary with ``"settings"`` and ``"mappings"`` keys.
        Every field gets its own mapping dict, so mutating one field's mapping
        in the result never affects another field.
    """

    def _keyword_subfield():
        # Exact-match sub-field attached to every text field.
        return {"keyword": {"type": "keyword", "ignore_above": 256}}

    def _text_field():
        # Plain analyzed text with a keyword sub-field.
        return {"type": "text", "fields": _keyword_subfield()}

    def _ngram_field():
        # Text indexed with edge n-grams; queries use the lowercase tokenizer.
        return {
            "type": "text",
            "analyzer": "autocomplete",
            "search_analyzer": "autocomplete_search",
            "fields": _keyword_subfield(),
        }

    def _long_field():
        return {"type": "long"}

    settings = {
        "analysis": {
            "analyzer": {
                "autocomplete": {
                    "tokenizer": "autocomplete",
                    "filter": ["lowercase"],
                },
                "autocomplete_search": {
                    "tokenizer": "lowercase",
                },
            },
            "tokenizer": {
                "autocomplete": {
                    "type": "edge_ngram",
                    "min_gram": 4,
                    "max_gram": 10,
                    "token_chars": ["letter"],
                },
            },
        },
    }

    properties = {
        "author_name": _text_field(),
        "board": _text_field(),
        "content": _ngram_field(),
        "floor": _long_field(),
        "location": _text_field(),
        "member_id": _long_field(),
        "member_type": _text_field(),
        "page": _long_field(),
        "post_date": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm",
        },
        "post_url": _text_field(),
        "tags": _ngram_field(),
        "title": _ngram_field(),
        "title_id": _long_field(),
        "title_url": _text_field(),
    }

    return {
        "settings": settings,
        "mappings": {
            type_name: {
                "properties": properties,
            },
        },
    }

In [2]:
import traceback
from pymongo import MongoClient
from elasticsearch import Elasticsearch

In [3]:
# Connect to the MongoDB server at 127.0.0.1:27017 and select the `forum` database.
mongo_client = MongoClient('mongodb://127.0.0.1:27017')
db = mongo_client['forum']

In [4]:
# Source collection whose documents are copied into Elasticsearch below.
collection = db['es_threads']

In [6]:
# Elasticsearch client built with the library's default constructor arguments.
# NOTE(review): presumably this targets a local node - confirm the default host
# for the installed elasticsearch-py version.
es = Elasticsearch()

In [11]:
# Create the `thread` index with the custom settings/mappings only if it is missing.
# A truthiness test is used instead of `is not True`: an identity check against the
# literal bool is always true when the client returns a truthy response object
# rather than `True`, which would attempt to re-create an existing index.
if not es.indices.exists(index='thread'):
    es.indices.create(index='thread', body=index_map('threads'))

In [12]:
# Query every document in the collection (empty filter), excluding the `_id` field
# from the returned documents.
cursor = collection.find({}, projection={'_id':False})

In [13]:
# Drain the cursor into a list (all documents are held in memory), then show the count.
docs = list(cursor)
len(docs)

451003

In [14]:
# Index every document into the `thread` index, one request per document.
# Failures are logged and skipped so a single bad document does not stop the run.
# NOTE(review): for this many documents a bulk API would likely be much faster - consider it.
processed = 0
for _doc in docs:
    try:
        es.index(index='thread', doc_type='threads', body=_doc)
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still abort the long-running loop.
        traceback.print_exc()
    else:
        # Count only documents whose index request did not raise.
        processed += 1
print('Processed: ' + str(processed))

Processed: 451003


In [None]:
# Debug aid: print the value last bound to `_doc` by the indexing loop.
# This depends on that loop cell having run first in the same kernel session.
print(_doc)