In [1]:
import traceback
from pymongo import MongoClient
from elasticsearch import Elasticsearch

In [2]:
# Connect to the MongoDB server on localhost and take a handle to the
# 'Dogforum' database.
mongo_client = MongoClient('mongodb://127.0.0.1:27017')
db = mongo_client['Dogforum']

In [3]:
# Elasticsearch client built with no arguments, i.e. relying on the library's
# default connection settings (presumably localhost:9200 — confirm for the
# installed elasticsearch-py version).
es = Elasticsearch()

In [4]:
def _keyword_subfield():
    """Return a fresh ``fields`` mapping holding a ``keyword`` sub-field.

    The sub-field keeps an exact-match copy of the value; values longer than
    256 characters are skipped for that copy (``ignore_above``).
    """
    return {"keyword": {"type": "keyword", "ignore_above": 256}}


def _text_with_keyword():
    """Return a fresh ``text`` field mapping with a ``keyword`` sub-field."""
    return {"type": "text", "fields": _keyword_subfield()}


# Explicit mapping for the 'Breeds' document type. Every string field is a
# full-text `text` field with a `keyword` sub-field; `floor` is numeric and
# `post_date.date` is a real date so it can be range-queried.
_index_mappings = {
    "mappings": {
        "Breeds": {
            "properties": {
                "author_info": {
                    "properties": {
                        name: _text_with_keyword()
                        for name in (
                            "join_date",
                            "location",
                            "member_type",
                            "mentioned",
                            "posts",
                            "tagged",
                        )
                    }
                },
                "author_name": _text_with_keyword(),
                "author_url": _text_with_keyword(),
                "content": _text_with_keyword(),
                "floor": {"type": "long"},
                # The documents carry their identifier in `id` (the indexing
                # loop reads `_doc.get('id')` and MongoDB's `_id` is projected
                # out). `_id` is an Elasticsearch metadata field and must not
                # be declared under `properties`, so the field is mapped as `id`.
                "id": _text_with_keyword(),
                "page": _text_with_keyword(),
                "post_date": {
                    "properties": {
                        "ampm": _text_with_keyword(),
                        "date": {
                            "type": "date",
                            # Accept either MM-dd-yyyy or strict yyyy-MM-dd.
                            "format": "MM-dd-yyyy||strict_year_month_day",
                            "fields": _keyword_subfield(),
                        },
                        "time": _text_with_keyword(),
                    }
                },
                "title": _text_with_keyword(),
                "url": _text_with_keyword(),
            }
        }
    }
}

In [6]:
# Create the 'dogforum' index with the explicit mappings only when it is
# missing, so re-running this cell does not fail on an existing index.
# A plain truthiness test replaces `is not True`, so any truthy "exists"
# response (not only the literal True) skips creation.
if not es.indices.exists(index='dogforum'):
    es.indices.create(index='dogforum', body=_index_mappings)

In [20]:
# Query every document in the Breeds collection, leaving out MongoDB's `_id`.
breeds_collection = db['Breeds']
breed_cursor = breeds_collection.find({}, projection={'_id': False})

In [21]:
# Drain the cursor into a list so the documents can be indexed and sliced.
# The cursor is consumed by this step; re-run the query cell before re-running
# this one, otherwise the list is presumably empty.
breed_docs = list(breed_cursor)

In [22]:
# Preview the id string used for Elasticsearch for the second document:
# its `id` value with every '/' removed.
second_doc_id = breed_docs[1].get('id')
second_doc_id.replace('/', '')

'Page1Floor1dog-breedscan-you-help-identify-my-dogs-350978'

In [23]:
# Print the slash-stripped `id` of each of the first ten documents.
for _doc in breed_docs[:10]:
    raw_id = _doc.get('id')
    print(raw_id.replace('/', ''))

Page1Floor1dog-breedstruth-about-designer-dogs-11809
Page1Floor1dog-breedscan-you-help-identify-my-dogs-350978
Page1Floor2dog-breedscan-you-help-identify-my-dogs-350978
Page1Floor3dog-breedscan-you-help-identify-my-dogs-350978
Page1Floor1dog-breedsinfo-help-breed-guesses-basic-dog-1424
Page1Floor2dog-breedsinfo-help-breed-guesses-basic-dog-1424
Page1Floor3dog-breedsinfo-help-breed-guesses-basic-dog-1424
Page1Floor4dog-breedsinfo-help-breed-guesses-basic-dog-1424
Page1Floor5dog-breedsinfo-help-breed-guesses-basic-dog-1424
Page1Floor6dog-breedsinfo-help-breed-guesses-basic-dog-1424


In [25]:
# Running count of documents indexed successfully; the indexing loop
# increments it and does not reset it, so re-run this cell to start from zero.
processed = 0

In [27]:
# Index the first ten forum posts into the 'dogforum' index, one request per
# document. `processed` is a running total that is not reset here, so
# re-running this cell keeps counting from its previous value.
for _doc in breed_docs[:10]:
    try:
        # The Elasticsearch document id is the post's `id` with slashes removed.
        es.index(index='dogforum', doc_type='Breeds', id=_doc.get('id').replace('/',''), body=_doc)
        processed += 1
        print('Processed: ' + str(processed), flush=True)
    except Exception:
        # Best-effort: report the failing document and carry on with the next.
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt (kernel interrupt) and SystemExit stop the loop.
        traceback.print_exc()

Processed: 101
Processed: 102
Processed: 103
Processed: 104
Processed: 105
Processed: 106
Processed: 107
Processed: 108
Processed: 109
Processed: 110
