## Building a Custom Search Engine
### Step 2 - Create Azure Search Index
- Define new index structure
- Create Azure Search index
- Upload and index parsed content from step 1
- Optional: Simple management of Azure Search index

In [None]:
# Import base packages
import requests
import json
import csv
import datetime
import pytz
import calendar
import os
import pyexcel as pe

First, initialize the Azure Search configuration parameters that will be used for index creation.

In [2]:
# Name of the Azure Search service you've already created in Azure Portal
serviceName = 'your_azure_search_service_name'

# Name of the index to be created
indexName = 'name_of_index_to_create'

# Service API key. The SEARCH_KEY_DEV environment variable takes precedence so
# the key does not have to be stored in the notebook; the literal below is used
# only when that variable is unset or empty.
apiKey = os.getenv('SEARCH_KEY_DEV') or 'your_azure_search_service_api_key'

# REST API version appended to every request URL
apiVersion = '2016-09-01'

Set the path to the parsed content file from step 1, and define a basic mapping of the input fields to the desired target field names in the new index. Input and output field names do not need to be the same. However, the target names should match the index definition in getIndexDefinition().

In [None]:
# Parsed content Excel file produced by step #1, see
# https://github.com/CatalystCode/CustomSearch/tree/master/JupyterNotebooks/1-content_extraction.ipynb
# The path is resolved relative to the current working directory.
inputfile = os.path.join(os.getcwd(), '../sample/parsed_content.xlsx')

# Ordered (Excel column name, search index field name) pairs.
# The 'Index' column is handled separately and must not be listed here.
# Adjust the pairs to your content; every target name has to match a field
# declared in getIndexDefinition().
fields_map = [
    ('File', 'File'),
    ('ChapterTitle', 'ChapterTitle'),
    ('SectionTitle', 'SectionTitle'),
    ('SubsectionTitle', 'SubsectionTitle'),
    ('SubsectionText', 'SubsectionText'),
    ('Keywords', 'Keywords'),
]

Now, let's define the structure of the new index to be created. In this example, all title, content text, and keyword fields are full-text searchable. Queries will use all searchable fields by default to retrieve a ranked list of results.

For more details, refer to [Create an Azure Search Index](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index).

In [None]:
# Index fields: Index, File, ChapterTitle, SectionTitle, SubsectionTitle,
# SubsectionText, Keywords
def getIndexDefinition():
    """Return the index definition (name and field list) as a dict.

    The index is named after the module-level indexName. Every field is a
    retrievable Edm.String; 'Index' is the document key, and the two free-text
    fields use the 'en.microsoft' analyzer.
    """
    def stringField(name, searchable, filterable, sortable, facetable,
                    key=False, analyzer=None):
        # Keys are inserted in a fixed order: name, type, [key], the five
        # attribute flags, [analyzer].
        field = {"name": name, "type": "Edm.String"}
        if key:
            field["key"] = True
        field["retrievable"] = True
        field["searchable"] = searchable
        field["filterable"] = filterable
        field["sortable"] = sortable
        field["facetable"] = facetable
        if analyzer is not None:
            field["analyzer"] = analyzer
        return field

    fields = [
        #           name               search  filter  sort   facet
        stringField("Index",           False,  False,  True,  False, key=True),
        stringField("File",            False,  True,   True,  False),
        stringField("ChapterTitle",    True,   True,   True,  True),
        stringField("SectionTitle",    True,   True,   False, True),
        stringField("SubsectionTitle", True,   True,   True,  False),
        stringField("SubsectionText",  True,   False,  False, False,
                    analyzer="en.microsoft"),
        stringField("Keywords",        True,   False,  False, False,
                    analyzer="en.microsoft"),
    ]
    return {"name": indexName, "fields": fields}

#### Helper functions for basic REST API operations

In [None]:
def getServiceUrl():
    """Return the base HTTPS URL of the search service named by serviceName."""
    return ''.join(('https://', serviceName, '.search.windows.net'))

def getMethod(servicePath):
    """Send an authenticated GET request to the search service.

    servicePath is appended to the service base URL. Returns the
    requests response object unchanged.
    """
    url = getServiceUrl() + servicePath
    return requests.get(
        url,
        headers={'Content-type': 'application/json', 'api-key': apiKey},
    )

def postMethod(servicePath, body):
    """Send an authenticated POST request to the search service.

    servicePath is appended to the service base URL and body is sent as
    the request payload. Returns the requests response object unchanged.
    """
    url = getServiceUrl() + servicePath
    return requests.post(
        url,
        headers={'Content-type': 'application/json', 'api-key': apiKey},
        data=body,
    )

#### Simple index management functions
- Create a new index
- Delete an existing index
- Check if index exists

In [None]:
def createIndex():
    """Create the index described by getIndexDefinition() in the search service.

    Prints a confirmation when the service answers 201. On any other status,
    prints the HTTP status code and the response body, then stops execution.

    Raises:
        SystemExit: with code 1 when the index could not be created.
    """
    indexDefinition = json.dumps(getIndexDefinition())
    servicePath = '/indexes/?api-version=%s' % apiVersion
    r = postMethod(servicePath, indexDefinition)
    if r.status_code == 201:
        print('Index %s created' % indexName)
    else:
        # Include the status and body so the cause of the failure is visible
        print('Failed to create index %s (HTTP %d): %s'
              % (indexName, r.status_code, r.text))
        # exit() is an interactive helper that is not guaranteed to be defined
        # or to stop execution in every environment; raise SystemExit directly.
        raise SystemExit(1)

def deleteIndex():
    """Delete the index named indexName from the search service.

    Prints a message when the service reports an error instead of silently
    ignoring it.

    Returns:
        The requests response object of the DELETE call.
    """
    # The documented request is DELETE /indexes/<name>?api-version=<version>;
    # no further query parameter is needed.
    servicePath = '/indexes/%s?api-version=%s' % (indexName, apiVersion)
    headers = {'Content-type': 'application/json', 'api-key': apiKey}
    r = requests.delete(getServiceUrl() + servicePath, headers=headers)
    if not r.ok:
        print('Failed to delete index %s (HTTP %d): %s'
              % (indexName, r.status_code, r.text))
    return r

def getIndex():
    """Return True when a GET on the index named indexName answers 200, else False."""
    response = getMethod('/indexes/%s?api-version=%s' % (indexName, apiVersion))
    return response.status_code == 200

#### Helper functions to fetch one or more documents from the parsed content file

Note: In this exercise, a *document* corresponds to one row from the parsed content Excel file.

In [None]:
def _rowToDocument(row):
    """Turn one parsed-content row into an 'upload' action, or None to skip it.

    A row is skipped when its first mapped input column (fields_map[0][0]) is
    empty. Otherwise the result carries the row's 'Index' value as a string
    plus every field listed in fields_map under its target name.
    """
    if not row[fields_map[0][0]]:
        return None
    document = {'@search.action': 'upload', 'Index': str(row['Index'])}
    for in_fld, out_fld in fields_map:
        document[out_fld] = row[in_fld]
    return document


def getDocumentObject():
    """Build the indexing payload for all rows of the parsed content file.

    Rows without a value in the first mapped column are left out, so no
    action without a document key ends up in the payload.

    Returns:
        dict of the form {'value': [<upload action>, ...]}.
    """
    documents = []
    for row in pe.iget_records(file_name=inputfile):
        document = _rowToDocument(row)
        if document is not None:
            documents.append(document)
    return {'value': documents}


def getDocumentObjectByChunk(start, end):
    """Build the indexing payload for rows with start <= row position < end.

    Row positions are zero-based and count every row of the file, including
    rows that are skipped because their first mapped column is empty.

    Returns:
        dict of the form {'value': [<upload action>, ...]}; the list may be
        empty when the range holds no usable rows.
    """
    documents = []
    for i, row in enumerate(pe.iget_records(file_name=inputfile)):
        if start <= i < end:
            document = _rowToDocument(row)
            if document is not None:
                documents.append(document)
    return {'value': documents}

#### Main functions to upload and index documents in Azure Search

Three methods are provided:
- Upload all documents (rows) at once
- Upload documents in chunks
- Upload one document at a time

**Note:** Which method to choose depends on the content size, i.e., whether all documents fit in a single REST request or must be split across several.

In [None]:
# Upload content for indexing in one request if content is not too large
def uploadDocuments():
    """Upload every document from the parsed content file in a single request.

    Prints the response on success (HTTP 200). On any other status, prints
    the HTTP status code and the response body, then stops execution.

    Raises:
        SystemExit: with code 1 when the upload request fails.
    """
    documents = json.dumps(getDocumentObject())
    servicePath = '/indexes/' + indexName + '/docs/index?api-version=' + apiVersion
    r = postMethod(servicePath, documents)
    if r.status_code == 200:
        print('Success: %s' % r)
    else:
        print('Failure (HTTP %d): %s' % (r.status_code, r.text))
        # exit() is an interactive helper that is not guaranteed to be defined
        # or to stop execution in every environment; raise SystemExit directly.
        raise SystemExit(1)

# Upload content for indexing in chunks if content is too large for one request
def uploadDocumentsInChunks(chunksize):
    """Upload the parsed content in consecutive batches of chunksize rows.

    A failed batch is reported and the remaining batches are still attempted.

    Args:
        chunksize: maximum number of rows per request; must be at least 1.

    Raises:
        ValueError: if chunksize is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError('chunksize must be a positive integer')

    # First pass over the file only counts the rows
    cnt = sum(1 for _ in pe.iget_records(file_name=inputfile))

    # Ceiling division with integers: works on Python 2 and 3 and does not
    # produce a trailing empty chunk when cnt is a multiple of chunksize.
    numChunks = (cnt + chunksize - 1) // chunksize

    servicePath = '/indexes/' + indexName + '/docs/index?api-version=' + apiVersion
    for chunk in range(numChunks):
        print('Processing chunk number %d ...' % chunk)
        start = chunk * chunksize
        end = start + chunksize
        batch = getDocumentObjectByChunk(start, end)
        if not batch['value']:
            # Nothing to send for this range; avoid posting an empty batch
            print('Chunk number %d contains no documents, skipped' % chunk)
            continue
        r = postMethod(servicePath, json.dumps(batch))
        if r.status_code == 200:
            print('Success: %s' % r)
        else:
            print('Failure: %s' % r.text)

# Upload content for indexing one document at a time
def uploadDocumentsOneByOne():
    """Upload each row of the parsed content file in its own request.

    Rows whose first mapped column (fields_map[0][0]) is empty are skipped
    without sending a request. Each request carries exactly one document.
    Stops at the first failed request.

    Raises:
        SystemExit: with code 1 when a request fails.
    """
    servicePath = '/indexes/' + indexName + '/docs/index?api-version=' + apiVersion
    for i, row in enumerate(pe.iget_records(file_name=inputfile)):
        if not row[fields_map[0][0]]:
            continue

        outdict = {'@search.action': 'upload', 'Index': str(row['Index'])}
        for (in_fld, out_fld) in fields_map:
            outdict[out_fld] = row[in_fld]

        # Send only the current document; a list shared across iterations
        # would re-upload every earlier document with each request.
        documents = json.dumps({'value': [outdict]})
        r = postMethod(servicePath, documents)
        if r.status_code == 200:
            print('%d Success: %s' % (i, r))
        else:
            print('%d Failure: %s' % (i, r.text))
            # exit() is an interactive helper that is not guaranteed to be
            # defined or to stop execution in every environment.
            raise SystemExit(1)

#### Helper functions to check and query an index

In [None]:
def printDocumentCount():
    """Query the index's $count endpoint and print the response body."""
    servicePath = '/indexes/' + indexName + '/docs/$count?api-version=' + apiVersion
    r = getMethod(servicePath)
    # getMethod only returns the response, so it has to be printed here
    print(r.text)

def sampleQuery(query, ntop=3):
    """Run a search query against the index and print the raw response body.

    Args:
        query: plain (not yet URL-encoded) search text.
        ntop: maximum number of results to request via $top.
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Percent-encode the search text so that characters such as '&', '#' or
    # '+' cannot alter the structure of the query string.
    servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
        (apiVersion, quote(query, safe=''), ntop)
    r = getMethod(servicePath)
    # getMethod only returns the response, so it has to be printed here
    print(r.text)

### Create index and upload all parsed content

Now let's create the index, or delete and re-create the index if it exists, then upload all parsed documents in chunks. The small sample can be uploaded all at once, but the full tax code content would require multiple requests.

In [None]:
# Choose upload method to be used. Options: 'all', 'chunks' or 'one'
# (any other value falls back to uploading one document at a time)
upload_method     = 'chunks'
upload_chunk_size = 50

# raw_input only exists on Python 2; use input on Python 3
try:
    read_answer = raw_input
except NameError:
    read_answer = input

# Create index if it does not exist
if not getIndex():
    createIndex()
else:
    ans = read_answer('Index %s already exists ... Do you want to delete it? [Y/n]' % indexName)
    # Only an explicit 'y' deletes the index; surrounding whitespace is ignored
    if ans.strip().lower() == 'y':
        deleteIndex()
        print('Re-creating index %s ...' % indexName)
        createIndex()
    else:
        print('Index %s is not deleted ... New content will be added to existing index' % indexName)

if upload_method == 'all':
    uploadDocuments()
elif upload_method == 'chunks':
    uploadDocumentsInChunks(upload_chunk_size)
else:
    uploadDocumentsOneByOne()

# Verify and test the newly created index
printDocumentCount()
sampleQuery('child tax credit')

#### The content is now ready for interactive or batch queries, as demonstrated in step #3.