References:

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/assets/

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/src/college_search_index_builder.ipynb

In [21]:
import json
import os

from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col


In [22]:
# Azure Imports
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchIndex,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)

In [23]:
# Initialize SparkContext
# getOrCreate() hands back the already-running session when there is one, so
# re-running this cell does not start a second session.
spark = SparkSession.builder.getOrCreate()
# Keep a handle on the session's SparkContext; it is stopped in the last cell.
sc = spark.sparkContext

In [24]:
# Boilerplate code to connect to Azure Search Service
service_name = "college-search-tool"
# Never commit the admin key. It is read from the AZURE_SEARCH_ADMIN_KEY
# environment variable when that is set; when it is not, the key stays empty
# (as it was hard-coded before) and has to be filled in before the cells
# below can talk to the service.
admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")

index_name = "college-search-index"

# Create the SDK clients; both share the same endpoint and key credential.
endpoint = "https://{}.search.windows.net/".format(service_name)
credential = AzureKeyCredential(admin_key)

# Service-level client used to create and delete indexes. It is not bound to
# one index, so no index_name is passed; the index is named per call instead.
admin_client = SearchIndexClient(endpoint=endpoint,
                    credential=credential)

# Index-level client used to upload, search and fetch documents in index_name.
search_client = SearchClient(endpoint=endpoint,
                    index_name=index_name,
                    credential=credential)

In [34]:
# Remove previous index if it exists: only run if you want to replace all the data
# Any exception raised by the delete call is printed instead of propagating,
# so the cell completes either way; read the output to see which path ran.
try:
    result = admin_client.delete_index(index_name)
    print('Index {} deleted'.format(index_name))
except Exception as ex:
    print(ex)

Index college-search-index deleted


In [35]:
# import dataset
# Path is relative to the notebook's working directory.
df_filename = './data/college-search-data-v4.parquet'
# load() is called without an explicit format, so Spark's configured default
# data source is used - presumably parquet, matching the file extension
# (TODO confirm if spark.sql.sources.default has been overridden).
df = spark.read.load(df_filename)

## Start indexing

In [36]:
# build index
# One search field is created per dataframe column; the field type comes from
# the column -> datatype-name mapping stored in datatypes.json.

with open('./assets/datatypes.json') as f:
    datatypes = json.load(f)

# Azure Search type for each non-string datatype name. Anything outside this
# table (and "string") is rejected below instead of being silently indexed
# as a Boolean field.
simple_field_types = {
    "integer": SearchFieldDataType.Int64,
    "float": SearchFieldDataType.Double,
    "boolean": SearchFieldDataType.Boolean,
}

# Fail early and name every column that has no datatype entry, rather than
# stopping with a bare KeyError on the first one.
missing_columns = [column for column in df.columns if column not in datatypes]
if missing_columns:
    raise KeyError(
        "datatypes.json has no entry for column(s): {}".format(missing_columns))

search_fields = []
for column in df.columns:
    datatype = datatypes[column]
    if column == "UNITID":
        # UNITID is the document key, so it is always a String key field
        # regardless of the datatype listed for it.
        field = SimpleField(name=column, type=SearchFieldDataType.String, key=True)
    elif datatype == "string":
        field = SearchableField(name=column, type=SearchFieldDataType.String, searchable=True, sortable=True, filterable=True, facetable=True)
    elif datatype in simple_field_types:
        field = SimpleField(name=column, type=simple_field_types[datatype], filterable=True, sortable=True, facetable=True)
    else:
        raise ValueError(
            "Unsupported datatype {!r} for column {!r}; expected one of {}".format(
                datatype, column, ["string"] + sorted(simple_field_types)))
    search_fields.append(field)

# NOTE: "*" lets any origin query the index from a browser.
cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
scoring_profiles = []

print(len(search_fields))

627


In [37]:
# Assemble the index definition from the name, fields, scoring profiles and
# CORS options defined in the earlier cells.
index = SearchIndex(
    name=index_name,
    fields=search_fields,
    scoring_profiles=scoring_profiles,
    cors_options=cors_options)

# Create the index on the service. A failure is printed, not re-raised, so
# check this cell's output before running the upload cells.
try:
    result = admin_client.create_index(index)
    print ('Index', result.name, 'created')
except Exception as ex:
    print (ex)

Index college-search-index created


In [38]:
# Sanity check: serialise the first row to JSON and parse it back into a dict,
# which is the shape each document needs for uploading to the index.
first_row_json = df.toJSON().first()
print(json.loads(first_row_json))

{'UNITID': '106908', 'INSTNM': 'Paul Mitchell the School-Little Rock', 'CITY': 'North Little Rock', 'STABBR': 'AR', 'ZIP': '72117', 'ACCREDAGENCY': 'National Accrediting Commission of Career Arts and Sciences', 'INSTURL': 'paulmitchell.edu/littlerock', 'NPCURL': 'paulmitchell.edu/littlerock/programs/cosmetology', 'SCH_DEG': 1, 'MAIN': 1, 'NUMBRANCH': 1, 'PREDDEG': 1, 'HIGHDEG': 1, 'CONTROL': 3, 'ST_FIPS': 5, 'REGION': 5, 'LOCALE': 13, 'LATITUDE': 34.789913, 'LONGITUDE': -92.2222, 'CCBASIC': -2, 'CCUGPROF': -2, 'CCSIZSET': -2, 'HBCU': 0, 'PBI': 0, 'ANNHI': 0, 'TRIBAL': 0, 'AANAPII': 0, 'HSI': 0, 'NANTI': 0, 'MENONLY': 0, 'WOMENONLY': 0, 'PCIP01': 0.0, 'PCIP03': 0.0, 'PCIP04': 0.0, 'PCIP05': 0.0, 'PCIP09': 0.0, 'PCIP10': 0.0, 'PCIP11': 0.0, 'PCIP12': 1.0, 'PCIP13': 0.0, 'PCIP14': 0.0, 'PCIP15': 0.0, 'PCIP16': 0.0, 'PCIP19': 0.0, 'PCIP22': 0.0, 'PCIP23': 0.0, 'PCIP24': 0.0, 'PCIP25': 0.0, 'PCIP26': 0.0, 'PCIP27': 0.0, 'PCIP29': 0.0, 'PCIP30': 0.0, 'PCIP31': 0.0, 'PCIP38': 0.0, 'PCIP39': 0

In [40]:
# Turn every dataframe row into an index document: parse the row's JSON into a
# dict and tag it with the 'upload' search action.
docs = []
for row_json in df.toJSON().collect():
    doc = json.loads(row_json)
    doc['@search.action'] = 'upload'
    docs.append(doc)

In [41]:
# Try to upload docs and report whether *every* document was indexed,
# not just the first one.
try:
    result = search_client.upload_documents(documents=docs)
    # Each entry in result describes one document; collect the keys of the
    # ones the service rejected.
    failed_keys = [item.key for item in result if not item.succeeded]
    print("Upload of new document succeeded: {}".format(not failed_keys))
    if failed_keys:
        print("{} of {} documents failed; first failing keys: {}".format(
            len(failed_keys), len(result), failed_keys[:10]))
except Exception as ex:
    # Print the exception itself: not every exception type has a .message
    # attribute, and reading it here would hide the real error.
    print(ex)

Upload of new document succeeded: True


In [42]:
# Match-all query with the total count requested, to confirm that every
# uploaded document can be retrieved.
results = search_client.search(search_text="*", include_total_count=True)
total_matches = results.get_count()  # should be 6694

print('Total Documents Matching Query:', total_matches)

Total Documents Matching Query: 6694


In [43]:
# Search the "INSTNM" field for "University of Washington".
# search_mode="all" requires every search term to match, but not as an exact
# phrase, so names such as "University of Mary Washington" are returned too.
results = search_client.search(search_text="University of Washington", search_mode="all", search_fields=["INSTNM"])
# Print each hit's institution name next to its UNITID (the document key).
for result in results:
    print(result["INSTNM"], result["UNITID"])

University of Mary Washington 232681
University of Phoenix-Washington 432223
University of Washington-Seattle Campus 236948
University of Washington-Tacoma Campus 377564
University of Washington-Bothell Campus 377555
University of the Potomac-Washington DC Campus 384412


In [44]:
# After a search, if a user clicks on a particular University, we can directly get the corresponding document using the key (UNITID)
# Here we get the document corresponding to UW Seattle's id from the previous search
# The key is passed as a string because UNITID is defined as a String key field in the index.
result = search_client.get_document(key='236948')
print(result)

{'UNITID': '236948', 'INSTNM': 'University of Washington-Seattle Campus', 'CITY': 'Seattle', 'STABBR': 'WA', 'ZIP': '98195-4550', 'ACCREDAGENCY': 'Northwest Commission on Colleges and Universities', 'INSTURL': 'www.washington.edu/', 'NPCURL': 'www.washington.edu/students/osfa/prospectiveug/aid.est.1.html', 'SCH_DEG': 3, 'MAIN': 1, 'NUMBRANCH': 3, 'PREDDEG': 3, 'HIGHDEG': 4, 'CONTROL': 1, 'ST_FIPS': 53, 'REGION': 8, 'LOCALE': 11, 'LATITUDE': 47.65538, 'LONGITUDE': -122.30514, 'CCBASIC': 15, 'CCUGPROF': 14, 'CCSIZSET': 16, 'HBCU': 0, 'PBI': 0, 'ANNHI': 0, 'TRIBAL': 0, 'AANAPII': 0, 'HSI': 0, 'NANTI': 0, 'MENONLY': 0, 'WOMENONLY': 0, 'RELAFFIL': None, 'ADM_RATE_ALL': 0.5498, 'SATVR25': 600.0, 'SATVR75': 700.0, 'SATMT25': 620.0, 'SATMT75': 770.0, 'SATWR25': 530.0, 'SATWR75': 650.0, 'SATVRMID': 650.0, 'SATMTMID': 695.0, 'SATWRMID': 590.0, 'ACTCM25': 27.0, 'ACTCM75': 33.0, 'ACTEN25': 25.0, 'ACTEN75': 35.0, 'ACTMT25': 26.0, 'ACTMT75': 33.0, 'ACTWR25': 8.0, 'ACTWR75': 9.0, 'ACTCMMID': 30.0, 'A

In [45]:
# Shut down the SparkContext obtained at the top of the notebook; Spark cells
# need that initialisation cell re-run before they can be used again.
sc.stop()