References:

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/assets/

https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/src/college_search_index_builder.ipynb

In [1]:
import json
import os

from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as F


In [2]:
# Azure Imports
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchIndex,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)

In [3]:
# Get (or reuse) the Spark session, then keep a handle to its SparkContext
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [4]:
# Connect to the Azure Search service
service_name = "college-search-service"
index_name = "college-search-index"

# The admin key grants full control over the search service, so it must not be
# stored in the notebook. It is read from the environment instead.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed in version control and should be rotated.
admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
if not admin_key:
    raise RuntimeError(
        "Set the AZURE_SEARCH_ADMIN_KEY environment variable to the admin key "
        "of the '{}' search service before running this notebook".format(service_name)
    )

# Create the SDK clients; both share the same endpoint and credential
endpoint = "https://{}.search.windows.net/".format(service_name)
credential = AzureKeyCredential(admin_key)

# Service-level client used to create/delete indexes. The index name is passed
# to each call (e.g. delete_index(index_name)), not to the constructor.
admin_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Index-level client used to upload and query documents in `index_name`
search_client = SearchClient(endpoint=endpoint,
                    index_name=index_name,
                    credential=credential)

In [6]:
# Drop the previous index, if any, so it can be rebuilt.
# Only run this cell if you want to replace all the data.
try:
    result = admin_client.delete_index(index_name)
except Exception as ex:
    # Best effort: report the problem and let the notebook continue
    print(ex)
else:
    print('Index {} deleted'.format(index_name))

Index college-search-index deleted


In [5]:
# Load the college dataset into a Spark dataframe.
# The Parquet format is passed explicitly so the read does not depend on the
# session's `spark.sql.sources.default` setting.
df_filename = './data/college-search-data-v2.parquet'
df = spark.read.load(df_filename, format='parquet')

In [7]:
# Build the index schema: one search field per dataframe column

# Maps each column name to its declared datatype
with open('./assets/datatypes.json') as datatypes_file:
    datatypes = json.load(datatypes_file)

# Non-string columns share the same attributes and differ only in their index type
NUMERIC_FIELD_TYPES = {
    "integer": SearchFieldDataType.Int64,
    "float": SearchFieldDataType.Double,
}


def make_search_field(name, datatype):
    """Return the index field definition for column `name` given its declared datatype."""
    if name == "UNITID":
        # UNITID is the document key of the index
        return SimpleField(name=name, type=SearchFieldDataType.String, key=True)
    if datatype == "string":
        return SearchableField(name=name, type=SearchFieldDataType.String, searchable=True, sortable=True, filterable=True, facetable=True)
    # Any datatype other than string/integer/float is treated as boolean
    field_type = NUMERIC_FIELD_TYPES.get(datatype, SearchFieldDataType.Boolean)
    return SimpleField(name=name, type=field_type, filterable=True, sortable=True, facetable=True)


# Every column must have an entry in datatypes.json (a missing one raises KeyError)
search_fields = [make_search_field(column, datatypes[column]) for column in df.columns]

cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
scoring_profiles = []

print(len(search_fields))

551


In [8]:
# Assemble the index definition from the fields and options built earlier
index = SearchIndex(name=index_name, fields=search_fields, scoring_profiles=scoring_profiles, cors_options=cors_options)

# Create the index on the service; on failure, print the error and continue
try:
    result = admin_client.create_index(index)
except Exception as ex:
    print(ex)
else:
    print('Index', result.name, 'created')

Index college-search-index created


In [9]:
# Sanity check: parse the first row's JSON, as needed for uploading to the index
first_row_json = df.toJSON().first()
print(json.loads(first_row_json))

{'UNITID': '100654', 'INSTNM': 'Alabama A & M University', 'CITY': 'Normal', 'STABBR': 'AL', 'ZIP': '35762', 'ACCREDAGENCY': 'Southern Association of Colleges and Schools Commission on Colleges', 'INSTURL': 'www.aamu.edu/', 'NPCURL': 'www.aamu.edu/admissions-aid/tuition-fees/net-price-calculator.html', 'SCH_DEG': 3, 'MAIN': 1, 'NUMBRANCH': 1, 'PREDDEG': 3, 'HIGHDEG': 4, 'CONTROL': 1, 'ST_FIPS': 1, 'REGION': 5, 'LOCALE': 12, 'LATITUDE': 34.783367, 'LONGITUDE': -86.568504, 'CCBASIC': 18, 'CCUGPROF': 10, 'CCSIZSET': 14, 'HBCU': 1, 'PBI': 0, 'ANNHI': 0, 'TRIBAL': 0, 'AANAPII': 0, 'HSI': 0, 'NANTI': 0, 'MENONLY': 0, 'WOMENONLY': 0, 'ADM_RATE_ALL': 0.9175, 'SATVR25': 430.0, 'SATVR75': 510.0, 'SATMT25': 410.0, 'SATMT75': 500.0, 'SATWR25': 370.0, 'SATWR75': 457.0, 'SATVRMID': 470.0, 'SATMTMID': 455.0, 'SATWRMID': 414.0, 'ACTCM25': 15.0, 'ACTCM75': 19.0, 'ACTEN25': 14.0, 'ACTEN75': 20.0, 'ACTMT25': 15.0, 'ACTMT75': 18.0, 'ACTCMMID': 17.0, 'ACTENMID': 17.0, 'ACTMTMID': 17.0, 'SAT_AVG': 939.0, 'S

In [10]:
# Turn every dataframe row into a dict tagged with the 'upload' index action
docs = [
    {**json.loads(row_json), '@search.action': 'upload'}
    for row_json in df.toJSON().collect()
]

In [11]:
# Upload all docs and verify that every one of them was indexed
try:
    result = search_client.upload_documents(documents=docs)
    # One indexing result comes back per document; collect the ones that failed
    failed = [item for item in result if not item.succeeded]
    print("Upload of new document succeeded: {}".format(not failed))
    if failed:
        print("{} of {} documents failed to upload".format(len(failed), len(result)))
        # Show only the first few failures to keep the cell output readable
        for item in failed[:10]:
            print("  key={} status={} error={}".format(item.key, item.status_code, item.error_message))
except Exception as ex:
    # Generic exceptions have no `.message` attribute in Python 3; print the
    # exception itself so the handler cannot raise and hide the real error.
    print(ex)

Upload of new document succeeded: True


In [12]:
# Query for everything and request the total count to check that all documents are retrievable
results = search_client.search(search_text="*", include_total_count=True)
total_matches = results.get_count()  # should be 6694

print('Total Documents Matching Query:', total_matches)

Total Documents Matching Query: 6694


In [13]:
# Look up "University of Washington" in the "INSTNM" field only and list each hit's name and id
results = search_client.search(
    search_text="University of Washington",
    search_mode="all",
    search_fields=["INSTNM"],
)
for college in results:
    print(college["INSTNM"], college["UNITID"])

University of Mary Washington 232681
University of Phoenix-Washington 432223
University of Washington-Seattle Campus 236948
University of Washington-Bothell Campus 377555
University of Washington-Tacoma Campus 377564
University of the Potomac-Washington DC Campus 384412


In [None]:
# When a user clicks a university in the search results, its document can be fetched directly by key (UNITID).
# '236948' is the UNITID printed for UW Seattle by the previous search.
uw_seattle_doc = search_client.get_document(key='236948')
print(uw_seattle_doc)

In [15]:
# Stop the SparkContext obtained from the Spark session earlier in the notebook
sc.stop()