In [13]:
# Install the 'elasticsearch' Python package
# This library allows Python to interact with an Elasticsearch cluster
!pip install elasticsearch



In [3]:
# Start the Docker containers described by the given docker-compose.yml in detached mode (-d).
# NOTE(review): the compose file is referenced through an absolute, machine-specific path,
# so this cell only runs as-is on the author's machine; consider a path relative to the notebook.
!docker compose -f '/Users/daniel/Documents/Northwestern/MSDS 420 - Database Systems/Assignment_6/docker-compose.yml' up -d

[1A[1B[0G[?25l[+] Running 0/1
 [33m⠋[0m elasticsearch Pulling                                                   [34m0.1s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠙[0m elasticsearch Pulling                                                   [34m0.2s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠹[0m elasticsearch Pulling                                                   [34m0.3s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠸[0m elasticsearch Pulling                                                   [34m0.4s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠼[0m elasticsearch Pulling                                                   [34m0.5s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠴[0m elasticsearch Pulling                                                   [34m0.6s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 [33m⠦[0m elasticsearch Pulling                                                   [34m0.7s [0m
[?25h[1A[1A[0G[?25l[+] Runni

In [11]:
# Import the pandas library and alias it as 'pd'
# Pandas is used for data manipulation and analysis, especially working with tabular data structures like DataFrames
import pandas as pd

In [7]:
from elasticsearch import Elasticsearch, helpers
import traceback

try:
    # Create an Elasticsearch client (no certs because xpack security is disabled)
    es = Elasticsearch("http://localhost:9200", verify_certs=False)

    # Test connection.
    # ping() signals failure by returning False instead of raising, so an unreachable
    # cluster ends up in the `else` branch below rather than in the `except` handler.
    ping_result = es.ping()

    if ping_result:
        print("✅ Connected to Elasticsearch!")
    else:
        # The previous wording claimed the client was "connected", which is not
        # known here: a False ping also covers a server that cannot be reached.
        print("❌ Ping failed — Elasticsearch is unreachable or did not return a successful response.")

except Exception as e:
    # Errors raised while building the client (e.g. bad configuration) or by the
    # request itself: print the details together with the full traceback.
    print("❌ Failed to connect to Elasticsearch.")
    print("Error type:", type(e).__name__)
    print("Error message:", e)
    traceback.print_exc()

✅ Connected to Elasticsearch!


In [13]:
# Child-care facility types of interest: a document qualifies when its
# 'Facility Type' matches any one of these phrases.
facility_type_clauses = [
    {'match_phrase': {'Facility Type': 'Daycare (2 - 6 Years)'}},
    {'match_phrase': {'Facility Type': 'Daycare Above and Under 2 Years'}},
    {'match_phrase': {'Facility Type': 'CHILDRENS SERVICES FACILITY'}},
]

# Conditions every document has to satisfy: a failed inspection at a high-risk facility.
required_filters = [
    {'match': {'Results': 'Fail'}},
    {'match_phrase': {'Risk': 'Risk 1 (High)'}},
]

# The word 'GARBAGE' has to appear in the Violations text.
garbage_clause = {'match': {'Violations': 'GARBAGE'}}

# Assemble the bool query; at most 10,000 documents are requested.
query = {
    'size': 10000,
    'query': {
        'bool': {
            'should': facility_type_clauses,
            'minimum_should_match': 1,  # at least one facility-type phrase must match
            'filter': required_filters,
            'must': [garbage_clause],
        }
    },
}

# Run the search against the 'food_inspections' index.
response = es.search(index="food_inspections", body=query)

# Keep only the document body ('_source') of each hit and tabulate the records.
hits = [record['_source'] for record in response['hits']['hits']]
df = pd.DataFrame(hits)

# Show the resulting DataFrame.
df

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,1948254,CENTRO INFANTIL,CENTRO INFANTIL,2215516.0,Daycare Above and Under 2 Years,Risk 1 (High),2739 W DIVISION ST,CHICAGO,IL,60622.0,07/27/2016,Canvass,Fail,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE A...,41.902822,-87.695990,"(41.9028223612357, -87.69599039701163)"
1,1609799,HIGH RIDGE YMCA,HIGH RIDGE YMCA,2215705.0,Daycare Above and Under 2 Years,Risk 1 (High),2424 W TOUHY AVE,CHICAGO,IL,60645.0,01/15/2016,Canvass,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",42.012369,-87.691259,"(42.01236867057877, -87.6912594351893)"
2,545745,"V & J DAY CARE CENTER, INC","V & J DAY CARE CENTER, INC",21702.0,Daycare Above and Under 2 Years,Risk 1 (High),1 E 113TH ST,CHICAGO,IL,60628.0,01/13/2012,Canvass,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.688848,-87.622894,"(41.688848119400404, -87.6228936894908)"
3,545746,"V & J DAY CARE CENTER, INC","V & J DAY CARE CENTER, INC",1800876.0,Daycare Above and Under 2 Years,Risk 1 (High),1 E 113TH ST,CHICAGO,IL,60628.0,01/13/2012,Canvass,Fail,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE A...,41.688848,-87.622894,"(41.688848119400404, -87.6228936894908)"
4,2059748,PAULO FREIRE CENTER,PAULO FREIRE CENTER,2215569.0,Daycare Above and Under 2 Years,Risk 1 (High),1653 W 43RD ST,CHICAGO,IL,60609.0,06/07/2017,License,Fail,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE A...,41.815742,-87.667348,"(41.81574158020701, -87.66734779620032)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,1386187,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,2215472.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,06/08/2015,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)"
72,335234,MC CANN DAY CARE CENTER,MC CANN DAY CARE CENTER,6527.0,Daycare (2 - 6 Years),Risk 1 (High),8612 S STONY ISLAND AVE,CHICAGO,IL,60617.0,08/09/2010,License Re-Inspection,Fail,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.738301,-87.585818,"(41.738300894823446, -87.58581841641303)"
73,233886,"FIRST STEP LEARNING DAYCARE, INCORPORATED","FIRST STEP LEARNING DAYCARE, INCORPORATED",1991985.0,Daycare (2 - 6 Years),Risk 1 (High),6401 S ASHLAND AVE,CHICAGO,IL,60636.0,05/17/2010,License Re-Inspection,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777567,-87.664021,"(41.77756686461297, -87.66402057353883)"
74,555222,PARKWAY COMMUNITY HOUSE,PARKWAY COMMUNITY HOUSE,27807.0,Daycare (2 - 6 Years),Risk 1 (High),500 E 67TH ST,CHICAGO,IL,60637.0,07/22/2011,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.773123,-87.612988,"(41.773123195333895, -87.61298770791186)"


In [15]:
# Search for the word "Children" with Elasticsearch's query_string parser.
# No 'size' is specified, so only the server's default first page of hits comes back.
searched_fields = ["Facility Type", "Violations", "DBA Name"]

query = {
    "query": {
        "query_string": {
            "query": "Children",        # the search term
            "fields": searched_fields,  # text fields the term is looked up in
        }
    }
}

# Send the query to the 'food_inspections' index.
response = es.search(index="food_inspections", body=query)

# Pull the document body out of every hit and load the records into a DataFrame.
hits = [record['_source'] for record in response['hits']['hits']]
df = pd.DataFrame(hits)

# Show the DataFrame.
df

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,1931574,JOLLY TROLLEY ACADEMY INC.,JOLLY TROLLEY ACADEMY INC.,2379063.0,Daycare Above and Under 2 Years,Risk 1 (High),9522-9524 S VINCENNES AVE,CHICAGO,IL,60643.0,05/25/2016,Canvass,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.720777,-87.650996,"(41.72077653099484, -87.6509957122514)"
1,2144717,"IMANI CHILDRENS ACADEMY, INC.","IMANI CHILDRENS ACADEMY, INC.",2215975.0,Daycare Above and Under 2 Years,Risk 1 (High),11443 S HALSTED ST,CHICAGO,IL,60628.0,02/08/2018,License,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.685628,-87.642023,"(41.68562832361589, -87.64202316137356)"
2,1932396,"LITTLE HANDS LEARNING CENTER ACADEMY, INC","LITTLE HANDS LEARNING CENTER ACADEMY, INC",2216116.0,1023 CHILDERN'S SERVICES FACILITY,Risk 1 (High),10126 S WESTERN AVE,CHICAGO,IL,60643.0,06/09/2016,Canvass,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.709217,-87.681854,"(41.70921709772374, -87.68185356852545)"
3,1946700,"TEENCHY WEENCHY DAYCARE & LEARNING CENTER INC,",TEENCHY WEENCHY DAYCARE,2216039.0,Daycare Above and Under 2 Years,Risk 1 (High),901 E 104TH ST,CHICAGO,IL,60628.0,07/18/2016,License,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.705567,-87.601074,"(41.70556659418179, -87.60107403032647)"
4,531385,THE WOODLAWN ORGANIZATION,THE WOODLAWN ORGANIZATION,1718117.0,Daycare (2 - 6 Years),Risk 1 (High),950 E 61ST ST,CHICAGO,IL,60637.0,05/26/2011,License,Fail,"26. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, P...",41.784209,-87.602383,"(41.78420897848061, -87.60238292698212)"
5,531384,THE WOODLAWN ORGANIZATION,THE WOODLAWN ORGANIZATION,5663.0,Daycare (2 - 6 Years),Risk 1 (High),950 E 61ST ST,CHICAGO,IL,60637.0,05/26/2011,License,Fail,"26. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, P...",41.784209,-87.602383,"(41.78420897848061, -87.60238292698212)"
6,1360343,FAIRYLAND NURSERY SCHOOL INC,FAIRYLAND NURSERY SCHOOL,1771126.0,Daycare (2 - 6 Years),Risk 1 (High),4350 N MILWAUKEE AVE,CHICAGO,IL,60641.0,08/19/2013,Canvass,Pass,"41. PREMISES MAINTAINED FREE OF LITTER, UNNECE...",41.960051,-87.75415,"(41.96005114701864, -87.75415039630879)"
7,1932125,JUDAH FOUNDATION COMMUNITY SCHOOL,JUDAH INTERNATIONAL OUTREACH MINISTRIES,2278892.0,Daycare (2 - 6 Years),Risk 1 (High),856 N PULASKI RD,CHICAGO,IL,60651.0,06/03/2016,Recent Inspection,Pass,38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS...,41.897049,-87.726282,"(41.897049354437264, -87.72628204229734)"
8,1496743,ALL ABOUT KIDS LEARNING ACADEMY,ALL ABOUT KIDS LEARNING ACADEMY,2215832.0,Daycare Combo 1586,Risk 1 (High),512-514 E 75TH ST,CHICAGO,IL,60619.0,09/16/2014,License,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.758572,-87.612116,"(41.75857224813092, -87.61211561154835)"
9,1527414,THE CHALKBOARD INC,THE CHALKBOARD,2215547.0,Daycare (2 - 6 Years),Risk 1 (High),450 W MENOMONEE ST,CHICAGO,IL,60614.0,03/03/2015,License Re-Inspection,Pass,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",41.914777,-87.640667,"(41.91477744637897, -87.64066696188081)"


In [17]:
# Wildcard query_string search for "Children" in three text fields,
# restricted to failed inspections of high-risk facilities.
query = {
    'size': 10000,  # Return up to 10,000 documents
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},  # Filter to only failed inspections
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},  # Filter to only high-risk facilities
                {
                    "query_string": {
                        "query": "*Children*",  # Wildcard search to simulate regex-like matching
                        "fields": ["Facility Type", "Violations", "DBA Name"]  # Fields to search in
                    }
                }
            ]
        }
    }
}

# Execute a plain (non-scrolling) search.
# Only this single response page is read below, so no scroll context is requested:
# a context opened with scroll='1h' and never consumed or cleared would stay
# allocated on the cluster until its keep-alive runs out.
results = es.search(index='food_inspections', body=query)

# Extract matching documents
hits = [hit['_source'] for hit in results['hits']['hits']]

# Convert to DataFrame
df_exp1 = pd.DataFrame(hits)

# Display the first few results
df_exp1.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2144534,5 STAR CHILDCARE DEVELOPMENT CENTER INC.,5 STAR CHILDCARE DEVELOPMENT CENTER,2575240.0,Children's Services Facility,Risk 1 (High),2811 W FIFTH AVE,CHICAGO,IL,60612.0,02/06/2018,License,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.880741,-87.697071,"(41.88074107893008, -87.69707112081443)"
1,2144519,EZZARD CHARLES DAYCARE CENTER,EZZARD CHARLES DAYCARE CENTER,2215585.0,Daycare (2 - 6 Years),Risk 1 (High),7946 S ASHLAND AVE,CHICAGO,IL,60620.0,02/06/2018,Complaint Re-Inspection,Fail,"14. PREVIOUS SERIOUS VIOLATION CORRECTED, 7-42...",41.748958,-87.663555,"(41.74895804211779, -87.66355545886083)"
2,2144366,EZZARD CHARLES DAYCARE CENTER,EZZARD CHARLES DAYCARE CENTER,2215585.0,Daycare (2 - 6 Years),Risk 1 (High),7946 S ASHLAND AVE,CHICAGO,IL,60620.0,02/01/2018,Complaint Re-Inspection,Fail,"14. PREVIOUS SERIOUS VIOLATION CORRECTED, 7-42...",41.748958,-87.663555,"(41.74895804211779, -87.66355545886083)"
3,2136047,KIDZ CREATIVE CORNER,KIDZ CREATIVE CORNER,2327452.0,Children's Services Facility,Risk 1 (High),3811 N LINCOLN AVE,CHICAGO,IL,60613.0,01/26/2018,Canvass,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",41.950929,-87.676346,"(41.95092894222622, -87.67634617680221)"
4,2135927,THE NEXT GENERATION DAYCARE,THE NEXT GENERATION DAYCARE,2240995.0,Children's Services Facility,Risk 1 (High),21-23 E 59TH ST,CHICAGO,IL,60637.0,01/24/2018,License,Fail,22. DISH MACHINES: PROVIDED WITH ACCURATE THER...,41.787269,-87.624622,"(41.78726854392145, -87.62462230048358)"


In [25]:
# Wildcard term for query_string: the surrounding '*' let "Children" match
# inside longer tokens, which approximates a regular-expression search.
children_wildcard_clause = {
    "query_string": {
        "query": "*Children*",
        "fields": ["Facility Type", "Violations", "DBA Name"],
    }
}

# Failed, high-risk inspections whose text fields match the wildcard term.
# Each scroll page carries at most 10,000 hits.
query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                children_wildcard_clause,
            ]
        }
    },
}

# Initial search; scroll='1h' asks the server to keep the scroll context for one hour.
results = es.search(index='food_inspections', body=query, scroll='1h')

# Scroll ID (needed to request the next batch) and the 'total' entry of the response.
sid = results['_scroll_id']
scroll_size = results['hits']['total']
print('sid =', sid)
print('Scroll Size =', scroll_size)

# Document bodies of the first batch.
hits = [record['_source'] for record in results['hits']['hits']]

sid = FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFlN6Vmgtc3B3VEo2NVc4d0hjNndMY2cAAAAAAAAACBZFekFQU1p3dVJkV0hTV1luX1BoRFpB
Scroll Size = {'value': 601, 'relation': 'eq'}


In [27]:
# --------------------------------------------
# NOTE:
# Nothing is sent to Elasticsearch in this cell; it only reads two values
# out of the existing 'results' response:
# - '_scroll_id': the handle used to page through the result set.
# - ['hits']['total']: the response's entry describing the total match count.
# --------------------------------------------

# Read both values from the response in one step.
sid, scroll_size = results['_scroll_id'], results['hits']['total']

# Show them for inspection.
print('sid = ', sid)
print('Scroll Size = ', scroll_size)

sid =  FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFlN6Vmgtc3B3VEo2NVc4d0hjNndMY2cAAAAAAAAACBZFekFQU1p3dVJkV0hTV1luX1BoRFpB
Scroll Size =  {'value': 601, 'relation': 'eq'}


In [29]:
type(results) # Type of the response object; it is indexed like a dict, e.g. results['hits']

elastic_transport.ObjectApiResponse

In [31]:
results.keys() # Top-level keys of the response; the matching documents are in the list results['hits']['hits']

dict_keys(['_scroll_id', 'took', 'timed_out', '_shards', 'hits'])

In [33]:
# Report the 'took' value of the response, presented here as the query time in milliseconds.
print(f"The query took {results['took']} milliseconds to execute.")

The query took 72 milliseconds to execute.


In [35]:
# Show only the first few hits: each hit carries a long 'Violations' text, so rendering the
# whole list floods the notebook. The value of the "_source" key is the content of the document.
results['hits']['hits'][:3]

[{'_index': 'food_inspections',
  '_id': '2144534',
  '_score': 3.573621,
  '_ignored': ['Violations.keyword'],
  '_source': {'Inspection ID': 2144534,
   'DBA Name': '5 STAR CHILDCARE DEVELOPMENT CENTER INC.',
   'AKA Name': '5 STAR CHILDCARE DEVELOPMENT CENTER',
   'License #': 2575240.0,
   'Facility Type': "Children's Services Facility",
   'Risk': 'Risk 1 (High)',
   'Address': '2811 W FIFTH AVE ',
   'City': 'CHICAGO',
   'State': 'IL',
   'Zip': 60612.0,
   'Inspection Date': '02/06/2018',
   'Inspection Type': 'License',
   'Results': 'Fail',
   'Violations': "11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIGNED, AND MAINTAINED - Comments: NO EXPOSED HANDWASHING SINK PROVIDED IN THE FOOD PREP AREA.  MANAGEMENT INSTRUCTED TO INSTALL AN EXPOSED HANDWASHING SINK IN THE FOOD PREP AREA AND SUPPLY IT WITH SOAP AND PAPER TOWELS AT ALL TIMES.\nCRITICAL VIOLATION 7-38-030. | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED A

In [37]:
# Replace scroll_size (until now the response's 'total' entry) with the number of hits
# in the current batch, so that it is a plain integer the scroll loop can compare with 0.
scroll_size = len(results['hits']['hits'])  
print(scroll_size)

601


In [39]:
# Walk through every batch of the scroll and collect one [Latitude, Longitude]
# pair per document that has usable location data.

# Number of documents seen, whether or not they have coordinates.
count = 0

# Collected [Latitude, Longitude] pairs.
list_of_lAT_LONG_pairs = []

# Keep going while the current batch contains hits.
while scroll_size > 0:

    for inspection in results['hits']['hits']:
        # The document body of this hit.
        document = inspection['_source']
        count += 1

        # A pair is recorded only when 'Latitude', 'Longitude' and 'Address' are all
        # present and not None (identity checks with `is not None`, as in the later loops).
        has_location = all(
            document.get(field) is not None
            for field in ('Latitude', 'Longitude', 'Address')
        )
        current_location_lAT_LONG = []
        if has_location:
            current_location_lAT_LONG = [
                float(document['Latitude']),
                float(document['Longitude']),
            ]
            list_of_lAT_LONG_pairs.append(current_location_lAT_LONG)

    # Ask for the next batch and refresh the scroll ID and batch size.
    results = es.scroll(scroll_id=sid, scroll='2m')
    sid = results['_scroll_id']
    scroll_size = len(results['hits']['hits'])

# Print the total number of documents processed.
print("the total number of match with children using wild card:", count)


the total number of match with children using wild card: 601


In [41]:
# Field names of the last document handled by the loop above
document.keys()

dict_keys(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type', 'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date', 'Inspection Type', 'Results', 'Violations', 'Latitude', 'Longitude', 'Location'])

In [43]:
list_of_lAT_LONG_pairs[:3] # The first 3 [Latitude, Longitude] pairs on the list.

[[41.8807410789, -87.6970711208],
 [41.7489580421, -87.6635554589],
 [41.7489580421, -87.6635554589]]

In [45]:
len(list_of_lAT_LONG_pairs) # Number of coordinate pairs collected; 601 here, the same as the number of documents processed.

601

In [47]:
# Install the 'folium' package using pip in the current Jupyter kernel.
# Note: This cell only needs to be executed once. After installation, you can comment it out or delete it to prevent repeated installations in future runs.
!pip install folium



In [49]:
# folium is used below to draw the Chicago map; plugins supplies the HeatMap layer.
import folium
from folium import plugins

# Show which folium version is installed.
print(folium.__version__)

0.19.5


In [51]:
# Base map centred on the given Chicago coordinates.
chicago_center = [41.90293279, -87.70769386]
chicago_map = folium.Map(location=chicago_center, zoom_start=11)
chicago_map

In [53]:
# Let's plot the query matches as a heat layer on the Chicago map.
# Note that this adds the layer to the existing chicago_map object.
heat_layer = plugins.HeatMap(list_of_lAT_LONG_pairs, radius=15)
chicago_map.add_child(heat_layer)
chicago_map

In [55]:
# -------------------------------------------------------------
# EXPERIMENT #2: fuzzy search for "Children's~2"
# The '~2' suffix asks query_string for terms within an edit distance of 2
# of "Children's", so near-miss spellings are matched as well.
# Fields searched: "Facility Type", "Violations", "DBA Name"
# -------------------------------------------------------------

# The fuzzy term clause.
fuzzy_childrens_clause = {
    "query_string": {
        "query": "Children's~2",
        "fields": ["Facility Type", "Violations", "DBA Name"],
    }
}

# Failed, high-risk inspections that also match the fuzzy term; up to 10,000 hits per page.
query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                fuzzy_childrens_clause,
            ]
        }
    },
}

# Run the search with a scroll context kept for one hour.
results = es.search(index='food_inspections', body=query, scroll='1h')

In [57]:
# Scroll ID returned by the initial search of Experiment #2;
# it is the handle for requesting further batches through the scroll API
sid = results['_scroll_id']

# The response's 'total' entry for the fuzzy search.
# Note: this is the whole entry, not yet a plain integer.
scroll_size = results['hits']['total']

In [59]:
# Print the scroll ID that identifies the scroll context (used for pagination)
print('sid = ', sid)

# Print the 'total' entry describing how many documents matched the query
print('Scroll Size = ', scroll_size)

sid =  FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFlN6Vmgtc3B3VEo2NVc4d0hjNndMY2cAAAAAAAAACRZFekFQU1p3dVJkV0hTV1luX1BoRFpB
Scroll Size =  {'value': 611, 'relation': 'eq'}


In [61]:
# scroll_size currently holds the 'total' entry, a dict with 'value' and 'relation';
# keep only the integer under 'value' so it can be compared with 0 in the scroll loop
scroll_size = scroll_size['value']

In [63]:
# Walk through every batch of the Experiment #2 scroll ("Children's~2") and collect
# one [Latitude, Longitude] pair per document that has usable location data.

# Number of documents seen, whether or not they have coordinates.
count = 0

# Collected [Latitude, Longitude] pairs.
list_of_lAT_LONG_pairs = []

# Keep going while the current batch contains hits.
while scroll_size > 0:

    for inspection in results['hits']['hits']:
        # Temporary list for this document's [Latitude, Longitude].
        current_location_lAT_LONG = []

        # The document body of this hit.
        document = inspection['_source']
        count += 1

        # Skip documents where 'Latitude', 'Longitude' or 'Address' is absent or None.
        if any(document.get(field) is None for field in ('Latitude', 'Longitude', 'Address')):
            continue

        current_location_lAT_LONG.append(float(document['Latitude']))
        current_location_lAT_LONG.append(float(document['Longitude']))
        list_of_lAT_LONG_pairs.append(current_location_lAT_LONG)

    # Fetch the next batch, then refresh the scroll ID and the batch size.
    results = es.scroll(scroll_id=sid, scroll='2m')
    sid = results['_scroll_id']
    scroll_size = len(results['hits']['hits'])

# Report the number of matches. The label now names the query that was actually run
# (fuzzy "Children's~2"); it previously described the wildcard experiment.
print("The total number of matches with \"Children's\" using fuzziness:", count)

The total number of matches with 'Children' using wildcard: 611


In [65]:
# -------------------------------------------------------------
# EXPERIMENT #3: fuzzy search for "Children~2"
# The '~2' suffix asks query_string for terms within an edit distance of 2
# of "Children", which tolerates typos and small word-form variations.
# Fields searched: "Facility Type", "Violations", "DBA Name"
# -------------------------------------------------------------

# Start with the two conditions shared by all experiments: failed, high-risk inspections.
must_clauses = [
    {'match': {'Results': 'Fail'}},
    {'match_phrase': {'Risk': 'Risk 1 (High)'}},
]

# Add the fuzzy term clause of this experiment.
must_clauses.append({
    "query_string": {
        "query": "Children~2",
        "fields": ["Facility Type", "Violations", "DBA Name"],
    }
})

# Up to 10,000 hits per page.
query = {
    'size': 10000,
    'query': {'bool': {'must': must_clauses}},
}

# Search the 'food_inspections' index, keeping the scroll context for one hour.
results = es.search(index='food_inspections', body=query, scroll='1h')

In [67]:
# Scroll ID returned by the initial search of Experiment #3;
# it is the handle for requesting the next batch of documents through the scroll API
sid = results['_scroll_id']

# The response's 'total' entry for the fuzzy query "Children~2".
# Note: this is the whole entry, not yet a plain integer.
scroll_size = results['hits']['total']

In [69]:
# Print the scroll ID that identifies the scroll context of Experiment #3
print('sid = ', sid)

# Print the 'total' entry describing how many documents matched in Experiment #3
print('Scroll Size = ', scroll_size)

sid =  FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFlN6Vmgtc3B3VEo2NVc4d0hjNndMY2cAAAAAAAAAChZFekFQU1p3dVJkV0hTV1luX1BoRFpB
Scroll Size =  {'value': 602, 'relation': 'eq'}


In [71]:
# scroll_size currently holds the 'total' entry, a dict with 'value' and 'relation';
# keep only the integer under 'value' so it can be compared with 0 in the scroll loop
scroll_size = scroll_size['value']

In [73]:
# Walk through every batch of the Experiment #3 scroll and gather the
# [Latitude, Longitude] pair of each document that has complete location data.

count = 0                    # documents seen so far
list_of_lAT_LONG_pairs = []  # gathered coordinate pairs

# Loop until a batch comes back without hits.
while scroll_size > 0:

    for inspection in results['hits']['hits']:
        document = inspection['_source']  # the document body of this hit
        count += 1

        # All three fields have to be present and not None for the pair to be kept.
        has_all_fields = all(
            document.get(field) is not None
            for field in ('Latitude', 'Longitude', 'Address')
        )

        # Either the float coordinate pair or an empty list when data is missing.
        current_location_lAT_LONG = (
            [float(document['Latitude']), float(document['Longitude'])]
            if has_all_fields
            else []
        )
        if current_location_lAT_LONG:
            list_of_lAT_LONG_pairs.append(current_location_lAT_LONG)

    # Request the following batch, then refresh the scroll ID and the batch size.
    results = es.scroll(scroll_id=sid, scroll='2m')
    sid = results['_scroll_id']
    scroll_size = len(results['hits']['hits'])

# Report how many documents this fuzzy search matched.
print("Total number of matches with 'Children' using fuzziness:", count)

Total number of matches with 'Children' using fuzziness: 602


In [75]:
# Let's plot the most recent query matches on a Chicago heatmap.
# NOTE(review): list_of_lAT_LONG_pairs was last filled by the Experiment #3 loop
# ("Children~2"), not by the "Children's" query — confirm which result set is intended here.

# Create a fresh folium map centered around Chicago
chicago_map = folium.Map(location=[41.90293279, -87.70769386], zoom_start=11)

# Add a heatmap layer using the list of [Latitude, Longitude] pairs;
# radius=15 is the radius passed to the HeatMap layer for each point
chicago_map.add_child(plugins.HeatMap(list_of_lAT_LONG_pairs, radius=15))

# Display the map
chicago_map

In [77]:
# Facility types that serve children; matching any one of them is enough.
childcare_facility_clauses = [
    {'match_phrase': {'Facility Type': 'Daycare (2 - 6 Years)'}},
    {'match_phrase': {'Facility Type': 'Daycare Above and Under 2 Years'}},
    {'match_phrase': {'Facility Type': 'CHILDRENS SERVICES FACILITY'}},
]

# Every document must be a failed inspection of a high-risk facility.
failed_high_risk_filters = [
    {'match': {'Results': 'Fail'}},
    {'match_phrase': {'Risk': 'Risk 1 (High)'}},
]

# Group the matching inspections by exact facility name ('DBA Name.keyword'),
# keep only names with 5 or more matching documents (up to 10,000 groups), and
# attach up to 10 inspection records to each group.
frequent_violators_agg = {
    'terms': {
        'field': 'DBA Name.keyword',
        'min_doc_count': 5,
        'size': 10000
    },
    'aggs': {
        'top_dba_hits': {
            'top_hits': {'size': 10}
        }
    }
}

# Full request: up to 10,000 matching documents plus the aggregation.
query = {
    'size': 10000,
    'query': {
        'bool': {
            'should': childcare_facility_clauses,
            'minimum_should_match': 1,
            'filter': failed_high_risk_filters,
        }
    },
    'aggs': {'selected_dbas': frequent_violators_agg},
}

# Run the search on the 'food_inspections' index with a scroll context kept for 1 hour.
# NOTE(review): the next cells read only results["aggregations"]; confirm whether the
# scroll context is consumed anywhere later, otherwise scroll='1h' can be dropped.
results = es.search(index='food_inspections', body=query, scroll='1h')

In [79]:
results # Display the full response (hits and aggregations).

ObjectApiResponse({'_scroll_id': 'FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFlN6Vmgtc3B3VEo2NVc4d0hjNndMY2cAAAAAAAAACxZFekFQU1p3dVJkV0hTV1luX1BoRFpB', 'took': 306, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 819, 'relation': 'eq'}, 'max_score': 12.486206, 'hits': [{'_index': 'food_inspections', '_id': '2116981', '_score': 12.486206, '_ignored': ['Violations.keyword'], '_source': {'Inspection ID': 2116981, 'DBA Name': 'PATHWAYS TO LEARNING CHILD CARE CENTER', 'AKA Name': 'PATHWAYS TO LEARNING CHILD CARE CENTER', 'License #': 2215780.0, 'Facility Type': 'CHILDRENS SERVICES FACILITY', 'Risk': 'Risk 1 (High)', 'Address': '3450 - 3454 W 79TH ST ', 'City': 'CHICAGO', 'State': 'IL', 'Zip': 60652.0, 'Inspection Date': '12/11/2017', 'Inspection Type': 'Canvass', 'Results': 'Fail', 'Violations': '18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO

In [81]:
# Collect [Latitude, Longitude] pairs for mapping from the top hits of every
# aggregation bucket (one bucket per facility name).
list_of_lAT_LONG_pairs = []

for dba_bucket in results["aggregations"]["selected_dbas"]["buckets"]:

    # A bucket without the expected top-hits structure simply contributes nothing.
    top_hits = dba_bucket.get("top_dba_hits", {}).get("hits", {}).get("hits", [])

    for hit in top_hits:
        source = hit.get("_source", {})
        latitude = source.get("Latitude")
        longitude = source.get("Longitude")

        # Skip records whose coordinates are absent or None, and store floats, in line
        # with the scroll loops earlier in the notebook; this check was missing here,
        # so [None, None] pairs could end up in the heatmap data.
        if latitude is None or longitude is None:
            continue

        list_of_lAT_LONG_pairs.append([float(latitude), float(longitude)])

In [83]:
# ----------------------------------------------------------
# Dump top hits from each bucket into a single DataFrame
# Result: A flat table of the top-hit inspection records of every facility,
#         annotated with the bucket's doc_count and the hit's score.
# ----------------------------------------------------------

# One plain dict per inspection record. The DataFrame is built once at the end
# rather than by concatenating many one-row DataFrames, which is slower and
# makes pd.concat raise when there are no records at all.
violator_records = []

# Iterate through each 'bucket' returned from the aggregation (each facility)
for dba_bucket in results["aggregations"]["selected_dbas"]["buckets"]:

    # Empty list when the bucket carries no top hits
    bucket_hits = dba_bucket.get("top_dba_hits", {}).get("hits", {}).get("hits", [])

    for hit in bucket_hits:
        # Only hits that actually carry the document body become rows
        if "_source" not in hit:
            continue

        violator_records.append({
            **hit["_source"],
            # Number of documents that fell into this facility's bucket
            "doc_count": dba_bucket["doc_count"],
            # Relevance score of this hit (not always useful in aggregations)
            "score": hit["_score"],
        })

# Combine all inspection records into a single DataFrame
# (an empty DataFrame when no record was collected)
df_top_frequent_violators = pd.DataFrame(violator_records)

In [85]:
# Display the combined frame of top-hit inspection records.
df_top_frequent_violators

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location,doc_count,score
0,1386187,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,2215472.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,06/08/2015,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)",9,8.070202
1,1515476,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,2215472.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,12/29/2014,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)",9,8.070202
2,1319663,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,2215472.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,07/17/2013,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)",9,8.070202
3,1229852,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,1194190.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,06/28/2012,Canvass Re-Inspection,Fail,"14. PREVIOUS SERIOUS VIOLATION CORRECTED, 7-42...",41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)",9,8.070202
4,1229850,BUSY BUMBLE BEE ACADEMY DAYCARE,BUSY BUMBLE BEE ACADEMY DAYCARE,3793.0,Daycare (2 - 6 Years),Risk 1 (High),6450 S COTTAGE GROVE AVE,CHICAGO,IL,60637.0,06/28/2012,Canvass Re-Inspection,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.777092,-87.606004,"(41.777092394888655, -87.60600377956905)",9,8.070202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,1235065,THE EDSEL ALBERT AMMONS NURSER,THE EDSEL ALBERT AMMONS NURSER,15803.0,Daycare (2 - 6 Years),Risk 1 (High),549 E 76TH ST,CHICAGO,IL,60619.0,09/06/2012,License Re-Inspection,Fail,38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS...,41.756551,-87.610690,"(41.75655095611123, -87.61068980246957)",5,8.070202
147,1158446,THE EDSEL ALBERT AMMONS NURSER,THE EDSEL ALBERT AMMONS NURSER,15803.0,Daycare (2 - 6 Years),Risk 1 (High),549 E 76TH ST,CHICAGO,IL,60619.0,09/04/2012,License Re-Inspection,Fail,"14. PREVIOUS SERIOUS VIOLATION CORRECTED, 7-42...",41.756551,-87.610690,"(41.75655095611123, -87.61068980246957)",5,8.070202
148,1234922,THE EDSEL ALBERT AMMONS NURSER,THE EDSEL ALBERT AMMONS NURSER,15803.0,Daycare (2 - 6 Years),Risk 1 (High),549 E 76TH ST,CHICAGO,IL,60619.0,08/28/2012,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.756551,-87.610690,"(41.75655095611123, -87.61068980246957)",5,8.070202
149,545232,THE EDSEL ALBERT AMMONS NURSER,THE EDSEL ALBERT AMMONS NURSER,15803.0,Daycare (2 - 6 Years),Risk 1 (High),549 E 76TH ST,CHICAGO,IL,60619.0,02/18/2011,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.756551,-87.610690,"(41.75655095611123, -87.61068980246957)",5,8.070202


In [87]:

# Count how many rows (collected inspection records) each DBA Name has in the frame.
# NOTE(review): this counts records per facility, not the individual violations
# listed inside each record's 'Violations' text.

df_top_frequent_violators['DBA Name'].value_counts()


DBA Name
BUSY BUMBLE BEE ACADEMY DAYCARE                          9
BOTTLES TO BOOKS LEARNING CENTER                         8
A CHILD'S WORLD EARLY LEARNING CENTER                    7
AMAZING GRACE DAYCARE CENTER                             7
KIDS R FIRST LEARNING ACADEMY                            6
Little People's Day Care & Kindergarten, Inc.            6
LITTLE KIDS VILLAGE LEARNING                             6
LINCOLN KING DAY CARE                                    6
THE WORLD IS YOUR'S CHILD CARE & LEARNING CENTER INC.    6
JELLYBEAN LEARNING CENTER                                6
EARLY CHILDHOOD EDUCARE CENTER                           6
DISCOVERY LEARNING ACADEMY, INC.                         6
COMMONWEALTH DAYCARE CENTER                              6
FIRMAN COMMUNITY SERVICES                                6
KENYATTA'S DAYCARE                                       5
THE CRYSTAL PALACE EARLY LITERACY ZONE                   5
MONTESSORI ACDY. INFT/TOD. CNT                 

In [89]:

# Let's plot the top frequent violators on a Chicago heatmap.
# NOTE(review): relies on `folium` and `plugins` already being imported when this
# cell runs; confirm an import cell precedes it.

chicago_map = folium.Map(location=[41.90293279, -87.70769386], zoom_start=11)
violator_heat_layer = plugins.HeatMap(list_of_lAT_LONG_pairs, radius=15)
chicago_map.add_child(violator_heat_layer)
chicago_map


In [91]:
# Count every document matched by the scrolled search, one page at a time.
# `page` walks the scroll so that `results` keeps the original search response
# (including its aggregations) for the cells that read it.
# NOTE: the scroll context is released at the end, so re-run the search cell
# before re-running this one.
count = 0
page = results
sid = page['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while page['hits']['hits']:
        count += len(page['hits']['hits'])

        # Scroll to the next page and pick up the id returned with it
        page = es.scroll(scroll_id=sid, scroll='2m')
        sid = page['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Print the final document count for this experiment
print("Total document count for this experiment:", count)

Total document count for this experiment: 819


In [93]:
# Rerun Experiment #2 query (fuzzy match with "Children's~2")
query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                {
                    "query_string": {
                        "query": "Children's~2",
                        "fields": ["Facility Type", "Violations", "DBA Name"]
                    }
                }
            ]
        }
    }
}

# Run the query; scroll='1h' opens a scroll context for paging through the hits
results = es.search(index='food_inspections', body=query, scroll='1h')

# Count every matching document, one scroll page at a time
count = 0
sid = results['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while results['hits']['hits']:
        count += len(results['hits']['hits'])

        results = es.scroll(scroll_id=sid, scroll='2m')
        sid = results['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Show final count
print("Total document count for Experiment 2 (Children's~2):", count)

Total document count for Experiment 2 (Children's~2): 611


In [95]:
# Experiment #1: Wildcard search using "*Children*"
query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                {
                    "query_string": {
                        "query": "*Children*",
                        "fields": ["Facility Type", "Violations", "DBA Name"]
                    }
                }
            ]
        }
    }
}

# Run the query; scroll='1h' opens a scroll context for paging through the hits
results = es.search(index='food_inspections', body=query, scroll='1h')

# Count every matching document, one scroll page at a time
count = 0
sid = results['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while results['hits']['hits']:
        count += len(results['hits']['hits'])

        results = es.scroll(scroll_id=sid, scroll='2m')
        sid = results['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Show final count
print("Total document count for Experiment 1 (*Children*):", count)

Total document count for Experiment 1 (*Children*): 601


In [99]:
# Match counts copied from the scroll-loop output of each experiment cell:
#   exp1_count - wildcard "*Children*"
#   exp2_count - fuzzy "Children's~2"
#   exp3_count - fuzzy "Children~2"
exp1_count, exp2_count, exp3_count = 601, 611, 819

In [101]:
# Report the total matches of each experiment, one line per experiment
for label, matches in (
    ("Experiment 1 - Wildcard '*Children*':", exp1_count),
    ("Experiment 2 - Fuzzy 'Children's~2':", exp2_count),
    ("Experiment 3 - Fuzzy 'Children~2':", exp3_count),
):
    print(label, matches)

# Express each experiment's count as a percentage of another experiment's count
print("\nRelative Differences:")
for template, numerator, denominator in (
    ("Experiment 2 is {:.1f}% of Experiment 1", exp2_count, exp1_count),
    ("Experiment 3 is {:.1f}% of Experiment 1", exp3_count, exp1_count),
    ("Experiment 3 is {:.1f}% of Experiment 2", exp3_count, exp2_count),
):
    print(template.format(numerator / denominator * 100))

Experiment 1 - Wildcard '*Children*': 601
Experiment 2 - Fuzzy 'Children's~2': 611
Experiment 3 - Fuzzy 'Children~2': 819

Relative Differences:
Experiment 2 is 101.7% of Experiment 1
Experiment 3 is 136.3% of Experiment 1
Experiment 3 is 134.0% of Experiment 2


In [105]:
# Requirement 1: Comparative Analysis of Experiments

# We ran three search strategies to identify documents related to "Children"
# in the food_inspections index. Each query searched the fields:
# - Facility Type
# - Violations
# - DBA Name
# All results were filtered by:
# - Results = "Fail"
# - Risk = "Risk 1 (High)"

# Match Count Results:
# Experiment 1 – Wildcard "*Children*": 601 matches
# Experiment 2 – Fuzzy "Children's~2": 611 matches
# Experiment 3 – Fuzzy "Children~2": 819 matches

# Relative Differences:
# - Experiment 2 returned 101.7% as many matches as Experiment 1 (1.7% more)
# - Experiment 3 returned 136.3% as many matches as Experiment 1 (36.3% more)
# - Experiment 3 returned 134.0% as many matches as Experiment 2 (34.0% more)

# Summary:
# - Experiment 1 (Wildcard) is exact but not typo-tolerant.
# - Experiment 2 (Fuzzy on "Children's") slightly expands results by handling spelling variation.
# - Experiment 3 (Fuzzy on "Children") returns the broadest result set and is more resilient
#   to typos, pluralization, or malformed entries.
# - This highlights the trade-off between precision and recall, especially in real-world datasets
#   where text inconsistencies are common.

In [107]:
# Requirement 2 - Experiment #1: Wildcard search using "*Child*"

query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                {
                    "query_string": {
                        "query": "*Child*",
                        "fields": ["Facility Type", "Violations", "DBA Name"]
                    }
                }
            ]
        }
    }
}

# Run the query; scroll='1h' opens a scroll context for paging through the hits
results = es.search(index='food_inspections', body=query, scroll='1h')

# Count every matching document, one scroll page at a time
count = 0
sid = results['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while results['hits']['hits']:
        count += len(results['hits']['hits'])

        results = es.scroll(scroll_id=sid, scroll='2m')
        sid = results['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Print final count
print("Requirement 2 - Experiment 1 (Wildcard '*Child*'):", count)

Requirement 2 - Experiment 1 (Wildcard '*Child*'): 774


In [109]:
# Requirement 2 - Experiment #2: Fuzzy search using "Child's~2"

query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                {
                    "query_string": {
                        "query": "Child's~2",
                        "fields": ["Facility Type", "Violations", "DBA Name"]
                    }
                }
            ]
        }
    }
}

# Run the query; scroll='1h' opens a scroll context for paging through the hits
results = es.search(index='food_inspections', body=query, scroll='1h')

# Count every matching document, one scroll page at a time
count = 0
sid = results['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while results['hits']['hits']:
        count += len(results['hits']['hits'])

        results = es.scroll(scroll_id=sid, scroll='2m')
        sid = results['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Print final count
print("Requirement 2 - Experiment 2 (Fuzzy 'Child's~2'):", count)

Requirement 2 - Experiment 2 (Fuzzy 'Child's~2'): 212


In [111]:
# Requirement 2 - Experiment #3: Fuzzy search using "Child~2"

query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': [
                {'match': {'Results': 'Fail'}},
                {'match_phrase': {'Risk': 'Risk 1 (High)'}},
                {
                    "query_string": {
                        "query": "Child~2",
                        "fields": ["Facility Type", "Violations", "DBA Name"]
                    }
                }
            ]
        }
    }
}

# Run the query; scroll='1h' opens a scroll context for paging through the hits
results = es.search(index='food_inspections', body=query, scroll='1h')

# Count every matching document, one scroll page at a time
count = 0
sid = results['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while results['hits']['hits']:
        count += len(results['hits']['hits'])

        results = es.scroll(scroll_id=sid, scroll='2m')
        sid = results['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

# Print final count
print("Requirement 2 - Experiment 3 (Fuzzy 'Child~2'):", count)

Requirement 2 - Experiment 3 (Fuzzy 'Child~2'): 6555


In [113]:
# Requirement 2 match counts, in experiment order:
#   req2_exp1_count - wildcard "*Child*"
#   req2_exp2_count - fuzzy "Child's~2"
#   req2_exp3_count - fuzzy "Child~2"
req2_exp1_count, req2_exp2_count, req2_exp3_count = 774, 212, 6555

In [115]:
# Report the total matches of each Requirement 2 experiment, one line per experiment
for label, matches in (
    ("Requirement 2 - Experiment 1 (Wildcard '*Child*'):", req2_exp1_count),
    ("Requirement 2 - Experiment 2 (Fuzzy 'Child's~2'):", req2_exp2_count),
    ("Requirement 2 - Experiment 3 (Fuzzy 'Child~2'):", req2_exp3_count),
):
    print(label, matches)

# Express each experiment's count as a percentage of another experiment's count
print("\nRelative Differences:")
for template, numerator, denominator in (
    ("Experiment 2 is {:.1f}% of Experiment 1", req2_exp2_count, req2_exp1_count),
    ("Experiment 3 is {:.1f}% of Experiment 1", req2_exp3_count, req2_exp1_count),
    ("Experiment 3 is {:.1f}% of Experiment 2", req2_exp3_count, req2_exp2_count),
):
    print(template.format(numerator / denominator * 100))

Requirement 2 - Experiment 1 (Wildcard '*Child*'): 774
Requirement 2 - Experiment 2 (Fuzzy 'Child's~2'): 212
Requirement 2 - Experiment 3 (Fuzzy 'Child~2'): 6555

Relative Differences:
Experiment 2 is 27.4% of Experiment 1
Experiment 3 is 846.9% of Experiment 1
Experiment 3 is 3092.0% of Experiment 2


In [None]:
# Requirement 2: Comparative Analysis of Experiments (using "Child")

# We repeated the same three search strategies from Requirement 1,
# this time searching for references to "Child" instead of "Children".
# All queries were executed against the food_inspections index, searching:
# - Facility Type
# - Violations
# - DBA Name
# And filtered by:
# - Results = "Fail"
# - Risk = "Risk 1 (High)"

# Match Count Results:
# Experiment 1 – Wildcard "*Child*": 774 matches
# Experiment 2 – Fuzzy "Child's~2": 212 matches
# Experiment 3 – Fuzzy "Child~2": 6555 matches

# Relative Differences:
# - Experiment 2 returned 27.4% as many matches as Experiment 1 (72.6% fewer)
# - Experiment 3 returned 846.9% as many matches as Experiment 1 (almost 8.5x more)
# - Experiment 3 returned 3092.0% as many matches as Experiment 2 (over 30x more)

# Summary:
# - Compared to Requirement 1 (which used the term "Children"), the results in Requirement 2
#   show much more variability — especially with fuzzy matching.
# - The wildcard "*Child*" returned more results than "*Children*", suggesting that the substring
#   "Child" appears more broadly across documents, including words like "Childcare" or "Childhood".
# - The fuzzy search on "Child~2" returned over 6500 matches — far more than any query in Requirement 1.
#   This dramatic increase is likely due to the short base term "Child" matching a wide range of terms
#   with even small edits (e.g., "Chill", "Build", "Chid", etc.).
# - This highlights a key difference: short base terms with fuzziness can significantly inflate result sets,
#   making it harder to control precision compared to longer, more specific base terms like "Children".
# - In summary, Requirement 2 illustrates how query design must consider base word length and fuzziness sensitivity,
#   especially when working with real-world text data.

In [117]:
# Requirement 3: Table of frequent violators

# Group the inspection records once by facility name; both summaries reuse the grouping
records_by_dba = df_top_frequent_violators.groupby('DBA Name')

# Step 1: number of records with a non-null 'Violations' entry per DBA
violations_count = records_by_dba['Violations'].count().reset_index(name='Total Violations')

# Step 2: number of distinct 'License #' values per DBA
licenses_count = records_by_dba['License #'].nunique().reset_index(name='Licenses Issued')

# Step 3: join the two per-DBA tables on DBA Name
df_summary = pd.merge(violations_count, licenses_count, on='DBA Name')

# Show the first rows of the summary
df_summary.head()

Unnamed: 0,DBA Name,Total Violations,Licenses Issued
0,A CHILD'S WORLD EARLY LEARNING CENTER,7,2
1,ADA S MCKINLEY MAGGIE DRUMMON,5,2
2,AMAZING GRACE DAYCARE CENTER,7,2
3,ANGELS,4,2
4,BOTTLES TO BOOKS LEARNING CENTER,8,2


In [121]:
# Requirement 4: 3 or more DBA Licenses

# Step 1: names of the facilities whose summary row shows at least 3 distinct licenses
has_three_plus_licenses = df_summary['Licenses Issued'] >= 3
frequent_license_dbas = df_summary.loc[has_three_plus_licenses, 'DBA Name']

In [123]:
# Step 2: restrict the inspection records to the facilities selected in Step 1
in_frequent_dbas = df_top_frequent_violators['DBA Name'].isin(frequent_license_dbas)
df_heatmap_data = df_top_frequent_violators.loc[in_frequent_dbas]

In [125]:
# Step 3: Build list of [lat, lon] coordinates for heatmap.
# Vectorised replacement for a row-by-row iterrows() loop: keep only the rows in
# which both coordinates are present, cast them to float, and convert the result
# to a plain list of [latitude, longitude] pairs.
heatmap_coords = (
    df_heatmap_data[['Latitude', 'Longitude']]
    .dropna()
    .astype(float)
    .to_numpy()
    .tolist()
)

In [127]:
import folium
from folium import plugins

# Step 4: build the base map, then attach the heat layer made from heatmap_coords
chicago_map = folium.Map(location=[41.8781, -87.6298], zoom_start=11)
license_heat_layer = plugins.HeatMap(heatmap_coords, radius=15)
chicago_map.add_child(license_heat_layer)

# The bare name as the last expression renders the map in the notebook
chicago_map

In [129]:
# Requirement 5: Mice droppings

# Conditions every matching inspection must satisfy: a failed, high-risk
# inspection whose violations mention the phrase "MICE DROPPINGS", at a
# facility whose type matches 'Daycare' or 'Children'.
# NOTE(review): facility types such as 'CHILDRENS SERVICES FACILITY' appear in
# this dataset; confirm the plain term 'Children' matches them as intended.
required_clauses = [
    {'match': {'Results': 'Fail'}},
    {'match_phrase': {'Risk': 'Risk 1 (High)'}},
    {'query_string': {'query': '"MICE DROPPINGS"', 'fields': ['Violations']}},
    {'query_string': {'query': 'Daycare OR Children', 'fields': ['Facility Type']}},
]

# Reports that explicitly state no droppings were observed are excluded
excluded_clauses = [
    {'match_phrase': {'Violations': 'NO MICE DROPPINGS WERE OBSERVED'}},
]

query = {
    'size': 10000,
    'query': {
        'bool': {
            'must': required_clauses,
            'must_not': excluded_clauses,
        }
    },
}

# Run the search with scroll; the '_scroll_id' in this response is read by the
# next cell to page through the matching documents.
results = es.search(index='food_inspections', body=query, scroll='1h')

In [131]:
# Collect the '_source' of every matching document, one scroll page at a time.
# `page` walks the scroll so that `results` keeps the original search response.
# NOTE: the scroll context is released at the end, so re-run the search cell
# before re-running this one.
hits = []
page = results
sid = page['_scroll_id']

try:
    # An empty page of hits marks the end of the scroll. Driving the loop by the
    # page itself avoids depending on the format of hits.total (int vs. dict).
    while page['hits']['hits']:
        hits.extend(hit['_source'] for hit in page['hits']['hits'])

        # Scroll to the next page and pick up the id returned with it
        page = es.scroll(scroll_id=sid, scroll='2m')
        sid = page['_scroll_id']
finally:
    # Release the server-side scroll context instead of leaving it open until it times out
    es.clear_scroll(scroll_id=sid)

In [133]:
# Build [latitude, longitude] float pairs from the collected documents, keeping
# only those in which both coordinates are present and truthy.
mice_droppings_coords = [
    [float(doc['Latitude']), float(doc['Longitude'])]
    for doc in hits
    if doc.get('Latitude') and doc.get('Longitude')
]

In [135]:
# Base map centered on Chicago, with a heat layer built from the mice-droppings coordinates
mice_map = folium.Map(location=[41.8781, -87.6298], zoom_start=11)
mice_heat_layer = plugins.HeatMap(mice_droppings_coords, radius=15)
mice_map.add_child(mice_heat_layer)

# The bare name as the last expression renders the map in the notebook
mice_map