# Query Notebook

In [None]:
from math import radians, sin, cos, sqrt, atan2

import pandas as pd

In [None]:
# When True, the businesses CSV is read from the `sample/` subdirectory of
# data/02_postprocessing instead of from the directory itself.
SAMPLE = False

In [None]:
# Example query in the format the pipeline expects. It is hard-coded here and
# will later be built from user input.
query = {
    # User identifier (not read by any cell visible in this notebook yet).
    "user": "4ZaqBJqt7laPPs8xfWvr6A",
    # (latitude, longitude) in decimal degrees. Western-hemisphere longitudes
    # are negative; the value was previously +81.6096, which lies in Central
    # Asia and puts every North American business thousands of km outside the
    # search radius. NOTE(review): confirm the intended point is 81.6096 W.
    "location": (41.5074, -81.6096),
    # Business type to match (compared case-insensitively further down).
    "type": "restaurants",
    # Keywords matched against the per-business keyword table.
    "keywords": ["abby", "doctors", "target", "shopping", "fashion", "nails", "bars", "food"],
    # Global filters: maximum distance (compared against the km distance
    # column) and minimum star rating.
    "literals": {"distance": 100, "rating": 1},
    # Number of results to return.
    "k": 5,
}

Program flow

1. Query the database for a list of locations that match the type of location
2. Narrow down the list to make sure they match the literals
3. Match keywords to the location
4. Apply the friend rating filter
5. Return K results

# Literal Processing

Literal processing narrows the set of candidate businesses to those that satisfy the query's "literal" (global) attributes: a maximum distance from the query location (a radius) and a minimum overall star rating.

In [None]:
# Load the cleaned businesses CSV; the `sample/` subdirectory is used when
# SAMPLE is set.
businesses_path = f"data/02_postprocessing/{'sample/' if SAMPLE else ''}businesses.csv"
with open(businesses_path) as businesses_file:
    businesses = pd.read_csv(businesses_file, header=0)

# Quick overview of the columns, dtypes and non-null counts that were loaded.
businesses.info()


In [None]:
def dist(lat1, long1, lat2, long2):
    """Return the great-circle distance in kilometres between two points.

    The distance is computed with the haversine formula on a sphere of
    radius 6371 km.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance between the two points in kilometres. NaN inputs propagate
        to a NaN result.
    """
    earth_radius_km = 6371.0

    # The trigonometric functions below expect radians, not degrees.
    lat1, long1, lat2, long2 = map(radians, (lat1, long1, lat2, long2))

    half_lat_diff = (lat2 - lat1) / 2
    half_long_diff = (long2 - long1) / 2

    # Haversine formula.
    a = sin(half_lat_diff) ** 2 + cos(lat1) * cos(lat2) * sin(half_long_diff) ** 2

    # For (near-)antipodal points rounding can push `a` a hair above 1, which
    # would make sqrt(1 - a) raise ValueError. Clamp only the upper bound with
    # a comparison so that a NaN `a` is left untouched and still yields NaN.
    if a > 1.0:
        a = 1.0

    central_angle = 2 * atan2(sqrt(a), sqrt(1 - a))

    return earth_radius_km * central_angle

In [None]:
# Great-circle distance from the query location to every business.
# dist() already returns kilometres (it uses an Earth radius of 6371 km), so
# no unit conversion is applied afterwards; the previous extra division by 100
# made every distance 100x too small.
query_lat, query_long = query["location"]
businesses["distance"] = businesses.apply(
    lambda row: dist(query_lat, query_long, row["latitude"], row["longitude"]),
    axis=1,
)
businesses.head()

In [None]:
# Flag the businesses whose star rating is at least the query's minimum rating
min_rating = query["literals"]["rating"]
businesses["rating_check"] = businesses["stars"].ge(min_rating)
businesses.head()

In [None]:
# A business satisfies the literals when it passes the rating check AND lies
# within the query's distance limit.
max_distance = query["literals"]["distance"]
businesses["literals"] = businesses["rating_check"] & (businesses["distance"] <= max_distance)

# Preview the businesses that pass, hiding the two intermediate columns.
# drop() keeps the frame's column order, whereas selecting through a set
# difference yields an arbitrary order that can change between runs.
businesses.drop(columns=["distance", "rating_check"]).loc[businesses["literals"]].head()

# Type Matching

In [None]:
# Lower-case both the requested type and the stored business types so the
# equality comparison that follows is case-insensitive.
query["type"] = query["type"].lower()
businesses["type"] = businesses["type"].str.lower()

In [None]:
# Flag the businesses whose type equals the requested type, then preview them
businesses["type_check"] = businesses["type"].eq(query["type"])
businesses[businesses["type_check"]].head()

# Keyword Extraction

In [None]:
# Load the business/keyword table (later cells read its `business_id` and
# `keyword` columns).
# NOTE(review): unlike the businesses file, this path does not depend on
# SAMPLE - confirm whether a sample variant should be loaded here as well.
keywords_path = "data/02_postprocessing/businesses_keywords.csv"
with open(keywords_path) as keywords_file:
    business_keywords_df = pd.read_csv(keywords_file, header=0)

In [None]:
keywords = query["keywords"]

# Keep only the rows whose keyword is one of the query keywords
businesses_filtered = business_keywords_df[business_keywords_df["keyword"].isin(keywords)]

# Count the matched keywords per business and give the count a clear name
businesses_filtered_count = (
    businesses_filtered
    .groupby("business_id")
    .count()
    .reset_index()
    .rename(columns={"keyword": "keywords_matched"})
)

# Min-max normalise the match count to [0, 1]
min_count = businesses_filtered_count["keywords_matched"].min()
max_count = businesses_filtered_count["keywords_matched"].max()
count_range = max_count - min_count
if count_range > 0:
    businesses_filtered_count["norm_keywords_matched"] = (
        (businesses_filtered_count["keywords_matched"] - min_count) / count_range
    )
else:
    # Every matched business has the same count (or there is only one, or
    # none). Dividing by a zero range would produce NaN scores, which would
    # make matched businesses indistinguishable from unmatched ones once the
    # NaNs are replaced by 0. They all share the maximum, so score them 1.0.
    # (With no matches the range is NaN and the frame is empty; the
    # assignment then just creates an empty column.)
    businesses_filtered_count["norm_keywords_matched"] = 1.0

# Best-matching businesses first
businesses_filtered_count = businesses_filtered_count.sort_values(by="norm_keywords_matched", ascending=False)

In [None]:
# Attach the keyword scores to every business (left join keeps businesses
# without any keyword match).
# Columns that the score table would add are dropped first, so re-running this
# cell does not produce `_x` / `_y` suffixed duplicates followed by a KeyError
# on "norm_keywords_matched". On a first run there is nothing to drop.
score_columns = businesses_filtered_count.columns.difference(["business_id"])
businesses = (
    businesses
    .drop(columns=score_columns, errors="ignore")
    .merge(businesses_filtered_count, on="business_id", how="left")
)

# Businesses with no matched keyword get a score of 0
businesses["norm_keywords_matched"] = businesses["norm_keywords_matched"].fillna(0)
businesses.head()

In [None]:
# Only businesses that pass both the literal filter (rating + distance) and
# the type check are eligible; previously those flags were computed but never
# applied, so the ranking ran over every business.
candidates = businesses.loc[businesses["literals"] & businesses["type_check"]]

# Rank the eligible businesses by keyword score and show the top k
(
    candidates
    .sort_values(by="norm_keywords_matched", ascending=False)
    .head(query["k"])
    .loc[:, ["name", "stars", "latitude", "longitude", "distance"]]
)

And these are our top K results