# Query Notebook

In [56]:
import json
from math import radians, sin, cos, sqrt, atan2

In [57]:
# Presumably toggles between the sample and the full dataset.
# NOTE(review): no visible cell reads SAMPLE; the business-loading cell
# hard-codes the "sample" directory in its path. Confirm the intended use.
SAMPLE = True

In [60]:
# Template showing how queries are formatted. They will later be created with
# user input.
query = {
    "user": "4ZaqBJqt7laPPs8xfWvr6A",
    "types": ["restaurants"], # TODO I think this should be just one value out a list of options to make our work easier -Joshua
    "keywords": ["abby", "doctors", "target", "shopping", "fashion", "nails", "bars", "food"],
    #"literals": [("distance", "less than", 2, "latitude", "longitude"), ("rating", "greater than", 3)]
    # Every literal is a tuple of placeholder field names. The trailing comma
    # makes the rating literal a real 1-tuple; without it the parentheses are
    # only grouping and the entry would be a bare string.
    "literals": [("latitude", "longitude", "distance number"), ("rating number",)],
    "k": 5
}

Program flow

1. Query the database for a list of locations that match the type of location
2. Narrow down the list to make sure they match the literals
3. Match keywords to the location
4. Apply the friend rating filter
5. Return K results

# Literal Processing

Literal processing builds a new collection that narrows down the number of candidate businesses, keeping only those that satisfy the query's 'literal' (global) attributes: distance within a given radius and a minimum overall rating.

In [16]:
# Load the cleaned business records. The file is read as JSON Lines: one JSON
# object per line.
# NOTE(review): the path is hard-coded to the "sample" directory and does not
# consult the SAMPLE flag defined above; confirm whether that is intended.

# JSON text is UTF-8, so decode explicitly rather than relying on the platform
# default encoding.
with open("data/01_cleaned/sample/businesses.json", encoding="utf-8") as f:
    # Skip blank lines so a stray empty line does not raise JSONDecodeError.
    businesses = [json.loads(line) for line in f if line.strip()]


FileNotFoundError: [Errno 2] No such file or directory: 'data/01_cleaned/sample/businesses.json'

# DO NOT RUN FILE BELOW

In [None]:

# Collection of businesses that satisfy the distance literal. It must be a
# list: the code in this cell calls .append() on it, which a dict lacks.
location_literal = []
# NOTE(review): this binds the whole literals list, not a latitude value; it is
# still a placeholder, see the TODO.
latitude = query["literals"]# TODO how to piece together the actual query as a value to enter

# calculating the great-circle distance between two points of lat & long
# units = miles
def dist(lat1, long1, lat2, long2):
    """Return the haversine distance in miles between two coordinates.

    Args:
        lat1, long1: latitude and longitude of the first point, in degrees.
        lat2, long2: latitude and longitude of the second point, in degrees.

    Returns:
        The great-circle distance as a float, using an Earth radius of
        3958.76 miles.
    """
    earthRadius = 3958.76

    # Convert the coordinates from degrees to radians. map() consumes one
    # iterable per function argument, so the four values are packed into a
    # single tuple instead of being passed as four separate arguments.
    lat1, long1, lat2, long2 = map(radians, (lat1, long1, lat2, long2))

    # differences between the respective latitudes and longitudes
    latDist = lat2 - lat1
    longDist = long2 - long1

    # haversine formula
    a = sin(latDist / 2)**2 + cos(lat1) * cos(lat2) * sin(longDist / 2)**2
    # Floating-point drift can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return earthRadius * c

# returns all of the businesses that are less than or equal to the radius of the given latitude and longitude
# essentially applies the dist function to every business
def returnLocations(business, businessLat, businessLong, radius):
    """Return the reviewed businesses within `radius` miles of a point.

    Iterates over the module-level `businesses` list and keeps every entry
    whose "review_count" is positive and whose distance from the given point
    is at most `radius`.

    Args:
        business: unused; kept so the existing signature is unchanged.
        businessLat: latitude in degrees of the centre of the search.
        businessLong: longitude in degrees of the centre of the search.
            NOTE(review): the parameters are read as the search centre, per
            the comment above this function; confirm against callers.
        radius: maximum distance in miles (inclusive).

    Returns:
        A new list of the matching business dicts, in their original order.
    """
    nearby = []
    # A distinct loop variable avoids shadowing the `business` parameter, and
    # a distinct result name avoids shadowing any distance helper.
    for candidate in businesses:
        if candidate["review_count"] > 0:
            milesAway = dist(businessLat, businessLong, candidate["latitude"], candidate["longitude"])
            if milesAway <= radius:
                nearby.append(candidate)

    return nearby

# YOU CAN RUN THIS ONE LOL

In [None]:
# Result collections filled by the functions in this cell. They are lists
# because those functions call .append() on them, which a dict does not have.
location_literal = []
rating_literal = []

# The query distance is in miles
def distance(userLat, userLong, queryDist):
   """Collect the businesses within `queryDist` miles of the user.

   Computes the haversine distance from the user's position to every entry
   of the module-level `businesses` list and keeps the ones in range.

   Args:
      userLat: user's latitude in degrees.
      userLong: user's longitude in degrees.
      queryDist: search radius in miles; zero is rejected with a message.

   Returns:
      The list of matching business dicts. The module-level
      `location_literal` is rebound to the same list so that later steps
      reading that global see the current result. An empty list is returned
      when `queryDist` is zero.
   """
   global location_literal

   # Earth radius in miles, so the result matches the unit of queryDist.
   earthRadius = 3958.76

   if queryDist == 0:
      print("Your distance cannot be zero")
      return []

   # Convert the user's position once, outside the loop, and keep the degree
   # values untouched.
   userLatRad = radians(userLat)
   userLongRad = radians(userLong)

   matches = []
   for business in businesses:
      businessLat = radians(business["latitude"])
      businessLong = radians(business["longitude"])
      latDist = businessLat - userLatRad
      longDist = businessLong - userLongRad

      # The haversine terms must be computed before the distance is compared.
      a = sin(latDist / 2)**2 + cos(userLatRad) * cos(businessLat) * sin(longDist / 2)**2
      # Guard against floating-point drift just outside [0, 1].
      a = min(1.0, max(0.0, a))
      c = 2 * atan2(sqrt(a), sqrt(1 - a))
      maxDist = earthRadius * c

      # Logical `and`, not bitwise `&`, which binds tighter than comparisons.
      # NOTE(review): a business at exactly zero distance is excluded, as in
      # the original condition; confirm that this is intended.
      if maxDist <= queryDist and maxDist != 0:
         matches.append(business)

   if not matches:
      # Report once, after the whole list has been checked. query["types"]
      # may be a list of types or a single string.
      types = query["types"]
      label = types if isinstance(types, str) else ", ".join(types)
      print("There are no " + label + " in your selected radius. Please increase your radius or just leave Ohio")

   location_literal = matches
   return location_literal

def returnRated(lowestRating):
   """Filter the distance-matched businesses by minimum star rating.

   Reads the module-level `location_literal` collection and keeps each
   business that has at least one review and whose "stars" value is at
   least `lowestRating`.

   Args:
      lowestRating: minimum acceptable star rating (inclusive).

   Returns:
      A list of the qualifying business dicts. The module-level
      `rating_literal` is rebound to the same list, so repeated calls
      replace the previous result instead of accumulating duplicates.
   """
   global rating_literal

   rating_literal = [
      business
      for business in location_literal
      if business["review_count"] > 0 and business["stars"] >= lowestRating
   ]
   return rating_literal

In [None]:
# Empty placeholder mapping; no visible cell populates it yet.
matched_literals = dict()



# Keyword Extraction

In [61]:
import pandas as pd

In [62]:
# Keyword table from businesses_keywords.csv; the first row is the header.
with open("data/02_postprocessing/businesses_keywords.csv") as keywords_file:
    business_keywords_df = pd.read_csv(keywords_file, header=0)

# Business table from businesses.csv; the first row is the header.
with open("data/02_postprocessing/businesses.csv") as businesses_file:
    business_df = pd.read_csv(businesses_file, header=0)

# Bare last expression so the notebook renders the keyword table.
business_keywords_df


Unnamed: 0,business_id,keyword
0,Pns2l4eNsfO8kk83dixA6A,abby
1,Pns2l4eNsfO8kk83dixA6A,rappoport
2,Pns2l4eNsfO8kk83dixA6A,lac
3,Pns2l4eNsfO8kk83dixA6A,cmq
4,Pns2l4eNsfO8kk83dixA6A,doctors
...,...,...
1209363,zznZqH9CiAznbkV6fXyHWA,sandwich
1209364,zznZqH9CiAznbkV6fXyHWA,try
1209365,zznZqH9CiAznbkV6fXyHWA,definitely
1209366,zznZqH9CiAznbkV6fXyHWA,popup


In [66]:
# Filter businesses based on keywords and count the number of keywords matched
def filter_businesses(keywords, keywords_df=None):
    """Rank businesses by how many of the given keywords they match.

    Args:
        keywords: list-like of keyword strings to look for.
        keywords_df: DataFrame with at least "business_id" and "keyword"
            columns. Defaults to the module-level `business_keywords_df`.

    Returns:
        A DataFrame with one row per matching business, sorted by
        "norm_keywords_matched" in descending order. "keywords_matched" is
        the number of matching rows for that business and
        "norm_keywords_matched" is that count min-max scaled to [0, 1].
        Any other column of the input is also reduced to a per-business
        count by the groupby. When every business has the same count
        (including a single match) the scaled value is 1.0 rather than
        the NaN that dividing by a zero range would produce.
    """
    if keywords_df is None:
        keywords_df = business_keywords_df

    # Keep only the rows whose keyword is one of the requested keywords
    businesses_filtered = keywords_df[keywords_df["keyword"].isin(keywords)]

    # Count the matching rows per business and name the count column
    businesses_filtered_count = (
        businesses_filtered.groupby("business_id")
        .count()
        .reset_index()
        .rename(columns={"keyword": "keywords_matched"})
    )

    # Normalize the count of keywords matched
    min_count = businesses_filtered_count["keywords_matched"].min()
    max_count = businesses_filtered_count["keywords_matched"].max()
    count_range = max_count - min_count
    if count_range > 0:
        businesses_filtered_count["norm_keywords_matched"] = (
            (businesses_filtered_count["keywords_matched"] - min_count) / count_range
        )
    else:
        # Zero range (all counts equal) or NaN range (no matches at all):
        # every remaining business is an equally good match.
        businesses_filtered_count["norm_keywords_matched"] = 1.0

    # Sort the dataframe based on the normalized count of keywords matched
    return businesses_filtered_count.sort_values(by="norm_keywords_matched", ascending=False)

# Example usage: rank businesses by how many of the sample query's keywords
# they match; `result` is the sorted DataFrame returned by filter_businesses.
result = filter_businesses(query["keywords"])
