Data Organization & Processing

In [16]:
# Gets necessary imports
import numpy as np
import heapq
import pandas as pd
import matplotlib.pyplot as plt

boba_data_set = pd.read_csv('bayarea_boba_spots.csv') # Reads the boba data set

# Organizes the boba data into their own individual variables
shop_names = boba_data_set['name'] # Gets all shop names
ratings = boba_data_set['rating'] # Gets all ratings
cities = boba_data_set['city'] # Gets all cities
latitudes = boba_data_set['lat'] # Gets the latitudes
longitudes = boba_data_set['long'] # Gets the Longitudes

# Hashamp(or dictionary) is used to pair the names with the scores for easy access, and averages any repeated shop names
# In each pair, the key will represent the store name, and the value will be a 
# numpy array holding two values: First index will be the average ratings,
# and the second will be the total quantity of that specific shop
hashmap = {}
for i in range(len(shop_names)):
    # Checks if the current shop already exist inside the hashmap, and add that to our total value.
    if shop_names[i] in hashmap:
        shop_arr = hashmap[shop_names[i]]
        shop_arr[1] += 1
        shop_arr[0] += float(ratings[i])
    # Initializes a new shop if it is not already present inside the hashmap
    else:
        new_arr = [0,0]
        new_arr[0] = float(ratings[i])
        new_arr[1] = 1
        hashmap[shop_names[i]] = new_arr

# We now average all the ratings if they have more than one shop 
for current in hashmap.keys():
    pair = hashmap[current]
    avg = pair[0] / pair[1]
    hashmap[current][0] = avg


housing_data_set = pd.read_csv('housing.csv') # Reads the housing data set

# Organizes the housing data into their own individual variables
longitudes_house = housing_data_set['longitude'] # Gets the longitudes of the houses
latitudes_house = housing_data_set['latitude'] # Gets the latitidues of the houses
house_values = housing_data_set['median_house_value'] # Gets the median house values

In [20]:
# First we will do an amounts bar graph visualization where the x axis will represent boba shops,
#  and the y axis will represent the rating

# We will graph the top 25 and the lowest 25 rating stores since there are so many.


# Top 25 Scores:

# We will use a priority queue for this portion since its easier to get the top 25 values
prio_queue = []
top_25_stores = np.zeros(25)
for i in range(len(shop_names)):
    new_list = [0,0,0]
    pair = hashmap[shop_names[i]]
    new_list[0] = pair[0] * -1
    new_list[1] = pair[1] * - 1
    new_list[2] = shop_names[i]
    heapq.heappush(prio_queue, new_list)
print(prio_queue)

[[-5.0, -1, 'Bobateani'], [-5.0, -1, 'Honey Bear Smoothie Tea & Dessert'], [-5.0, -1, 'Golden Bakery'], [-5.0, -1, 'Puppy Bobar'], [-5.0, -1, 'Taza Deli & Cafe'], [-4.5, -1, "Antoine's Cookie Shop"], [-5.0, -1, 'Waterfront Cafe'], [-5.0, -1, 'QTeaBar'], [-4.5, -1, '5 Sweets'], [-4.5, -1, 'DAVIDsTEA'], [-4.5, -1, '99% Tea House'], [-4.5, -1, 'Mints & Honey'], [-4.5, -1, 'Aqua Club Dessert & Beverage'], [-4.5, -1, 'Easel'], [-4.5, -1, 'Aung Maylika'], [-4.5, -1, 'Alice Street Bakery Café'], [-4.5, -1, 'Eat On Monday'], [-4.5, -1, 'Banh Mi Ba Le'], [-4.5, -1, 'Chilly & Munch'], [-4.5, -1, 'Sweet Gelato Tea Lounge'], [-4.5, -1, 'Happiness Cafe'], [-4.5, -1, 'Chantal Guillon Macarons & Teas'], [-4.5, -1, 'Keep it'], [-4.5, -1, 'Palm Thai Bistro'], [-4.5, -1, 'OMG Tea'], [-4.5, -1, 'BaoTea Cafe'], [-4.5, -1, 'Okashi Fusion'], [-4.5, -1, 'Steap Tea Bar'], [-4.5, -1, 'Going Green'], [-4.5, -1, 'Fusion Mix Frozen Yogurt'], [-4.5, -1, 'Blue Saigon'], [-4.5, -1, 'Calibear Cyber Cafe'], [-4.5, -1,