In [1]:
# These are boilerplate imports that seem useful.
# It would be cleaner to delete or comment out the ones that aren't used in this notebook.

import csv
import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime
import itertools
import json
import math
import os
import pprint
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from dotenv import load_dotenv

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL

# the following is deprecated, it seems, so using the sqlalchemy
# from pyscopg2 import sqlio

# the file benchmarking/util.py holds utilities for the benchmarking exercise
# This notebook imports util and calls it (e.g. util.fetch_perf_data), so
#    benchmarking/util.py must exist before the notebook is run
sys.path.append('benchmarking/')
import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

In [2]:
# NOTE(review): the env file is read from inside a `.ipynb_checkpoints` directory —
#   confirm this is the intended location for variables.env.
dotenv_path = 'benchmarking/.ipynb_checkpoints/variables.env'
# override=True: values from the file replace variables of the same name that are
# already set in the process environment.
load_dotenv(dotenv_path, override=True)

# Connection settings used when creating the database engine.
schema = os.getenv("SCHEMA")
port = os.getenv("PORT")
host = os.getenv("HOST")
database = os.getenv("DATABASE")
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")

# Fail fast with a clear message: os.getenv returns None for an unset variable, and a
# None would otherwise be interpolated into the connection settings as the text "None".
missing_settings = [
    name
    for name, value in {
        "SCHEMA": schema,
        "PORT": port,
        "HOST": host,
        "DATABASE": database,
        "USERNAME": username,
        "PASSWORD": password,
    }.items()
    if value is None
]
if missing_settings:
    raise RuntimeError(
        f"Missing database settings {missing_settings}; "
        f"check that {dotenv_path!r} exists and defines them."
    )

In [3]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/
# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# Build the URL from its parts instead of interpolating them into a string, so that a
# username or password containing URL-special characters (e.g. '@', ':', '/') is
# escaped correctly rather than corrupting the connection string.
db_url = URL.create(
    drivername='postgresql+psycopg2',
    username=username,
    password=password,
    host=host,
    port=int(port) if port else None,  # PORT is read from the environment as text
    database=database,
)

# connect_args sets search_path to the schema named by the SCHEMA setting
#      (e.g. 'new_york_city' in the airbnb database)
# isolation_level SERIALIZABLE makes concurrent transactions behave as if they ran
#      one after another, which is good for the benchmarking we will be doing
db_eng = create_engine(db_url,
                       connect_args={'options': f'-csearch_path={schema}'},
                       isolation_level = 'SERIALIZABLE')
#    , echo=True)
#    , echo_pool="debug")
# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

print("Successfully created db engine.")

Successfully created db engine.


# Q3

In [4]:
def rename_keys(d):
    """Return a new dict keyed by the text after the last underscore of each key.

    Keys without an underscore are kept as they are. When several keys end in
    the same segment, the value of the one iterated last is kept.
    """
    renamed = {}
    for full_key, value in d.items():
        # rpartition yields the whole string as its last element when '_' is absent.
        renamed[full_key.rpartition('_')[2]] = value
    return renamed

def extract_value(data, val):
    """Pull a single metric out of a two-level nested mapping.

    ``data`` maps an outer key to a mapping of ``inner key -> metrics``.
    The result contains every outer key of ``data``; under each one it holds
    ``inner key -> metrics[val]`` for just those inner entries whose metrics
    contain ``val`` (so an outer key can map to an empty dict).
    """
    return {
        outer_key: {
            inner_key: stats[val]
            for inner_key, stats in inner.items()
            if val in stats
        }
        for outer_key, inner in data.items()
    }

In [5]:
# Load the recorded results of the datetime-update benchmark through the project helper.
# NOTE(review): later cells index the result as [query name][index configuration]['avg'],
#   so it is presumably a nested dict of that shape — confirm against benchmarking/util.py.
data_update_datetimes = util.fetch_perf_data('update_datetimes_query.json')

In [6]:
# Row order for the table of averages.
axis_order = [
    'New Springville', 'Fort Hamilton', 'Long Island City', 'Bedford-Stuyvesant',
    'Staten Island', 'Bronx', 'Queens', 'Manhattan',
]

# Shorten each query name to the text after its last underscore, then keep only the
# 'avg' metric for every index configuration.
renamed_data_update_datetimes = rename_keys(data_update_datetimes)
update_datetimes_avg = extract_value(renamed_data_update_datetimes, 'avg')

# A dict of dicts becomes one column per outer key, so transpose to get one row per
# outer key; then apply the row order and give the configuration columns readable names.
update_datetimes_avg_df = (
    pd.DataFrame(update_datetimes_avg)
    .T
    .reindex(axis_order)
    .rename(columns={
        "__": "no_index",
        "__datetime_in_reviews__": "datetime_index",
        "__neigh_in_listings__": "neigh_index",
        "__datetime_in_reviews__neigh_in_listings__": "datetime_and_neigh_index",
    })
)

In [7]:
# Average with both indexes minus average with no index, per row;
# a positive value means the update was slower with the indexes in place.
update_datetimes_avg_df['Difference'] = (
    update_datetimes_avg_df['datetime_and_neigh_index']
    .sub(update_datetimes_avg_df['no_index'])
)

In [8]:
# Show the per-row differences in ascending order (smallest first).
update_datetimes_avg_df['Difference'].sort_values()

New Springville       -0.0078
Fort Hamilton          0.0258
Staten Island          0.2422
Long Island City       0.2935
Bronx                  0.8004
Bedford-Stuyvesant     2.6757
Queens                11.0634
Manhattan             13.8692
Name: Difference, dtype: float64

In [9]:
# Count reviews per location by joining reviews to listings on l.id = r.listing_id.
# The first branch groups by listings.neighbourhood for four named neighbourhoods; the
# second groups by listings.neighbourhood_group for four named groups. Both expose the
# name as `loc`; UNION combines the two sets and the result is ordered by count, ascending.
query = """select l.neighbourhood as loc, count(*)
from reviews r, listings l
where l.id = r.listing_id
 and l.neighbourhood in ('New Springville', 'Fort Hamilton', 'Long Island City', 'Bedford-Stuyvesant')
group by loc
union
select l.neighbourhood_group as loc, count(*)
from reviews r, listings l
where l.id = r.listing_id
 and l.neighbourhood_group in ('Staten Island', 'Bronx', 'Queens', 'Manhattan')
group by loc
order by count;"""

# Run the query on a connection that is closed again when the block exits, and
# read the result set into a DataFrame.
with db_eng.connect() as conn:
    df = pd.read_sql_query(sql_text(query), conn)

In [23]:
# Use the location name as the index. set_index() moves the 'loc' column into the
# index in one step. The guard keeps the cell safe to re-run: once 'loc' is already
# the index there is no such column left to move.
if 'loc' in df.columns:
    df = df.set_index('loc')
df

Unnamed: 0,loc,count,Difference
0,New Springville,104,
1,Fort Hamilton,1000,
2,Long Island City,10859,
3,Staten Island,13726,
4,Bronx,35296,
5,Bedford-Stuyvesant,99705,
6,Queens,173392,
7,Manhattan,341287,


# Q4

In [41]:
# Results for the 2019 listings-join-reviews query, keyed by index configuration.
data_listings = util.fetch_perf_data('listings_join_reviews.json')['q_listings_join_reviews_2019']

# '__' is the run with no index; '__datetime_in_reviews__' is the run with the
# datetime index on reviews.
data_listings_no_index, data_listings_index = (
    data_listings['__'],
    data_listings['__datetime_in_reviews__'],
)

# Average recorded for each of the two configurations.
data_listings_no_index_avg, data_listings_index_avg = (
    data_listings_no_index['avg'],
    data_listings_index['avg'],
)

In [42]:
# Display the two averages side by side: (no index, with datetime index).
data_listings_no_index_avg, data_listings_index_avg

(1.1242, 0.2482)

In [43]:
# time saved by using index for 1 query:
#   average without the index minus average with the datetime index
#   (positive when the indexed run is faster)
query_time_saved = data_listings_no_index_avg - data_listings_index_avg
print(f"Query: time saved by using index: {query_time_saved:.3f} seconds")

Query: time saved by using index: 0.876 seconds


In [44]:
# time saved by using index for 1000 queries
query_time_saved_1000 = query_time_saved * 1000
# convert to minutes and seconds
# Round to whole seconds *before* splitting, so a remainder of 59.5 s or more carries
# into the minutes instead of being printed as "60 seconds".
minutes, seconds = divmod(round(query_time_saved_1000), 60)
print(f"Query: time saved by using index for 1000 queries: {minutes:.0f} minutes {seconds:.0f} seconds")

Query: time saved by using index for 1000 queries: 14 minutes 36 seconds


In [45]:
# Results for the Manhattan datetime-update query, keyed by index configuration.
data_update_manhattan = data_update_datetimes['update_datetimes_query_Manhattan']

# '__' is the run with no index; '__datetime_in_reviews__' is the run with the
# datetime index on reviews.
data_update_no_index, data_update_index = (
    data_update_manhattan['__'],
    data_update_manhattan['__datetime_in_reviews__'],
)

# Average recorded for each of the two configurations.
data_update_no_index_avg, data_update_index_avg = (
    data_update_no_index['avg'],
    data_update_index['avg'],
)

In [46]:
# Display the two averages side by side: (no index, with datetime index).
data_update_no_index_avg, data_update_index_avg

(27.9098, 34.0193)

In [47]:
# time lost by using index for 1 update:
#   average with the datetime index minus average without it
#   (positive when the indexed run is slower)
update_time_lost = data_update_index_avg - data_update_no_index_avg
print(f"Update: time lost by using index: {update_time_lost:.4f} seconds")

Update: time lost by using index: 6.1095 seconds


In [48]:
# time lost by using index for 1000 updates
update_time_lost_1000 = update_time_lost * 1000
# convert seconds to hours : minutes : seconds
# Round to the printed precision (0.1 s) *before* splitting, so a remainder of
# 59.95 s or more carries into the minutes/hours instead of printing "60.0 seconds".
rounded_total_seconds = round(update_time_lost_1000, 1)
hours, remainder_seconds = divmod(rounded_total_seconds, 3600)
minutes, seconds = divmod(remainder_seconds, 60)
print(f"Update: time lost by using index for 1000 updates: {hours:.0f} hours, {minutes:.0f} minutes, {seconds:.1f} seconds")

Update: time lost by using index for 1000 updates: 1 hours, 41 minutes, 49.5 seconds


In [49]:
# x solves  x * query_time_saved = (1 - x) * update_time_lost,
# i.e. x = update_time_lost / (query_time_saved + update_time_lost).
# NOTE(review): presumably x is the fraction of operations that must be queries
#   (the rest being updates) for the index to break even — confirm against the question text.
x = update_time_lost / (query_time_saved + update_time_lost)
print(f'x = {x:.4f}')

x = 0.8746
