In [29]:
import sys
import json
import csv
import os

import pandas as pd
import numpy as np

# import matplotlib as mpl

# useful for printing dict and list objects
import pprint

import time
from datetime import datetime

import psycopg2
from sqlalchemy import create_engine, text as sql_text
import psycopg2.extras
from dotenv import load_dotenv


In [30]:
load_dotenv()

#get user input
db_username = os.environ['db_username']
db_password = os.environ['db_password']
                             
db_eng = create_engine('postgresql+psycopg2://' + db_username + ':' + db_password + '@localhost:5432/airbnb',
                       connect_args={'options': '-csearch_path={}'.format('airbnb')},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

# time to run
n = 1

Successfully created db engine.


In [31]:
def ensure_indexes():
    check_datetime_sql = """
        SELECT 1 FROM pg_indexes 
        WHERE tablename = 'reviews' AND indexname = 'datetime_in_reviews';
    """
    check_neigh_sql = """
        SELECT 1 FROM pg_indexes 
        WHERE tablename = 'listings' AND indexname = 'neigh_in_listings';
    """
    
    with db_eng.begin() as conn:
        dt_result = conn.execute(sql_text(check_datetime_sql)).fetchone()
        neigh_result = conn.execute(sql_text(check_neigh_sql)).fetchone()
    
    dt_status = "datetime_in_reviews exists" if dt_result else "datetime_in_reviews index does not exist.\n"
    neigh_status = "neigh_in_listings exists" if neigh_result else "neigh_in_listings index does not exist.\n"
    
    return dt_status + "\n" + neigh_status

In [32]:
filename = 'test.json'

if os.path.exists(filename):
    try:
        # delete the file
        os.remove(filename)
        print(f"File '{filename}' successfully deleted.")
    except Exception as e:
        print(f"Error deleting file '{filename}': {e}")
else:
    print(f"File '{filename}' not found.")

#write perf data to file
def write_perf_data(new_data):
    
    #if file exists, read old data
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            try:
                old_data = json.load(f)  
            except json.JSONDecodeError:
                old_data = []  
    else:
        old_data = []

    # combine old and new data
    if isinstance(old_data, list):
        old_data.append(new_data)  
    else:
        old_data = [old_data, new_data]

    # write to file
    with open(filename, 'w') as f:
        json.dump(old_data, f, indent=4)

    print("wrote in file successfully")

File 'test.json' successfully deleted.


In [33]:
#write perf data to file

def write_perf_data(location, tag, result, save_path= filename):
    """
    - location: ex: "Bronx"
    - result: dict，stats from the query
    - save_path: JSON file path to save the results
    """
    loc_key = f"update_datetimes_query_{location}"

    if os.path.exists(save_path):
        with open(save_path, "r") as f:
            existing = json.load(f)
    else:
        existing = {}

    if loc_key not in existing:
        existing[loc_key] = {}

    existing[loc_key][tag] = result

    with open(save_path, "w") as f:
        json.dump(existing, f, indent=4)

    print(f"successfully wrote {tag} {location} to {save_path}")
    print("\n")



In [34]:
# update query based on two lists of location name

#collection of location names
    # list of neighbourhoods: "New Springville", "Fort Hamilton", "LongIslandCity", "Bedford-Stuyvesant"
    # list of neighbourhood_groups: "Staten Island", "Bronx", "Queens", "Manhattan"
location = ["New Springville", "Fort Hamilton", "Long Island City", "Bedford-Stuyvesant", "Staten Island", "Bronx", "Queens", "Manhattan"]
neigh = ["New Springville", "Fort Hamilton", "Long Island City", "Bedford-Stuyvesant"]
neigh_groups = ["Staten Island", "Bronx", "Queens", "Manhattan"]


#check if location is in neighbourhood or neighbourhood_group
# if location is in neighbourhood, return False
def check_group(location):
    if location in neigh:
        return False
    elif location in neigh_groups:
        return True
    else:
        raise ValueError(f"Location '{location}' is not in either neighbourhood or neighbourhood_group list.")


# index queries
# add date index 
q_create_datetime_index_in_reviews = '''
CREATE INDEX IF NOT EXISTS datetime_in_reviews
ON reviews(datetime);
'''

#drop date index
q_drop_datetime_index_in_reviews = '''
DROP INDEX IF EXISTS datetime_in_reviews;
'''

#add neighbourhood index
def q_create_neigh_in_listings(group):
    #group: neighbourhood(false) or neighbourhood_group(true)
    #location: neighbourhood or neighbourhood_group name

    if group == False:
        return '''
        CREATE INDEX IF NOT EXISTS neigh_in_listings
        ON listings(neighbourhood_cleansed);
        '''
    elif group == True:
        return '''
        CREATE INDEX IF NOT EXISTS neigh_groups_in_listings
        ON listings(neighbourhood_group_cleansed);
        '''
    
#drop neighbourhood index
def q_drop_neigh_in_listings(group):
    #group: neighbourhood(false) or neighbourhood_group(true)
    #location: neighbourhood or neighbourhood_group name

    if group == False:
        return '''
        DROP INDEX IF EXISTS neigh_in_listings;
        '''
    elif group == True:
        return '''
        DROP INDEX IF EXISTS neigh_groups_in_listings;
        '''


tags = ["__", "__datetime_in_reviews__", "__neigh_in_listings__", "__datetime_in_reviews__neigh_in_listings__"]

In [35]:
#construct query based on neighbourhood or neighbourhood_group, location and year

def update_reviews_query(a, location):

    '''
    a: update on neighbourhood or neighbourhood_group
    location: neighbourhood or neighbourhood_group
    '''

    # construct SQL query
    query = f"""
    UPDATE reviews r
    SET datetime = datetime + interval '5 days'
    FROM listings l
    WHERE l.id = r.listing_id
      AND l.{a}_cleansed = '{location}'
    RETURNING 'done';
    """
    
    return query


# undo update
def undo_update(a, location):
    '''
    a: update on neighbourhood or neighbourhood_group
    location: neighbourhood or neighbourhood_group
    '''

    # construct undo SQL query
    query = f"""
    UPDATE reviews r
    SET datetime = datetime - interval '5 days'
    FROM listings l
    WHERE l.id = r.listing_id
    AND l.{a}_cleansed = '{location}'
    RETURNING 'done';
    """
    
    return query


In [36]:
#combination 1： no index
def no_index(location):
    
    # check if location is in neighbourhood or neighbourhood_group
    group = check_group(location)

    if group == False:
        a = "neighbourhood"
    elif group == True:
        a = "neighbourhood_group"

    # add datetime index if it doesn't exist
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_datetime_index_in_reviews))
        print("datetime Index drop successfully or does not exist.")

    # drop neighbourhood index if it exists
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_neigh_in_listings(group)))
        print("neigh Index drop successfully or does not exist.")

    print("in no index function", ensure_indexes())


    times = []
    for i in range(n):
        # create query
        start = time.time()

        # update query
        with db_eng.begin() as conn:
            conn.execute(sql_text(update_reviews_query(a, location))).fetchall()
        end = time.time()
        times.append(end - start)

        # undo query after time measurement
        with db_eng.begin() as conn:
            conn.execute(sql_text(undo_update(a, location))).fetchall()

    return {
            "avg": round(np.mean(times), 4),
            "min": round(min(times), 4),
            "max": round(max(times), 4),
            "std": round(np.std(times), 4),
            "count": len(times),
            "timestamp": datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    }

In [37]:
#combination 2: date index but no neigh index
def date_index(location):
    
    # check if location is in neighbourhood or neighbourhood_group
    group = check_group(location)

    if group == False:
        a = "neighbourhood"
    elif group == True:
        a = "neighbourhood_group"

    # add datetime index if it doesn't exist
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index_in_reviews))
        print("date Index created successfully.")

    # drop neighbourhood index if it exists
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_neigh_in_listings(group)))
        print("neigh Index drop successfully or does not exist.")

    print("in date index function", ensure_indexes())

    times = []
    for i in range(n):
        # create query
        start = time.time()

        # update query
        with db_eng.begin() as conn:
            conn.execute(sql_text(update_reviews_query(a, location))).fetchall()
        end = time.time()
        times.append(end - start)

        # undo query after time measurement
        with db_eng.begin() as conn:
            conn.execute(sql_text(undo_update(a, location))).fetchall()

    return {
            "avg": round(np.mean(times), 4),
            "min": round(min(times), 4),
            "max": round(max(times), 4),
            "std": round(np.std(times), 4),
            "count": len(times),
            "timestamp": datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    }

In [38]:
#combination 3: neigh index but no date index
def neigh_index(location):
    
    # check if location is in neighbourhood or neighbourhood_group
    group = check_group(location)

    if group == False:
        a = "neighbourhood"
    elif group == True:
        a = "neighbourhood_group"

    # add datetime index if it doesn't exist
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_datetime_index_in_reviews))
        print("datetime Index drop successfully or does not exist.")

    # drop neighbourhood index if it exists
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_neigh_in_listings(group)))
        print("neigh Index created successfully.")

    print("in neigh index function", ensure_indexes())
    

    times = []
    for i in range(n):
        # create query
        start = time.time()

        # update query
        with db_eng.begin() as conn:
            conn.execute(sql_text(update_reviews_query(a, location))).fetchall()
        end = time.time()
        times.append(end - start)

        # undo query after time measurement
        with db_eng.begin() as conn:
            conn.execute(sql_text(undo_update(a, location))).fetchall()

    return {
            "avg": round(np.mean(times), 4),
            "min": round(min(times), 4),
            "max": round(max(times), 4),
            "std": round(np.std(times), 4),
            "count": len(times),
            "timestamp": datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    }

In [39]:
# combination 4: date index and neigh index

def date_neigh_index(location):
    
    # check if location is in neighbourhood or neighbourhood_group
    group = check_group(location)

    if group == False:
        a = "neighbourhood"
    elif group == True:
        a = "neighbourhood_group"

    # add index if it doesn't exist
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index_in_reviews))
        print("date Index created successfully.")

    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_neigh_in_listings(group)))
        print("neigh Index created successfully.")

    print("in date neigh index function", ensure_indexes())


    times = []
    for i in range(n):
        # create query
        start = time.time()

        # update query
        with db_eng.begin() as conn:
            conn.execute(sql_text(update_reviews_query(a, location))).fetchall()
        end = time.time()
        times.append(end - start)

        # undo query after time measurement
        with db_eng.begin() as conn:
            conn.execute(sql_text(undo_update(a, location))).fetchall()

    return {
            "avg": round(np.mean(times), 4),
            "min": round(min(times), 4),
            "max": round(max(times), 4),
            "std": round(np.std(times), 4),
            "count": len(times),
            "timestamp": datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    }

In [40]:
# main function
# loop through all locations

for i in location:
    # write combination 1
    write_perf_data(i, tags[0], no_index(i))

    # write combination 2
    write_perf_data(i, tags[1], date_index(i))

    # write combination 3
    write_perf_data(i, tags[2], neigh_index(i))

    # write combination 4
    write_perf_data(i, tags[3], date_neigh_index(i))

print("all data written successfully")

datetime Index drop successfully or does not exist.
neigh Index drop successfully or does not exist.
in no index function datetime_in_reviews index does not exist.

neigh_in_listings index does not exist.

successfully wrote __ New Springville to test.json


date Index created successfully.
neigh Index drop successfully or does not exist.
in date index function datetime_in_reviews exists
neigh_in_listings index does not exist.

successfully wrote __datetime_in_reviews__ New Springville to test.json


datetime Index drop successfully or does not exist.
neigh Index created successfully.
in neigh index function datetime_in_reviews index does not exist.

neigh_in_listings exists
successfully wrote __neigh_in_listings__ New Springville to test.json


date Index created successfully.
neigh Index created successfully.
in date neigh index function datetime_in_reviews exists
neigh_in_listings exists
successfully wrote __datetime_in_reviews__neigh_in_listings__ New Springville to test.json


dat

KeyboardInterrupt: 