In [71]:
import csv
import json
import os
# useful for printing dict and list objects
import pprint
import sys
import time
from datetime import datetime
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
from dotenv import load_dotenv
from sqlalchemy import create_engine, text as sql_text

# import matplotlib as mpl



In [72]:
# Populate os.environ via python-dotenv before reading the credentials.
load_dotenv()

# Credentials are read from environment variables (not typed in by the user);
# a missing variable raises KeyError here rather than failing later at connect time.
db_username = os.environ['db_username']
db_password = os.environ['db_password']

# Percent-encode both parts: characters such as '@', ':' or '/' in a password
# would otherwise be parsed as URL delimiters and corrupt the connection URL.
db_url = (
    'postgresql+psycopg2://'
    + quote(db_username, safe='') + ':' + quote(db_password, safe='')
    + '@localhost:5432/airbnb'
)

# search_path is set so unqualified table names resolve inside the airbnb schema.
db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('airbnb')},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")


Successfully created db engine.


In [None]:
filename = 'listings_join_reviews.json'

# Remove any results file left by a previous run so this run's measurements
# are not appended to stale data.
if not os.path.exists(filename):
    print(f"File '{filename}' not found.")
else:
    try:
        os.remove(filename)
        print(f"File '{filename}' successfully deleted.")
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(f"Error deleting file '{filename}': {e}")

#write perf data to file
def write_perf_data(new_data):
    """Append one result record to the JSON file named by the global `filename`.

    The file is read back in full, `new_data` is added as the last element of
    the stored list, and the whole list is rewritten with 4-space indentation.

    - Missing file or undecodable JSON: start from an empty list.
    - Existing content that is not a list: it becomes the first element of a
      new two-element list.
    """
    existing = []
    if os.path.exists(filename):
        with open(filename, 'r') as src:
            try:
                existing = json.load(src)
            except json.JSONDecodeError:
                # Unreadable content is discarded rather than aborting the run.
                existing = []

    # Normalise to a list whose last element is the new record.
    if not isinstance(existing, list):
        records = [existing, new_data]
    else:
        records = existing
        records.append(new_data)

    with open(filename, 'w') as dst:
        json.dump(records, dst, indent=4)

    print("wrote in file successfully")


File 'Step3a.json' not found.


In [74]:
# function to get the query for the year
def yyyy_query(year):
    """Build the listings/reviews join query restricted to one calendar year.

    Parameters
    ----------
    year : int or str
        Calendar year, e.g. 2015 or "2015".

    Returns
    -------
    str
        SQL text selecting every listings/reviews pair (joined on
        l.id = r.listing_id) whose r.datetime falls inside `year`,
        ordered by l.id.

    Raises
    ------
    ValueError, TypeError
        If `year` cannot be converted to an integer.
    """
    # int() validates the value that is spliced into the SQL text and lets us
    # compute the exclusive upper bound.
    year_int = int(year)
    start_date = f"{year_int}-01-01"
    next_year_start = f"{year_int + 1}-01-01"

    # Half-open range [Jan 1, next Jan 1). An inclusive "<= YYYY-12-31" bound
    # is compared against midnight if r.datetime is a timestamp, which would
    # silently drop reviews made later on Dec 31; for a DATE column the two
    # forms are equivalent.
    query = f"""
    SELECT * 
    FROM listings l, reviews r 
    WHERE l.id = r.listing_id
      AND r.datetime >= '{start_date}'
      AND r.datetime < '{next_year_start}'
    ORDER BY l.id;
    """

    return query


# Number of timed executions of the query for each index combination
# (the benchmark functions loop over range(n)).
n=50

# Lists every index pg_indexes reports for the reviews and listings tables.
# Each benchmark function runs and prints this after setting up its indexes
# so the active index configuration is visible in the output.
q_show_indexes = '''
select *
from pg_indexes
WHERE tablename IN ('reviews', 'listings');
'''

### combination 1
### no indexes

In [75]:
# Drop the datetime_in_reviews index (the index on reviews.datetime);
# IF EXISTS makes this a no-op when the index is absent.
q_drop_datetime_index_in_reviews = '''
DROP INDEX IF EXISTS datetime_in_reviews;
'''

# Drop the id_in_listings index (the index on listings.id);
# IF EXISTS makes this a no-op when the index is absent.
q_drop_id_in_listings = '''
DROP INDEX IF EXISTS id_in_listings;
'''

In [76]:
def no_index(q, year):
    """Benchmark query `q` with neither benchmark index present.

    Drops datetime_in_reviews and id_in_listings if they exist, prints the
    indexes that remain on reviews/listings, executes `q` `n` times (`n` is
    the module-level run count) and passes the timing summary to
    write_perf_data under the '__' (no index) label.

    Parameters
    ----------
    q : str
        SQL text to time; all rows are fetched on every run.
    year : int
        Year the query targets; used in the result key and progress message.
    """
    # delete indexes if they exist
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_datetime_index_in_reviews))
        conn.execute(sql_text(q_drop_id_in_listings))
        result_reviews = conn.execute(sql_text(q_show_indexes))
        print()
        print('The set of indexes on reviews and listings is: ')
        print(result_reviews.all())

    # Time n executions. perf_counter() is a monotonic, high-resolution clock;
    # time.time() follows the wall clock and can jump if the system clock is
    # adjusted, which would corrupt individual samples.
    times = []
    for _ in range(n):
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(q)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result1 = {
        f'listings_join_reviews_{year}': {
            '__': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                # np.std is the population standard deviation (ddof=0)
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result1)

    print(f"Finshed the first combination, no index for year {year}.")

### combination 2
### datetime_in_reviews index exists

In [77]:
# Create the datetime_in_reviews index on reviews(datetime);
# IF NOT EXISTS makes this a no-op when the index is already present.
q_create_datetime_index_in_reviews = '''
CREATE INDEX IF NOT EXISTS datetime_in_reviews
ON reviews(datetime);
'''

# Drop the id_in_listings index if it exists, so only the datetime index is
# active for this combination. (Same SQL and name as the earlier definition
# of q_drop_id_in_listings; this rebinds it to an identical value.)
q_drop_id_in_listings = '''
DROP INDEX IF EXISTS id_in_listings;
'''

In [None]:
def index_datetime_in_reviews(q, year):
    """Benchmark query `q` with only the datetime_in_reviews index present.

    Creates datetime_in_reviews, drops id_in_listings, prints the resulting
    indexes on reviews/listings, executes `q` `n` times (`n` is the
    module-level run count) and passes the timing summary to write_perf_data.

    The summary is nested under 'listings_join_reviews_{year}', the same
    top-level key no_index uses, so each record in the output file states
    which year it belongs to instead of relying on its position in the list.

    Parameters
    ----------
    q : str
        SQL text to time; all rows are fetched on every run.
    year : int
        Year the query targets; used in the result key and progress message.
    """
    #execute the queries to create the index and drop the id_in_listings index and check
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index_in_reviews))
        conn.execute(sql_text(q_drop_id_in_listings))
        result_reviews = conn.execute(sql_text(q_show_indexes))
        print()
        print('The set of indexes on reviews and listings is: ')
        print(result_reviews.all())

    # Time n executions. perf_counter() is a monotonic, high-resolution clock;
    # time.time() follows the wall clock and can jump if the system clock is
    # adjusted, which would corrupt individual samples.
    times = []
    for _ in range(n):
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(q)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result2 = {
        f'listings_join_reviews_{year}': {
            '__datetime_in_reviews__': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                # np.std is the population standard deviation (ddof=0)
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result2)

    print(f"Finshed the second combination, with index datetime_in_reviews for year {year}.")

### combination 3
### id_in_listings exists

In [79]:
# Create the id_in_listings index on listings(id);
# IF NOT EXISTS makes this a no-op when the index is already present.
q_create_id_in_listings = '''
CREATE INDEX IF NOT EXISTS id_in_listings
ON listings(id);
'''

# Drop the datetime_in_reviews index if it exists, so only the listings.id
# index is active for this combination. (Same SQL as
# q_drop_datetime_index_in_reviews, bound to a different name.)
q_drop_datetime_in_reviews = '''
DROP INDEX IF EXISTS datetime_in_reviews;
'''


In [None]:
def index_id_in_listings(q, year):
    """Benchmark query `q` with only the id_in_listings index present.

    Creates id_in_listings, drops datetime_in_reviews, prints the resulting
    indexes on reviews/listings, executes `q` `n` times (`n` is the
    module-level run count) and passes the timing summary to write_perf_data.

    The summary is nested under 'listings_join_reviews_{year}', the same
    top-level key no_index uses, so each record in the output file states
    which year it belongs to instead of relying on its position in the list.

    Parameters
    ----------
    q : str
        SQL text to time; all rows are fetched on every run.
    year : int
        Year the query targets; used in the result key and progress message.
    """
    #execute the queries
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_id_in_listings))
        conn.execute(sql_text(q_drop_datetime_in_reviews))
        result_reviews = conn.execute(sql_text(q_show_indexes))
        print()
        print('The set of indexes on reviews and listings is: ')
        print(result_reviews.all())

    # Time n executions. perf_counter() is a monotonic, high-resolution clock;
    # time.time() follows the wall clock and can jump if the system clock is
    # adjusted, which would corrupt individual samples.
    times = []
    for _ in range(n):
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(q)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result3 = {
        f'listings_join_reviews_{year}': {
            '__id_in_listings__': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                # np.std is the population standard deviation (ddof=0)
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result3)

    print(f"Finshed the third combination, with index id_in_listings for year {year}.")

### combination 4
### id_in_listings and datetime_in_reviews both exist

In [81]:
# Create the id_in_listings index on listings(id); no-op if already present.
# (Rebinds q_create_id_in_listings to the same SQL as its earlier definition.)
q_create_id_in_listings = '''
CREATE INDEX IF NOT EXISTS id_in_listings
ON listings(id);
'''
# Create the datetime_in_reviews index on reviews(datetime); no-op if already
# present. (Rebinds q_create_datetime_index_in_reviews to the same SQL as its
# earlier definition.)
q_create_datetime_index_in_reviews = '''
CREATE INDEX IF NOT EXISTS datetime_in_reviews
ON reviews(datetime);
'''

In [None]:
def index_both(q, year):
    """Benchmark query `q` with both id_in_listings and datetime_in_reviews present.

    Creates both indexes if missing, prints the resulting indexes on
    reviews/listings, executes `q` `n` times (`n` is the module-level run
    count) and passes the timing summary to write_perf_data.

    The summary is nested under 'listings_join_reviews_{year}', the same
    top-level key no_index uses, so each record in the output file states
    which year it belongs to instead of relying on its position in the list.

    Parameters
    ----------
    q : str
        SQL text to time; all rows are fetched on every run.
    year : int
        Year the query targets; used in the result key and progress message.
    """
    #execute the queries
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_id_in_listings))
        conn.execute(sql_text(q_create_datetime_index_in_reviews))
        result_reviews = conn.execute(sql_text(q_show_indexes))
        print()
        print('The set of indexes on reviews and listings is: ')
        print(result_reviews.all())

    # Time n executions. perf_counter() is a monotonic, high-resolution clock;
    # time.time() follows the wall clock and can jump if the system clock is
    # adjusted, which would corrupt individual samples.
    times = []
    for _ in range(n):
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(q)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result4 = {
        f'listings_join_reviews_{year}': {
            '__datetime_in_reviews__id_in_listings__': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                # np.std is the population standard deviation (ddof=0)
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result4)

    print(f"Finshed the fourth combination, with index id_in_listings and datetime_in_reviews for year {year}.")


In [83]:
# Benchmark every index combination for each year from 2009 through 2025
# (range's upper bound is exclusive). The order matters only in that each
# function sets up its own index configuration before timing.
for year in range(2009, 2026):
    print(f"Year: {year}")

    # get the query for the year
    q = yyyy_query(year)

    # run the queries with no index
    no_index(q, year)

    # run the queries with index on reviews.datetime
    index_datetime_in_reviews(q, year)

    # run the queries with index on listings.id
    index_id_in_listings(q, year)

    # run the queries with both indexes
    index_both(q, year)

    # f-prefix added: without it the literal text "{year}" was printed.
    print(f"Finished all combinations for year {year}.")


Year: 2009

The set of indexes on reviews and listings is: 
[]
wrote in file successfully
Finshed the first combination, no index for year 2009.

The set of indexes on reviews and listings is: 
[('airbnb', 'reviews', 'datetime_in_reviews', None, 'CREATE INDEX datetime_in_reviews ON airbnb.reviews USING btree (datetime)')]
wrote in file successfully
Finshed the second combination, with index datetime_in_reviews for year 2009.

The set of indexes on reviews and listings is: 
[('airbnb', 'listings', 'id_in_listings', None, 'CREATE INDEX id_in_listings ON airbnb.listings USING btree (id)')]
wrote in file successfully
Finshed the third combination, with index id_in_listings for year 2009.

The set of indexes on reviews and listings is: 
[('airbnb', 'listings', 'id_in_listings', None, 'CREATE INDEX id_in_listings ON airbnb.listings USING btree (id)'), ('airbnb', 'reviews', 'datetime_in_reviews', None, 'CREATE INDEX datetime_in_reviews ON airbnb.reviews USING btree (datetime)')]
wrote in file