In [1]:
import sys
import json
import csv
import os

import pandas as pd
import numpy as np

# import matplotlib as mpl

# useful for printing dict and list objects
import pprint

import time
from datetime import datetime
from urllib.parse import quote

import psycopg2
from sqlalchemy import create_engine, text as sql_text
import psycopg2.extras
from dotenv import load_dotenv


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Populate os.environ from a local .env file (python-dotenv).
load_dotenv()

# Credentials are read from environment variables, never hard-coded here.
db_username = os.environ['db_username']
db_password = os.environ['db_password']

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':' or '/' in a password would otherwise be parsed as URL
# delimiters and silently produce a wrong host/user/password split.
db_url = (
    'postgresql+psycopg2://'
    + quote(db_username, safe='') + ':' + quote(db_password, safe='')
    + '@localhost:5432/airbnb'
)

db_eng = create_engine(
    db_url,
    # The `db_name` environment variable is used as the session search_path.
    connect_args={'options': '-csearch_path={}'.format(os.environ['db_name'])},
    isolation_level='SERIALIZABLE',
)

print("Successfully created db engine.")

Successfully created db engine.


In [3]:
# Calendar years the benchmark queries are restricted to.
years = [
    2009, 2010, 2011, 2012, 2013, 2014,
    2017, 2020, 2024,
]

# Words searched for in the review comments.
words = [
    'horrible',
    'awesome',
    'apartment',
]

# Keys under which each measurement is stored in the results file; the driver
# loop passes them to write_perf_data in this order.
tags = [
    "__",
    "__comments_tsv_in_reviews__",
    "__datetime_in_reviews__",
    "__datetime_in_reviews__comments_tsv_in_reviews__",
]

In [None]:
def ensure_datetime_index():
    """Report whether the `datetime_in_reviews` index exists on `reviews`.

    Despite the name, this function never creates the index; it only looks it
    up in the `pg_indexes` catalog view and describes what it found.

    Returns:
        str: a human-readable, newline-terminated status message.
    """
    check_index_sql = """
        SELECT 1 FROM pg_indexes 
        WHERE tablename = 'reviews' AND indexname = 'datetime_in_reviews';
    """
    with db_eng.connect() as conn:
        found = conn.execute(sql_text(check_index_sql)).fetchone()

    # fetchone() yields None when no catalog row matched.
    if found:
        return "datetime_in_reviews index exists.\n"
    return "datetime_in_reviews index does not exist.\n"

In [5]:
# JSON file that accumulates the benchmark results.

filename = 'text_search_query.json'

# Remove a results file left behind by an earlier run so this run starts empty.
if not os.path.exists(filename):
    print(f"File '{filename}' not found.")
else:
    try:
        os.remove(filename)
    except Exception as e:
        print(f"Error deleting file '{filename}': {e}")
    else:
        print(f"File '{filename}' successfully deleted.")

#write perf data to file
def write_perf_data(word, year, tag, result, save_path= filename):
    """
    Merge one measurement into the JSON results file.

    The file holds a two-level mapping: "<word>_<year>" -> tag -> result.
    Existing content is read back first, so earlier entries are preserved
    and only the (word, year, tag) slot is overwritten.

    - word: word that was searched for
    - year: year the search was restricted to
    - tag: inner key under which `result` is stored
    - result: dict, stats from the query
    - save_path: JSON file path to save the results
    """
    entry_key = f"{word}_{year}"

    # Begin with what is already on disk, or an empty mapping on first use.
    stored = {}
    if os.path.exists(save_path):
        with open(save_path, "r") as fh:
            stored = json.load(fh)

    stored.setdefault(entry_key, {})[tag] = result

    with open(save_path, "w") as fh:
        json.dump(stored, fh, indent=4)

    print(f"successfully wrote {tag} {word} {year} to {save_path}")
    print("\n")


File 'text_search_query.json' successfully deleted.


In [None]:
# construct query for search on word index or not
def no_word_index_query(year, word):
    """Build the count query that matches `word` with ILIKE on `comments`.

    Args:
        year: calendar year to restrict the reviews to (int or numeric string).
        word: text searched for, case-insensitively, as a substring of `comments`.

    Returns:
        str: SQL counting the rows of `reviews` whose `comments` contain
        `word` and whose `datetime` falls inside `year`.

    Raises:
        ValueError: if `year` cannot be converted to an integer.
    """
    year_int = int(year)
    start_date = f"{year_int}-01-01"
    # Half-open upper bound: `<= 'YYYY-12-31'` would stop at midnight of
    # Dec 31 if the column holds timestamps, dropping the rest of that day.
    next_year_start = f"{year_int + 1}-01-01"
    # Double any single quote so a word like "don't" cannot end the SQL literal.
    safe_word = str(word).replace("'", "''")

    query = f"""
    SELECT count(*) 
    FROM reviews r 
    WHERE comments ILIKE '%{safe_word}%'
      AND datetime >= '{start_date}'
      AND datetime < '{next_year_start}';
    """

    return query

def word_index_query(year, word):
    """Build the count query that matches `word` via full-text search.

    The match is `comments_tsv @@ to_tsquery(word)`, i.e. it runs against the
    `comments_tsv` column rather than the raw `comments` text.

    Args:
        year: calendar year to restrict the reviews to (int or numeric string).
        word: term handed to `to_tsquery`.

    Returns:
        str: SQL counting the rows of `reviews` that match `word` and whose
        `datetime` falls inside `year`.

    Raises:
        ValueError: if `year` cannot be converted to an integer.
    """
    year_int = int(year)
    start_date = f"{year_int}-01-01"
    # Half-open upper bound: `<= 'YYYY-12-31'` would stop at midnight of
    # Dec 31 if the column holds timestamps, dropping the rest of that day.
    next_year_start = f"{year_int + 1}-01-01"
    # Double any single quote so the word cannot end the SQL literal early.
    safe_word = str(word).replace("'", "''")

    query = f"""
    SELECT count(*) 
    FROM reviews r 
    WHERE comments_tsv @@ to_tsquery('{safe_word}')
      AND datetime >= '{start_date}'
      AND datetime < '{next_year_start}';
    """

    return query

# DDL that adds the index on reviews.datetime (no-op when it already exists).
q_create_datetime_index = (
    "\n"
    "CREATE INDEX IF NOT EXISTS datetime_in_reviews\n"
    "ON reviews(datetime);\n"
)

# DDL that removes that index again (no-op when it is absent).
q_drop_datetime_index = (
    "\n"
    "DROP INDEX IF EXISTS datetime_in_reviews;\n"
)

# number of times each query is executed per measurement
n = 50

### Using the index comments_tsv_in_reviews VS. Not using an index and doing searches in the comments field

### Using or not using the index datetime in reviews

The four index combinations are:

1. no indexes (at least, the queries will not be using any indexes);
2. using the index on datetime only;
3. using the index on comments (more accurately, on comments_tsv) only;
4. using the indexes on both datetime and comments(_tsv).

In [None]:
#use for:
# 1. no index at all (query_fun = no_word_index_query)
# 2. only the tsv index (query_fun = word_index_query)
def no_word_index(query_fun, year, word):
    """Time a query `n` times with the datetime index dropped.

    Args:
        query_fun: callable `(year, word) -> str` that builds the SQL to time.
        year: forwarded to `query_fun`.
        word: forwarded to `query_fun`.

    Returns:
        dict: 'avg', 'min', 'max' and 'std' of the wall-clock seconds per run
        (rounded to 4 decimals), 'count' (number of runs) and 'timestamp'
        (local time when the measurement finished).
    """
    query = query_fun(year, word)

    # engine.begin() commits when the block exits. With a plain connect()
    # block SQLAlchemy 2.x rolls the transaction back on close, so the DROP
    # would never take effect and the index would still be in place.
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_drop_datetime_index))

    # Each run uses its own connection block, so the measured time includes
    # obtaining and releasing the connection as well as executing the query.
    times = []
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        with db_eng.connect() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    return {
        'avg': round(sum(times) / len(times), 4),
        'min': round(min(times), 4),
        'max': round(max(times), 4),
        'std': round(np.std(times), 4),
        'count': len(times),
        'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
    }

In [None]:
# using both indexes with @@
def using_both_index(year, word):
    """Time the full-text (`@@`) query `n` times with the datetime index present.

    Args:
        year: calendar year passed to `word_index_query`.
        word: search term passed to `word_index_query`.

    Returns:
        dict: 'avg', 'min', 'max' and 'std' of the wall-clock seconds per run
        (rounded to 4 decimals), 'count' (number of runs) and 'timestamp'
        (local time when the measurement finished).
    """
    # engine.begin() commits when the block exits. With a plain connect()
    # block SQLAlchemy 2.x rolls the transaction back on close, so the index
    # would never actually be created before the timed runs.
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index))

    query = word_index_query(year, word)  # query that matches on comments_tsv

    # Each run uses its own connection block, so the measured time includes
    # obtaining and releasing the connection as well as executing the query.
    times = []
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        with db_eng.connect() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    return {
        'avg': round(sum(times) / len(times), 4),
        'min': round(min(times), 4),
        'max': round(max(times), 4),
        'std': round(np.std(times), 4),
        'count': len(times),
        'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
    }

In [None]:
# using the index datetime_in_reviews only with 'ilike' query
def only_datetime_index(year, word):
    """Time the ILIKE query `n` times with the datetime index present.

    Args:
        year: calendar year passed to `no_word_index_query`.
        word: search term passed to `no_word_index_query`.

    Returns:
        dict: 'avg', 'min', 'max' and 'std' of the wall-clock seconds per run
        (rounded to 4 decimals), 'count' (number of runs) and 'timestamp'
        (local time when the measurement finished).
    """
    # engine.begin() commits when the block exits. With a plain connect()
    # block SQLAlchemy 2.x rolls the transaction back on close, so the index
    # would never actually be created before the timed runs.
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index))

    query = no_word_index_query(year, word)  # query that matches with ILIKE

    # Each run uses its own connection block, so the measured time includes
    # obtaining and releasing the connection as well as executing the query.
    times = []
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        with db_eng.connect() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    return {
        'avg': round(sum(times) / len(times), 4),
        'min': round(min(times), 4),
        'max': round(max(times), 4),
        'std': round(np.std(times), 4),
        'count': len(times),
        'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
    }

In [10]:
for y in years:
    for w in words:
        # Evaluated strictly in this order: the first two measurements drop
        # the datetime index, the last two create it again.
        measurements = (
            # no datetime index, ILIKE query
            (tags[0], lambda: no_word_index(no_word_index_query, y, w)),
            # no datetime index, @@ query on comments_tsv
            (tags[1], lambda: no_word_index(word_index_query, y, w)),
            # datetime index present, ILIKE query
            (tags[2], lambda: only_datetime_index(y, w)),
            # datetime index present, @@ query on comments_tsv
            (tags[3], lambda: using_both_index(y, w)),
        )
        for tag, measure in measurements:
            write_perf_data(w, y, tag, measure())
        


no word index expected datetime_in_reviews inex does not exists.

successfully wrote __ horrible 2009 to text_search_query.json


no word index expected datetime_in_reviews inex does not exists.

successfully wrote __comments_tsv_in_reviews__ horrible 2009 to text_search_query.json


in only date time index datetime_in_reviews index exists.

successfully wrote __datetime_in_reviews__ horrible 2009 to text_search_query.json


in using both index datetime_in_reviews index exists.

successfully wrote __datetime_in_reviews__comments_tsv_in_reviews__ horrible 2009 to text_search_query.json


no word index expected datetime_in_reviews inex does not exists.

successfully wrote __ awesome 2009 to text_search_query.json


no word index expected datetime_in_reviews inex does not exists.

successfully wrote __comments_tsv_in_reviews__ awesome 2009 to text_search_query.json


in only date time index datetime_in_reviews index exists.

successfully wrote __datetime_in_reviews__ awesome 2009 to text_