In [1]:
# standard library
import csv
import json
import os
# useful for printing dict and list objects
import pprint
import sys
import time
from datetime import datetime
from urllib.parse import quote

# third-party
import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
from dotenv import load_dotenv
from sqlalchemy import create_engine, text as sql_text

# import matplotlib as mpl


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
load_dotenv()

# Read the database credentials from the environment; a missing variable
# raises KeyError here instead of producing a malformed connection URL later.
db_username = os.environ['db_username']
db_password = os.environ['db_password']

# Percent-encode both credentials so that characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters when the connection URL is parsed.
db_url = (
    'postgresql+psycopg2://'
    + quote(db_username, safe='') + ':' + quote(db_password, safe='')
    + '@localhost:5432/airbnb'
)

# Engine for the local "airbnb" database, with search_path pinned to "public"
# and every connection running at SERIALIZABLE isolation.
db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('public')},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


In [None]:
filename = 'text_search_query.json'

# Start each run from a clean results file. The removal is attempted directly
# rather than guarded by os.path.exists(), which avoids a check-then-act race,
# and only OS-level failures are reported instead of every possible exception.
try:
    os.remove(filename)
except FileNotFoundError:
    print(f"File '{filename}' not found.")
except OSError as e:
    print(f"Error deleting file '{filename}': {e}")
else:
    print(f"File '{filename}' successfully deleted.")

#write perf data to file
def write_perf_data(new_data, path=None):
    """Append one result record to a JSON results file.

    The file is read, the new record is appended to the stored list, and the
    whole list is written back. A missing or unparseable file is treated as
    having no previous results. If the file holds a single non-list JSON
    value, that value becomes the first element of the new list.

    Args:
        new_data: JSON-serialisable object to append.
        path: File to update. Defaults to the notebook-level ``filename``.
    """
    target = filename if path is None else path

    # Open directly and handle the failure cases, rather than checking for
    # existence first.
    try:
        with open(target, 'r', encoding='utf-8') as f:
            old_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        old_data = []

    # combine old and new data
    if not isinstance(old_data, list):
        old_data = [old_data]
    old_data.append(new_data)

    # write to file
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(old_data, f, indent=4)

    print("wrote in file successfully")


File 'Step3b.json' successfully deleted.


In [4]:
# functions to get the query for the year
def _year_bounds(year):
    """Return ('YYYY-01-01', 'YYYY+1-01-01') delimiting `year` as a half-open range."""
    year_int = int(year)
    return f"{year_int}-01-01", f"{year_int + 1}-01-01"


def no_index_query(year, word):
    """Build the ILIKE-based count query for reviews of `year` containing `word`.

    The date filter is the half-open range [Jan 1 of `year`, Jan 1 of the
    next year). The original upper bound was ``<= 'YYYY-12-31'``.
    NOTE(review): the type of reviews.datetime is not visible here. If it is
    a timestamp, the old bound excluded everything after midnight on Dec 31;
    if it is a date, both forms select the same rows.

    `word` is interpolated verbatim into the SQL text, so only pass trusted
    values that contain no single quotes.

    Args:
        year: Calendar year (int or numeric string).
        word: Substring to search for in ``comments`` (case-insensitive).

    Returns:
        The SQL statement as a string.
    """
    start_date, end_date = _year_bounds(year)

    query = f"""
    SELECT count(*)
    FROM reviews r
    WHERE comments ILIKE '%{word}%'
      AND datetime >= '{start_date}'
      AND datetime < '{end_date}';
    """

    return query


def index_query(year, word):
    """Build the full-text (``@@``) count query for reviews of `year` matching `word`.

    Uses the same half-open date range as ``no_index_query`` so both queries
    cover an identical period.

    `word` is interpolated verbatim into ``to_tsquery``, so only pass trusted
    values that contain no single quotes.

    Args:
        year: Calendar year (int or numeric string).
        word: Term handed to ``to_tsquery`` and matched against ``comments_tsv``.

    Returns:
        The SQL statement as a string.
    """
    start_date, end_date = _year_bounds(year)

    query = f"""
    SELECT count(*)
    FROM reviews r
    WHERE comments_tsv @@ to_tsquery('{word}')
      AND datetime >= '{start_date}'
      AND datetime < '{end_date}';
    """

    return query

# number of timed executions per query; passed as `n` to the benchmark functions below
n=3

### Using the index comments_tsv_in_reviews VS. Not using an index and doing searches in the comments field

### Using or not using the index datetime in reviews

The 4 index combinations are:

- no indexes (or at least, the queries will not use any index);
- index on datetime only;
- index on comments (more accurately, on comments_tsv) only;
- indexes on both datetime and comments(_tsv).

In [5]:
def index_or_not(query_fun, year, word, n, has_index_or_not):
    """Time a query `n` times and append the timing summary to the results file.

    Args:
        query_fun: Callable ``(year, word) -> str`` returning the SQL to run.
        year: Forwarded to ``query_fun``; also part of the result key.
        word: Forwarded to ``query_fun``; also part of the result key.
        n: Number of timed executions; must be at least 1.
        has_index_or_not: Label of the index configuration being measured,
            used as the inner key of the stored result.

    Raises:
        ValueError: If ``n`` is smaller than 1 (no timings to summarise).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    query = query_fun(year, word)

    # run it n times
    times = []
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result1 = {
        f'{word}_{year}': {
            has_index_or_not: {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result1)

    print(f"Finshed: {has_index_or_not} for {word}_{year}.")

In [6]:
# using both indexes with @@
def using_both_index(year, word, n):
    """Benchmark the ``@@`` query with the datetime index in place.

    Creates ``datetime_in_reviews`` if it does not exist, runs the query from
    ``index_query`` `n` times, and appends the timing summary to the results
    file under the key 'Index on both datetime_in_reviews and comments_tsv'.

    Args:
        year: Calendar year to filter on; also part of the result key.
        word: Search term; also part of the result key.
        n: Number of timed executions; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1 (no timings to summarise).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    q_create_datetime_index = '''
    CREATE INDEX IF NOT EXISTS datetime_in_reviews
    ON reviews(datetime);
    '''

    # Run the DDL inside begin() so it is committed when the block exits.
    # With a plain connect() and no explicit commit, SQLAlchemy 2.x rolls the
    # transaction back on close and the index would never actually exist.
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index))

    query = index_query(year, word) #with @@ query

    # run it n times
    times = []
    for _ in range(n):
        # monotonic, high-resolution clock for interval measurement
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result2 = {
        f'{word}_{year}': {
            'Index on both datetime_in_reviews and comments_tsv': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result2)

    print(f"Finshed: index on both for {word}_{year}.")

In [7]:
# using the index datetime_in_reviews only with 'ilike' query
def only_datetime_index(year, word, n):
    """Benchmark the ILIKE query with the datetime index in place.

    Creates ``datetime_in_reviews`` if it does not exist, runs the query from
    ``no_index_query`` `n` times, and appends the timing summary to the
    results file under the key 'Index datetime_in_reviews only'.

    Args:
        year: Calendar year to filter on; also part of the result key.
        word: Substring to search for; also part of the result key.
        n: Number of timed executions; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1 (no timings to summarise).
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    q_create_datetime_index = '''
    CREATE INDEX IF NOT EXISTS datetime_in_reviews
    ON reviews(datetime);
    '''

    # Run the DDL inside begin() so it is committed when the block exits.
    # With a plain connect() and no explicit commit, SQLAlchemy 2.x rolls the
    # transaction back on close and the index would never actually exist.
    with db_eng.begin() as conn:
        conn.execute(sql_text(q_create_datetime_index))

    query = no_index_query(year, word) #with 'ilike' query

    # run it n times
    times = []
    for _ in range(n):
        # monotonic, high-resolution clock for interval measurement
        start = time.perf_counter()
        with db_eng.begin() as conn:
            conn.execute(sql_text(query)).fetchall()
        times.append(time.perf_counter() - start)

    #store results
    result3 = {
        f'{word}_{year}': {
            'Index datetime_in_reviews only': {
                'avg': round(sum(times) / len(times), 4),
                'min': round(min(times), 4),
                'max': round(max(times), 4),
                'std': round(float(np.std(times)), 4),
                'count': len(times),
                'timestamp': datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            }
        }
    }

    # write to json file
    write_perf_data(result3)

    print(f"Finshed: only datetime index for {word}_{year}.")

In [8]:
years = [2009, 2010, 2011, 2012, 2013, 2014, 2017, 2020, 2024]
words = ['horrible', 'awesome', 'apartment']


def _drop_datetime_index():
    """Drop datetime_in_reviews so the following runs are measured without it."""
    with db_eng.begin() as conn:
        conn.execute(sql_text('DROP INDEX IF EXISTS datetime_in_reviews;'))


for y in years:
    for w in words:
        # using_both_index / only_datetime_index create datetime_in_reviews and
        # never remove it. Without this drop, every "comments_tsv only" and
        # "no_index" measurement after the first iteration (or after an earlier
        # notebook run) would be taken with the datetime index present.
        _drop_datetime_index()

        # use the index on comments_tsv only with @@
        index_or_not(index_query, y, w, n, 'Index on comments_tsv only')
        # no indexes with 'ilike'
        index_or_not(no_index_query, y, w, n, 'no_index')
        # using both indexes with @@ (creates the datetime index)
        using_both_index(y, w, n)
        # using the index datetime_in_reviews only with 'ilike' query
        only_datetime_index(y, w, n)

wrote in file successfully
Finshed: Index on comments_tsv only for horrible_2009.
wrote in file successfully
Finshed: no_index for horrible_2009.
wrote in file successfully
Finshed: index on both for horrible_2009.
wrote in file successfully
Finshed: only datetime index for horrible_2009.
wrote in file successfully
Finshed: Index on comments_tsv only for awesome_2009.
wrote in file successfully
Finshed: no_index for awesome_2009.
wrote in file successfully
Finshed: index on both for awesome_2009.
wrote in file successfully
Finshed: only datetime index for awesome_2009.
wrote in file successfully
Finshed: Index on comments_tsv only for apartment_2009.
wrote in file successfully
Finshed: no_index for apartment_2009.
wrote in file successfully
Finshed: index on both for apartment_2009.
wrote in file successfully
Finshed: only datetime index for apartment_2009.
wrote in file successfully
Finshed: Index on comments_tsv only for horrible_2010.
wrote in file successfully
Finshed: no_index for