## <span style=color:blue>Patterns used in Programming Assignment 2 (version mostly avoiding the util.py file)  </span>

In [1]:
# These are boiler plate imports that seem useful
# Perhaps cleaner would be to delete or comment out the ones that aren't used in this script...

import sys
import json
import csv
import yaml

import pandas as pd
import numpy as np

import datetime;

import matplotlib as mpl

import time
from datetime import datetime
# see https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time-in-python
#   for some basics about datetime

import pprint

import os
from dotenv import load_dotenv

# sqlalchemy 2.0 documentation: https://www.sqlalchemy.org/
import psycopg2
from sqlalchemy import create_engine, text as sql_text

# the following is deprecated, it seems, so using the sqlalchemy
# from pyscopg2 import sqlio

# the file in benchmarking/util.py should hold utilities useful for your benchmarking exercise
# In this notebook we have commented out all mentions of util, so that you can run
#    this notebook before setting up your benchmarking/util.py file
# sys.path.append('benchmarking/')
# import util
# to invoke a function "foo()" inside util.py, use "util.foo()"

In [2]:
# test that utils.py has been imported well

In [3]:
dotenv_path = 'variables.env'
load_dotenv(dotenv_path=dotenv_path)

True

In [4]:

load_dotenv()

schema = os.getenv('DISC_4_SCHEMA')
port = os.getenv('DISC_4_PORT')
host = os.getenv('DISC_4_HOST')
database = os.getenv('DISC_4_DB')
username = os.getenv('username')
password = os.getenv('password')

### <span style=color:blue>For this exercise you will use four .csv files from AirBnB.</span>

<span style=color:blue>You can find the files at https://drive.google.com/drive/folders/14gWh0ck3vzWxyakaWHHH38AgWY7UC-IQ?usp=sharing </span> 

### <span style=color:blue>Setting up Postgres connection.  Note database name is "airbnb" </span>

### <span style=color:blue>Note: this should be modified so that the user name/password are not included into the program. </span>

<span style=color:blue>E.g., see https://docs.sqlalchemy.org/en/20/core/engines.html for how to construct the URLs that the create_engine command uses.  Also, one should store the user/password into environment variables and read them in to populate the URL.  </span>

<span style=color:blue>E.g., see https://stackoverflow.com/questions/4906977/how-can-i-access-environment-variables-in-python for how to work with environment variables on mac, </span>

In [5]:
# following https://www.geeksforgeeks.org/connecting-postgresql-with-sqlalchemy-in-python/

db_eng = create_engine(f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(f"{schema}")},
                       isolation_level = 'SERIALIZABLE')
#    , echo=True)
#    , echo_pool="debug")

print("Successfully created db engine.")

# connect_args is used to set search_path to the schema 'new_york_city' in the airbnb database

# isolation_level SERIALIZABLE makes transactions happen in sequence, which is good 
#      for the benchmarking we will be doing

# for general info on sqlalchemy connections,
#    see: https://docs.sqlalchemy.org/en/20/core/connections.html

# echo from https://docs.sqlalchemy.org/en/20/core/engines.html

Successfully created db engine.


### <span style=color:blue>Here is a pattern for using db_eng for queries</span>

In [6]:
q1 = """ 
SELECT *
FROM reviews 
WHERE date >= '2015-01-01' 
  AND date <= '2015-12-31' 
"""
q2 = """ 
SELECT count(*)
FROM reviews 
WHERE date >= '2015-01-01' 
  AND date <= '2015-12-31' 
"""

# You can use conn.execute, which populates a cursor, in this case "result1" or "result2"
# Alternatively, you can use pd.read_sql, which populates a dataframe
with db_eng.connect() as conn:
    result1 = conn.execute(sql_text(q1))   # sql_text was part of import from psycopg2
    df1 = pd.read_sql(q1, con=conn)
    
    result2 = conn.execute(sql_text(q2))
    df2 = pd.read_sql(q2, con=conn)
    # conn.close() is automatically added to the end of this block

print()
print(type(result1))
print()
print(type(df1))
print()
pprint.pp(result1.fetchmany(3), width=120)
print()
pprint.pp(df1.head(3))
print()
print(result2.all())            # result is small, so can fetch all of it
print()
pprint.pp(df2.head(10))


<class 'sqlalchemy.engine.cursor.CursorResult'>

<class 'pandas.core.frame.DataFrame'>

[('705749', '24678827', datetime.date(2015, 1, 1), '4107079', 'Shu & Maki-San', 'Very friendly and always willing to help us when needed! Would come back again!', datetime.datetime(2015, 1, 1, 12, 0)),
 ('824421', '24668863', datetime.date(2015, 1, 1), '5548596', 'Ezer', 'Dina was great! She was very communicative, and her place was very nice and well kept. ', datetime.datetime(2015, 1, 1, 12, 0)),
 ('689329', '24668961', datetime.date(2015, 1, 1), '22591652', 'Carina', 'Biren and his family are amazing hosts. They explained all about the city and made great suggestions of tours. The room is great, very comfortable and clean like all the house. The neighborhood is very quiet and safe. I highly recommend it! ', datetime.datetime(2015, 1, 1, 12, 0))]

  listing_id        id        date reviewer_id   reviewer_name  \
0     705749  24678827  2015-01-01     4107079  Shu & Maki-San   
1     824421  24668

### <span style=color:blue>Example of pattern for creating parameterized functions for creating (parameterized) queries</span>

<span style=color:blue>As part of Programming Assignment 2, you will create several query building functions,
and put them into your utils.py file</span>

In [7]:
def build_query_reviews_count(date1, date2):
    q21 = """
SELECT count(*)
FROM reviews
WHERE date >= '"""
    q22 = """'
  AND date <= '"""
    q23 = """';
"""
    return q21 + date1 + q22 + date2 + q23

print(build_query_reviews_count('2015-01-01', '2015-12-31'))


SELECT count(*)
FROM reviews
WHERE date >= '2015-01-01'
  AND date <= '2015-12-31';



In [8]:
sys.path.append('/Users/Nfaith21/Downloads/DISC_5_FILES/benchmarking/')
import util_main as util

In [9]:
util.hello_world()

Hello World!


<span style=color:blue>We now show a query that will be used below to illustrated various things. You should build a function, perhaps called "build_query_listings_join_reviews" that takes two parameters for start date and end date, that can build this kind of query. </span> 

In [10]:
q_listings_join_reviews_2015 = """
SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2015-01-01'
  AND r.date <= '2015-12-31'
ORDER BY l.id;
"""

# The following code would work if you have the function build_query_listings_join_reviews()
#    defined in your util.py file



q_dict = {}

q_dict['listings_join_reviews_2013'] = util.build_query_listings_join_reviews('2013-01-01', '2013-12-31')
# note: The reviews table has 7,317 entries in 2013

#q_dict['listings_join_reviews_2015'] = util.build_query_listings_join_reviews('2015-01-01', '2015-12-31')
# note: The reviews table has 28,465 entries in 2015

#q_dict['listings_join_reviews_2019'] = util.build_query_listings_join_reviews('2019-01-01', '2019-12-31')
# note: The reviews table has 126,469 entries in 2019

q_dict['listings_join_reviews_2023'] = util.build_query_listings_join_reviews('2023-01-01', '2023-12-31')
# note: The reviews table has 228,831 entries in 2023

print(q_dict['listings_join_reviews_2013'])
print()
#print(q_dict['listings_join_reviews_2015'])
#print()
#print(q_dict['listings_join_reviews_2019'])
#print()
print(q_dict['listings_join_reviews_2023'])

print()

SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2013-01-01'
  AND r.date <= '2013-12-31'
ORDER BY l.id;

SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2023-01-01'
  AND r.date <= '2023-12-31'
ORDER BY l.id;



### <span style=color:blue>Here is a pattern for computing the run-time of something, e.g., a query or an update.</span>

<span style=color:blue>You should also put this into your util.py file.</span>

In [11]:
def time_diff(time1, time2):
    return (time2-time1).total_seconds()

# testing it:
time1 = datetime.now()
# put query or update code in place of sleep command
time.sleep(0.5)
time2 = datetime.now()

print(time_diff(time1,time2))
    

0.505682


### <span style=color:blue>Here is an example of running a query multiple times, and keeping track of run times</span>

<span style=color:blue>As part of Programming Assignment 2, you should create a general-purpose function for doing this,
and put it into your utils.py file<span>

<span style=color:blue>In the illustration below we read the output of the query into a dataframe, which ensures that the entire output is computed and exported by PostgreSQL.  If we read the output into a cursor, then PostgreSQL might use a "lazy" approach, and not compute the full query output until we scroll through the cursor. </span>

In [12]:
# we will use the query q_listings_join_reviews_2015 defined above

count = 20

time_list = []
for i in range(0,count):
    time_start = datetime.now()
    # Open new db connection for each execution of the query to avoid multithreading
    with db_eng.connect() as conn:
        df = pd.read_sql(q_listings_join_reviews_2015, con=conn)

    time_end = datetime.now()
    diff = time_diff(time_start, time_end)
    time_list.append(diff)

pprint.pp(time_list)
print(round(sum(time_list)/len(time_list), 4), \
        round(min(time_list), 4), \
        round(max(time_list), 4), \
        round(np.std(time_list), 4))

[0.075127,
 0.09525,
 0.035836,
 0.037773,
 0.027877,
 0.028924,
 0.028112,
 0.030384,
 0.030233,
 0.029305,
 0.029793,
 0.031203,
 0.031586,
 0.030773,
 0.032865,
 0.030598,
 0.046304,
 0.032027,
 0.030279,
 0.030729]
0.0372 0.0279 0.0953 0.0168


### <span style=color:blue>Here is a pattern for adding/dropping indexes. </span>

<span style=color:blue>As part of programming exercise 2 you should create a general-purpose parameterized function that can be used to add or drop an index with a given name, focused on a given table, and on a given column of that table.  After testing that the function behaves as you expect it then you should put that function into the file utils.py. </span>

<span style=color:blue>For this function, I used the name add_drop_index() with four arguments:  db_eng, add/drop, column to index, table.  I assume a systematic naming of the indexes, having the form <col-name>_in_<table_name></span>

<span style=color:blue>(The "show_indexes" queries are mainly for testing that the add/drop index functions are working correctly.)<span>

In [13]:
q_create_date_in_reviews = '''
BEGIN TRANSACTION;
CREATE INDEX IF NOT EXISTS date_in_reviews
ON reviews(date);
END TRANSACTION;
'''

q_drop_date_in_reviews = '''
BEGIN TRANSACTION;
DROP INDEX IF EXISTS date_in_reviews;
END TRANSACTION;
'''

q_show_indexes_for_reviews = '''
select *
from pg_indexes
where tablename = 'reviews';
'''

# by using a code block, it ensures that after completion 
#     the change to the indexes will be committed in the database
with db_eng.connect() as conn:
    conn.execute(sql_text(q_create_date_in_reviews))
    #conn.execute(sql_text(q_drop_date_in_reviews))
    result_reviews = conn.execute(sql_text(q_show_indexes_for_reviews))
    print()
    print('The set of indexes on reviews is: ')
    print(result_reviews.all())



The set of indexes on reviews is: 
[('houses', 'reviews', 'date_in_reviews', None, 'CREATE INDEX date_in_reviews ON houses.reviews USING btree (date)'), ('houses', 'reviews', 'datetime_in_reviews', None, 'CREATE INDEX datetime_in_reviews ON houses.reviews USING btree (datetime)')]


### <span style=color:blue>Now there is an index on the date column of reviews.  Rerun the preceding cell to see if the performance on the query q_listings_join_reviews_2015 has changed </span>

### <span style=color:blue>The performance results will be held in a file 'perf_data/perf_summary.json' in your base directory. The format of this json file is described here. </span>

<span style=color:blue> Also, this cell shows functions for fetching the previous performance data (stored as json in  "perf_data/perf_summary.json"), and then writing it out again (after you have adding in more data).  This will allow you to run numerous tests at different times, but keep all of the results in one place.</span>


In [14]:
# the key for each entry of perf_dict will be the name of a query or update
# the value for each entry of perf_dict will be a "perf_dict" with keys that 
#     list all indexes that were in force at the time of the test run.  E.g.:
# 
#        { '__' : ...,                                     -- i.e., no indexes in force
#          '__id_in_listings__' : ...,                     -- indexes in force: { id_in_listings }  
#          '__date_in_reviews__' : ...,                    -- indexes in force: { date_in_reviews }
#          '__date_in_reviews__id_in_listings__' : ... }   -- indexes in force: { date_in_reviews, id_in_listings }

# the value for each entry of the inner dict will have be a "performance profile" (perf_prof):
#       having shape {avg: ..., min: ..., max: ..., std: ...}
# (please see below for an example)


# fetches filename (which should be a json file) and returns a 
#       dict corresponding to the contents of filename
def fetch_perf_data(filename):
    f = open('/Users/Nfaith21/Documents/ECS 116 - Misc/DISC_5_FILES/perf_data' + filename)
    return json.load(f)

# writes the dictionary in dict as a json file into filename
def write_perf_data(dict, filename):
    with open('/Users/Nfaith21/Documents/ECS 116 - Misc/DISC_5_FILES/perf_data' + filename, 'w') as fp:
        json.dump(dict, fp)

# testing:
test = { 'foo': 'goo', 'foo1' : {'hoo': 'boo', 'zoo': 'loo'}}
write_perf_data(test, 'test.json')
dict = fetch_perf_data('test.json')
pprint.pp(dict, indent=4)

{'foo': 'goo', 'foo1': {'hoo': 'boo', 'zoo': 'loo'}}


<span style=color:blue>Run the next code once to initialize the file 'perf_data/perf_summary.json'; then comment it out!</span>

In [15]:
# initialize the performance data perf_summary.json file to {}
write_perf_data({}, 'perf_summary.json')

# sanity check
perf_summary = fetch_perf_data('perf_summary.json')
pprint.pp(perf_summary, indent=4)

{}


### <span style=color:blue>Here is an illustration of how you can perform one test (with specified indexes) on one query</span>

#### <span style=color:blue>CAUTION: the next cell is using two functions that I have set up in my benchmarking/util.py file, so it will not run for you until you set up these functions.  </span>

<span style=color:blue>As part of the progamming exercise, you should create one or more parameterized functions that will enable you to invoke this kind of test numerous times, on a selected query/update and a set of selected indexes.

<span style=color:blue>To provide a small illustration of the family of performance values that you will be obtaining I have run the following cell four times on the same query, but using different combinations of indexes.  Can you explain why there are different running times for different combinations of indexes?  Also, do you get roughly the same numbers as I do -- why or why not?  Do you get the same numbers if you run the test for a given set of indexes twice -- why or why not?</span>


In [16]:
'''
# the variable all_indexes will hold all of the indexes involved in your testing.
#   For now there are 3 indexes, but there will be more.  set of all indexes will get bigger once we do more explorations
# Here, a pair ['col','table'] refers to an index on column 'col' in table 'table'
# (in an ideal world, we would keep a copy of this on disk, probably in your computer's file system,
#   and read it in when we want to use it and/or add to it.  For the full Programming Assignment 2
#   we will be working with 4 to 6 indexes)

all_indexes = [['date','reviews'], ['date','calendar'], ['id','listings']] 


# pull in performance summary from previous tests done
perf_summary = fetch_perf_data('perf_summary.json')

# we will use the same query as above, and call it 'listings_join_reviews_2015'
#   in perf_summary.json, info about different runs for this query are
#   held in perf_summary[<<query_name>>]

# q = q_dict[query_name]
q_listings_join_reviews_2015 = """
SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2015-01-01'
  AND r.date <= '2015-12-31'
ORDER BY l.id;
"""

query_name = 'listings_join_reviews_2015'


# here the spec is a listing of column-table pairs corresponding to indexes that are
#    to be included in the test
# I have run this jupyter cell on the 4 specs listed below
spec = [['id','listings'], ['date','reviews']]
# spec = [['date','reviews']]
# spec = [['id','listings']]
# spec = []

# count will hold the number of times we want to run the query
count = 50

print('Processing spec: ', str(spec), '\n')
for index in all_indexes:
    if index not in spec:
        mod_index = util.add_drop_index(db_eng, 'drop', index[0], index[1])
        print('\nAfter doing the drop for', str(index), 'the indexes on table "' + index[1] + '" are: ')
        print(mod_index)
        
for index in spec:
    mod_index = util.add_drop_index(db_eng, 'add', index[0], index[1])
    print('\nAfter doing the add for', str(index), 'the indexes on table "' + index[1] + '" are: ')
    print(mod_index)

time_list = []
for i in range(0,count):
    time_start = datetime.now()
    # Open new db connection for each execution of the query to avoid multithreading
    with db_eng.connect() as conn:
        df = pd.read_sql(q_listings_join_reviews_2015, con=conn)
    time_end = datetime.now()
    diff = time_diff(time_start, time_end)
    time_list.append(diff)
    
perf_profile = {}
perf_profile['avg'] = round(sum(time_list)/len(time_list), 4)
perf_profile['min'] = round(min(time_list), 4)
perf_profile['max'] = round(max(time_list), 4)
perf_profile['std'] = round(np.std(time_list), 4)

print('\nThe list of running times is as follows:')
pprint.pp(time_list)

print('\nThe statistics on the list of running times are as follows:')
pprint.pp(perf_profile)

# util.build_index_description_key() creates a listing of strings corresponding
#    to the entries in spec, and concatenates them in the ordering given by all_indexes
#    For example, the description_key associated with having indexes date_in_reviews and id_in_listings
#        would be __date_in_reviews__id_in_listings__'
#        (You probably want to use a uniform ordering of index names when you create these description_keys
key_value = util.build_index_description_key(all_indexes, spec)
print('\nThe new value for"' + key_value + '"will be', str(perf_profile))


# we may have run some other tests with the query q_listings_join_reviews_2015' and
#   we don't want to overwrite those.  So we need to get the full contents
#   of perf_summary['listings_join_reviews_2015'] and then
#   write (or overwrite) the value for the current list of indexes used

if query_name in perf_summary:
    perf_dict = perf_summary[query_name]
    print("\nBefore modifying perf_dict, the value of perf_summary[query_name] (if it existed) was: ")
    pprint.pp(perf_dict)
else:
    perf_dict = {}
    print("\nBefore modifying perf_dict, the value of perf_summary[query_name] had empty value")
print()
perf_dict[key_value] = perf_profile
perf_summary['listings_join_reviews_2015'] = perf_dict

print("\nAfter modifying perf_dict, the value of perf_summary[query_name] is: ")
pprint.pp(perf_summary[query_name])
print()

print('\nThe full value of perf_summary is:')
pprint.pp(perf_summary)

write_perf_data(perf_summary, 'perf_summary.json')'''


'\n# the variable all_indexes will hold all of the indexes involved in your testing.\n#   For now there are 3 indexes, but there will be more.  set of all indexes will get bigger once we do more explorations\n# Here, a pair [\'col\',\'table\'] refers to an index on column \'col\' in table \'table\'\n# (in an ideal world, we would keep a copy of this on disk, probably in your computer\'s file system,\n#   and read it in when we want to use it and/or add to it.  For the full Programming Assignment 2\n#   we will be working with 4 to 6 indexes)\n\nall_indexes = [[\'date\',\'reviews\'], [\'date\',\'calendar\'], [\'id\',\'listings\']] \n\n\n# pull in performance summary from previous tests done\nperf_summary = fetch_perf_data(\'perf_summary.json\')\n\n# we will use the same query as above, and call it \'listings_join_reviews_2015\'\n#   in perf_summary.json, info about different runs for this query are\n#   held in perf_summary[<<query_name>>]\n\n# q = q_dict[query_name]\nq_listings_join_re

In [17]:
util.calc_time_diff_per_year(db_eng, count, q_dict)


The list of running times for listings_join_reviews_2013 is as follows:
[0.130186,
 0.013989,
 0.013155,
 0.012604,
 0.01284,
 0.02474,
 0.011884,
 0.01343,
 0.013153,
 0.013115,
 0.01381,
 0.012003,
 0.013118,
 0.0123,
 0.011675,
 0.012756,
 0.012766,
 0.012174,
 0.012304,
 0.012304]

The statistics on the list of running times for listings_join_reviews_2013 are as follows:
{'avg': 0.0192,
 'exc_count': 20,
 'max': 0.1302,
 'min': 0.0117,
 'std': 0.0256,
 'timestamp': '2024-05-20-16:19:06'}

The list of running times for listings_join_reviews_2023 is as follows:
[1.458297,
 1.386197,
 0.683644,
 0.159354,
 0.153453,
 0.190002,
 0.166119,
 0.155634,
 0.193969,
 0.179732,
 0.15281,
 0.162598,
 0.21688,
 0.179939,
 0.154656,
 0.163339,
 0.202452,
 0.523321,
 0.494491,
 0.160387]

The statistics on the list of running times for listings_join_reviews_2023 are as follows:
{'avg': 0.3569,
 'exc_count': 20,
 'max': 1.4583,
 'min': 0.1528,
 'std': 0.3834,
 'timestamp': '2024-05-20-16:19:13'}


{'listings_join_reviews_2013': {'avg': 0.0192,
  'min': 0.0117,
  'max': 0.1302,
  'std': 0.0256,
  'exc_count': 20,
  'timestamp': '2024-05-20-16:19:06'},
 'listings_join_reviews_2023': {'avg': 0.3569,
  'min': 0.1528,
  'max': 1.4583,
  'std': 0.3834,
  'exc_count': 20,
  'timestamp': '2024-05-20-16:19:13'}}

In [18]:
new_dict = {}

years = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
        '2020', '2021', '2022', '2023', '2024']
for year in years:
    new_dict['q_listings_join_reviews_'+year] = util.build_query_listings_join_reviews_datetime(year+'-01-01 12:00:00.000', year+'-12-31 12:00:00.000')


In [19]:
# Initialize an empty dictionary to store performance details for all queries
all_perf_details = {}

all_indexes = [['datetime','reviews'], ['id','listings']] 


# Define index combinations to test in the specified order
index_combinations = [
    [],
    [['datetime', 'reviews']],
    [['id', 'listings']],
    [['datetime', 'reviews'], ['id', 'listings']]
]

count = 50

# Iterate through each query and its associated data
for query_name, query in new_dict.items():
    print('Processing query:', query_name)

    # Initialize a dictionary to store performance details for the current query
    query_perf_details = {}

    # Iterate through each index combination in the specified order
    for spec in index_combinations:
        # Get the current index combination name using build_index_description_key function
        spec_name = util.build_index_description_key(all_indexes, spec) if spec else '__'

        print('Processing spec:', spec_name)

        # Drop indexes not in the current spec
        for index in all_indexes:
            if index not in spec:
                mod_index = util.add_drop_index(db_eng, 'drop', index[0], index[1])
                print(f'\nAfter dropping {index}, the indexes are:')
                print(mod_index)

        # Add indexes in the current spec
        for index in spec:
            mod_index = util.add_drop_index(db_eng, 'add', index[0], index[1])
            print(f'\nAfter adding {index}, the indexes are:')
            print(mod_index)

        # Calculate performance metrics for the current query and spec
        perf_details = util.calc_time_diff_per_year(db_eng, count, {query_name: query})

        # Add the performance details to the query's performance dictionary under the spec_name key
        query_perf_details[spec_name] = perf_details[query_name]

    # Add the query's performance details to the overall performance dictionary
    all_perf_details[query_name] = query_perf_details

# Write all performance data to JSON file
util.write_perf_data(all_perf_details, 'listings_join_reviews.json')


Processing query: q_listings_join_reviews_2009
Processing spec: __

After dropping ['datetime', 'reviews'], the indexes are:
[('houses', 'reviews', 'date_in_reviews', None, 'CREATE INDEX date_in_reviews ON houses.reviews USING btree (date)')]

After dropping ['id', 'listings'], the indexes are:
[]

The list of running times for q_listings_join_reviews_2009 is as follows:
[1.046846,
 0.864968,
 1.139674,
 0.6615,
 0.590785,
 0.21916,
 0.084402,
 0.094895,
 0.080857,
 0.116346,
 0.249974,
 0.134316,
 0.092445,
 0.102037,
 0.081056,
 0.084886,
 0.0939,
 0.160122,
 0.097593,
 0.080312,
 0.080818,
 0.080919,
 0.096449,
 0.084191,
 0.087434,
 0.082427,
 0.094932,
 0.094696,
 0.081171,
 0.089499,
 0.081507,
 0.094882,
 0.147733,
 0.082341,
 0.146845,
 0.180373,
 0.08607,
 0.0811,
 0.09048,
 0.080413,
 0.091302,
 0.082989,
 0.083932,
 0.078949,
 0.081469,
 0.080924,
 0.089144,
 0.083799,
 0.088143,
 0.080051]

The statistics on the list of running times for q_listings_join_reviews_2009 are as 

In [20]:
  q5 = """
SELECT CAST(EXTRACT(YEAR FROM datetime) AS INTEGER) AS year, COUNT(*) AS review_count
FROM reviews
group by year;
"""
with db_eng.connect() as conn:
    result_reviews = conn.execute(sql_text(q5))
    print()
    print(result_reviews.all())



[(2010, 449), (2024, 8710), (2023, 228831), (2016, 48527), (2009, 56), (2011, 1905), (2017, 66146), (2020, 51172), (2021, 109415), (2022, 196136), (2015, 28465), (2012, 3872), (2018, 95137), (2014, 14203), (2019, 126469), (2013, 7317)]
