In [None]:
# Alla Topp
# Multi-processing with PostgreSQL 
# Applying multi-processing to each query by simulating 1, 10, 20, 50 and 100 users as processes,
# to compare the virtual machines' performance across specifications such as RAM, CPU and hard drive

In [1]:
#!pip install psycopg2

In [2]:
#!pip install pygresql

In [1]:
import psycopg2
import sqlalchemy
import matplotlib as plt
import concurrent.futures
import multiprocessing

In [2]:
# CONNECTING TO DATABASE yelpdb in postgreSQL
#conn = psycopg2.connect(host = "192.168.200.194", port = "5432", user = "postgres", database = "yelpdb")

In [3]:
def doCountQuery(query):
    """Run one SQL statement against the ``yelpdb`` database and discard the rows.

    A fresh connection is opened on every call and closed before the call
    returns. The complete result set is fetched to the client, but it is not
    returned or stored.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``.

    Returns:
        None.

    Raises:
        Any exception raised by ``psycopg2.connect``, ``cursor.execute`` or
        ``cursor.fetchall`` propagates to the caller. Once a connection has
        been opened it is closed before the exception leaves this function.
    """
    conn = psycopg2.connect(host = "192.168.200.194", port = "5432", user = "postgres",
                            database = "yelpdb")
    try:
        cur = conn.cursor()
        cur.execute(query)
        # Fetch every row so the call does the same client-side work as before;
        # the rows themselves are not needed.
        cur.fetchall()
    finally:
        # Close even when the query fails, so a failed run does not leave
        # a server connection open.
        conn.close()

In [44]:
#doCountQuery("""SELECT COUNT(*) FROM review;""")

In [4]:
# define function multi-processing 

def mLProcess(queries, processes=None):
    """Run every statement in ``queries`` through ``doCountQuery`` on a process pool.

    Blocks until all statements have been processed.

    Args:
        queries: Iterable of SQL strings; each element is passed to
            ``doCountQuery`` in a worker process.
        processes: Number of worker processes in the pool. ``None`` (the
            default) lets ``multiprocessing.Pool`` pick its own size, which
            is based on the machine's CPU count rather than on
            ``len(queries)``. With the default, a list of N statements is
            therefore not necessarily executed by N simultaneous processes;
            pass ``processes=len(queries)`` to give each statement its own
            worker.

    Returns:
        None.
    """
    with multiprocessing.Pool(processes=processes) as pool:
        pool.map(doCountQuery, queries)

In [61]:
# Q1. Count the number of rows in the largest table, "review".
# The list holds one copy of the statement per simulated user; change the
# repeat count to the amount of users (1, 10, 20, 50, 100).
queries = ["""SELECT COUNT(*) FROM review;""" for _ in range(20)]

In [62]:
%%time
mLProcess(queries)

# 8GB RAM VM
# Running each amount of processes 7 times to calculate the average result 
#1. 2.04s, 1.93s, 1.83s, 1.73s, 1.83s, 1.83s, 1.84s 
#10. 11.4s, 11s, 11.1, 11.2s, 11.3s, 11.2, 11.3s
#20. 54.2s, 22.4s, 22.2s, 22.2s, 21.9s, 22.1s, 22.1s
#50. 51.1s, 56.1s, 55.3s, 55.8s, 55.3, 55.9s, 55.5s 
#100. 1min 50s, 1min 50s, 1min 51s, 1min 52s, 1min 51s, 1min 53 s, 1min 53s

CPU times: user 109 ms, sys: 110 ms, total: 219 ms
Wall time: 55 s


In [63]:
# Q2. Select all distinct names in the user table.
# One copy of the statement per simulated user.
distinct = ["""SELECT DISTINCT(user_name) FROM y_user;""" for _ in range(20)]

In [70]:
%%time
mLProcess(distinct)

# of Users:
#1. 1.64s, 1.63s, 1.73s, 1.73s, 1.73s, 1.64s, 1.63s
#10. 11.3s, 4.04s, 4.24s, 3.84s, 3.84s, 3.84s, 3.85s
#20. 7.37s, 7.37s, 7.07s, 6.97s, 7.26s, 7.38s, 6.96s
#50. 18.3s, 18.7s, 18.5s, 18.6s, 17.9s, 17.3s, 18.3s
#100. 37.1s, 37.5s, 36.7s, 36.6s, 36.5s, 36.9s, 37.1s

CPU times: user 44.7 ms, sys: 90.1 ms, total: 135 ms
Wall time: 7.37 s


In [71]:
# Q3. Count the occurrences of each name in the user table.
# One copy of the statement per simulated user.
occurrence = ["""SELECT user_name, COUNT(*) FROM y_user GROUP BY user_name;""" for _ in range(20)]

In [78]:
%%time
mLProcess(occurrence)
 
#1. 1.23s, 1.23s, 1.25s, 1.23s, 1.23s, 1.33s, 1.23s
#10. 5.16s, 5.04s, 5.05s, 5.15s, 4.95s, 5.25 s, 5.25s 
#20. 11.1s, 9.19s, 8.98s, 9.07s, 8.98s, 9.37s, 9.08s
#50. 23.1s, 22.7s, 22.8s, 23.2s, 23s, 22.2s, 22.9s
#100. 48.8s, 45.2s, 45.1, 45.1s, 45.3, 45.3s, 45s

CPU times: user 46.2 ms, sys: 92.4 ms, total: 139 ms
Wall time: 10.1 s


In [22]:
# Q4. Count the number of reviews per user name.
# Rows are grouped by y_user.user_name, so users that share a name are counted
# together. COUNT(review.text) only counts rows whose text is not NULL.
# LIMIT 100 is applied without an ORDER BY, so which 100 names are returned is
# left to the server.
# The trailing "* 1" is the simulated-user multiplier (one user here).
num_reviews = ["""SELECT y_user.user_name, COUNT(review.text) 
                AS Number FROM y_user INNER JOIN review ON 
                y_user.user_id = review.user_id GROUP BY y_user.user_name LIMIT 100;"""] * 1

In [23]:
%%time
mLProcess(num_reviews)

# 8GB RAM VM
#1. 4min 9s, 3min 52s, 3min 32s, 4min 41s, 3min 50s, 4min 32s, 4min 44s.
#3. 24min 47s, 29min 35s, 28min 25s, 28min 20s, 27min 59s, 29min 53s, 27min 59s
#5. 1hr 20s, 1hr 1min 49s, 1h 49s, 52min 56s, 1hr 2min 10s, 1hr 37s, 1h 3s

CPU times: user 620 ms, sys: 359 ms, total: 979 ms
Wall time: 8min


In [24]:
# Q5. Multiple joins: chains FULL OUTER JOINs across y_user, review, tip and business.
# review and tip are matched on user_id alone, so every review row of a user is
# paired with every tip row of that same user.
# The list holds a single statement (no user multiplier is applied here).
mul_joins = ["""SELECT y_user.user_name, review.stars, tip.text, business.name
               FROM y_user FULL OUTER JOIN review ON y_user.user_id = review.user_id
               FULL OUTER JOIN tip ON review.user_id = tip.user_id
               FULL OUTER JOIN business ON tip.business_id = business.business_id;"""]

In [25]:
%%time
mLProcess(mul_joins)

#8GB RAM
#1. 40min 12s, 37 min 40s, 37min 11s, 38min 50s, 36 min 28s, 37min 16s, 36min 47s

CPU times: user 3.51 s, sys: 1.94 s, total: 5.45 s
Wall time: 48min 43s


In [None]:
#References:
#https://www.postgresqltutorial.com/postgresql-python/query/
#https://gist.github.com/mangecoeur/9540178