In [1]:
# Import packages for use
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import time
import psutil
import cudf
from extract_load_functions import extraction, loading

import os  # imported here because it is only needed for the credential override below

# Directory (with trailing separator) passed to the extract/load helpers as the export location
exportLocation = r'/home/jeff/'

# Number of test runs to perform for each benchmarked stage
iterations = 31

# Connect to the MySQL source database.
# NOTE(review): the password literal below is committed in source. Set the MYSQL_PASSWORD
# environment variable to override it; the literal is kept only as a backward-compatible
# fallback and should be rotated and removed.
mydb = mysql.connector.connect(host="127.0.0.1",
                               user="root",
                               passwd=os.environ.get("MYSQL_PASSWORD", "0861137MySQL!"),
                               database="project1")

print("Your current DB connection is with: ",mydb)


# Wall-clock start of the whole ETL run; read again once the load step has finished.
etlTimerStart = time.perf_counter()

### HEAVY EXTRACTION ###
##################################################################################################################
# Inner-join order_header to order_line (on ponum) and order_line to item (on orderedItem),
# selecting every column. The query is handed to extraction() together with the connection,
# the iteration count, the export location and a CSV file name.
# NOTE(review): extraction() lives in extract_load_functions; presumably it runs the query
# `iterations` times, records CPU / RAM / elapsed time to the CSV and returns a pandas
# DataFrame -- confirm against that module.
sql = (
    "select * from order_header "
    "inner join order_line on order_header.ponum = order_line.ponum "
    "inner join item on order_line.orderedItem = item.orderedItem;"
)
heavyExtraction = extraction(mydb, sql, iterations, exportLocation, "heavyExtractionPrfm.csv")


### HEAVY TRANSFORMATION ###
##################################################################################################################
# Perform the heavy transformation workload on the extracted frame (heavyExtraction):
#   * every "Blue" in item_desc is changed to "Navy"
#   * every "Tee" in item_desc is changed to "CottonTee"
#   * a computed column, extended_price, is created as eaches_qty * selling_price
# The workload is run `iterations` times; each run is timed and CPU / RAM utilization is
# sampled right after it.

column_names = ["CPU_utilization", "RAM_utilization", "elapsed_time"]

# One dict per iteration. The frame is built once after the loop because DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
heavyPrfmRows = []

# Keeps heavyTrans defined for the load step even when iterations is 0. The loop below
# rebinds it to a copy before mutating anything, so heavyExtraction is never modified.
heavyTrans = heavyExtraction

print("STARTING TRANSFORMATION...")
print("Base CPU utilization: ", psutil.cpu_percent())
print("Base RAM utilization: ", psutil.virtual_memory().percent)
print("RUNNING...")

# Run `iterations` samples to collect the transformation performance metrics
transTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):
    # Start every sample from a fresh copy of the extract (made outside the timed region).
    # Plain assignment would only alias heavyExtraction, and 'Tee' -> 'CottonTee' is not
    # idempotent ('CottonTee' still contains 'Tee'), so re-running on the same frame would
    # prepend another 'Cotton' on every iteration and time already-transformed data.
    heavyTrans = heavyExtraction.copy()

    # Start timer
    start = time.perf_counter()

    # Working code
    heavyTrans['item_desc'] = heavyTrans['item_desc'].str.replace('Blue', 'Navy', regex=True)
    heavyTrans['item_desc'] = heavyTrans['item_desc'].str.replace('Tee', 'CottonTee', regex=True)
    heavyTrans['extended_price'] = heavyTrans['eaches_qty'] * heavyTrans['selling_price']

    # Stop timer
    stop = time.perf_counter()

    heavyPrfmRows.append({'CPU_utilization': psutil.cpu_percent(),
                          'RAM_utilization': psutil.virtual_memory().percent,
                          'elapsed_time': stop - start})

    # Remove duplicate columns (keeps the first occurrence of each column name)
    heavyTrans = heavyTrans.loc[:, ~heavyTrans.columns.duplicated()]

    # Pause between samples
    time.sleep(5)
transTimerEnd = time.perf_counter()

heavyPrfm = pd.DataFrame(heavyPrfmRows, columns=column_names)

print("Pandas medium load transformation metrics captured.")

# Export pandas performance to csv (same directory that is passed to extraction()/loading())
heavyPrfm.to_csv(exportLocation + 'heavyPrfm.csv', index=False, header=True)

print("Pandas transformation metrics captured.\n")
print("Total TRANSFORMATION time: ", transTimerEnd - transTimerStart, "s")
print("Iterations performed: ", iterations)
print("Average TRANSFORMATION iteration time: ", heavyPrfm.elapsed_time.mean(), "s")
print("Average TRANSFORMATION CPU utilization: ", heavyPrfm.CPU_utilization.mean())
print("Average TRANSFORMATION RAM utilization: ", heavyPrfm.RAM_utilization.mean(), "\n\n")


### HEAVY cuDF TRANSFORMATION ###
##################################################################################################################
# Same workload as above, executed with cuDF. Each timed sample includes the conversion of
# the pandas frame to a cuDF frame as well as the three transformations.

# Separate, untransformed input for the cuDF runs so that heavyTrans keeps the pandas
# result for the load step and both benchmarks start from the same extracted data.
cudfInput = heavyExtraction.copy()
# Assign the converted column: astype() returns a new Series, so calling it without
# assignment (as before) converted nothing.
cudfInput['request_date'] = pd.to_datetime(cudfInput['request_date'])

heavyPrfmCURows = []

print("STARTING cuDF TRANSFORMATION")
print("Base CPU utilization: ", psutil.cpu_percent())
print("Base RAM utilization: ", psutil.virtual_memory().percent)
print("RUNNING...")

# Run `iterations` samples to collect the cuDF transformation performance metrics
transCuTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):

    # Start timer
    start = time.perf_counter()
    # Transform pandas df to cuDF (deliberately inside the timed region)
    heavyTransCU = cudf.DataFrame.from_pandas(cudfInput)

    # Working code
    heavyTransCU['item_desc'] = heavyTransCU['item_desc'].str.replace('Blue', 'Navy', regex=True)
    heavyTransCU['item_desc'] = heavyTransCU['item_desc'].str.replace('Tee', 'CottonTee', regex=True)
    heavyTransCU['extended_price'] = heavyTransCU['eaches_qty'] * heavyTransCU['selling_price']

    # Stop timer
    stop = time.perf_counter()

    heavyPrfmCURows.append({'CPU_utilization': psutil.cpu_percent(),
                            'RAM_utilization': psutil.virtual_memory().percent,
                            'elapsed_time': stop - start})

    # Pause between samples
    time.sleep(5)
transCuTimerEnd = time.perf_counter()

heavyPrfmCU = pd.DataFrame(heavyPrfmCURows, columns=column_names)

print("cuDF medium load transformation metrics captured.")

# Export cuDF performance to csv
heavyPrfmCU.to_csv(exportLocation + 'heavyPrfmCU.csv', index=False, header=True)

print("cuDF performance metrics captured loading complete.\n")
print("Total cuDF TRANSFORMATION time: ", transCuTimerEnd - transCuTimerStart, "s")
print("Iterations performed: ", iterations)
print("Average cuDF TRANSFORMATION iteration time: ", heavyPrfmCU.elapsed_time.mean(), "s")
print("Average cuDF TRANSFORMATION CPU utilization: ", heavyPrfmCU.CPU_utilization.mean())
print("Average cuDF TRANSFORMATION RAM utilization: ", heavyPrfmCU.RAM_utilization.mean(),"\n\n")


### HEAVY LOADING ###
##################################################################################################################
# Hand the pandas-transformed, de-duplicated frame to loading().
# NOTE(review): loading() lives in extract_load_functions; presumably the second argument
# is the target table name -- confirm against that module.
loading(heavyTrans,'heavyTransformation',iterations, exportLocation, "heavyLoadPrfm.csv")

etlTimerEnd = time.perf_counter()
print('ETL is complete')
print('Elapsed ETL time is: ', (etlTimerEnd-etlTimerStart)/60, ' minutes')

Your current DB connection is with:  <mysql.connector.connection_cext.CMySQLConnection object at 0x7f5e8c0db310>

STARTING EXTRACTION...
Base EXTRACTION CPU utilization:  6.5
Base EXTRACTION RAM utilization:  16.4
RUNNING...
Data frame loading complete.

Total EXTRACTION time:  96.5263144030032 s
Iterations performed:  31
Average EXTRACTION iteration time:  1.1111068077096264 s
Average CPU utilization:  3.980645161290323
Average RAM utilization:  17.119354838709686 


STARTING TRANSFORMATION...
Base CPU utilization:  1.8
Base RAM utilization:  17.1
RUNNING...
Pandas medium load transformation metrics captured.
Pandas transformation metrics captured.

Total TRANSFORMATION time:  164.050300253999 s
Iterations performed:  31
Average TRANSFORMATION iteration time:  0.2676652531287888 s
Average TRANSFORMATION CPU utilization:  2.138709677419355
Average TRANSFORMATION RAM utilization:  17.10000000000001 


STARTING cuDF TRANSFORMATION
Base CPU utilization:  2.2
Base RAM utilization:  17.1
RU



cuDF medium load transformation metrics captured.
cuDF performance metrics captured loading complete.

Total cuDF TRANSFORMATION time:  162.60455250600353 s
Iterations performed:  31
Average cuDF TRANSFORMATION iteration time:  0.24003174706458888 s
Average cuDF TRANSFORMATION CPU utilization:  2.109677419354839
Average cuDF TRANSFORMATION RAM utilization:  17.929032258064506 


STARTING LOAD...
Load database connection is:  Engine(mysql+mysqlconnector://root:***@127.0.0.1/datalake)
Base CPU utilization:  1.8
Base RAM utilization:  17.9
RUNNING...
Loading complete.
Loading metrics captured loading complete.

Total LOADING time:  277.7522086370009 s
Iterations performed:  31
Average LOADING iteration time:  6.9571366554512 s
Average LOADING CPU utilization:  5.538709677419355
Average LOADING RAM utilization:  31.761290322580646 


ETL is complete
Elapsed ETL time is:  11.684310402583288  minutes
