In [4]:
# Import packages for use
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import time
import psutil
import cudf
from extract_load_functions import extraction, loading, performance

#set export location
exportLocation = r'/home/jeff/'

#set number of test runs to prefrom
iterations = 31

# Connect to MySQL database
mydb = mysql.connector.connect(host="127.0.0.1",
                               user="root",
                               passwd="********",
                               database="project1")

print("Your current DB connection is with: ",mydb)


etlTimerStart = time.perf_counter() 
### HEAVY EXTRACTION ###
##################################################################################################################
# Join the order_line and item tables and capture CPU, RAM and elapsed time to perform operations
# Pull the item table from MySQL

sql = "select * from order_header inner join order_line on order_header.ponum = order_line.ponum inner join item on order_line.orderedItem = item.orderedItem;"
heavyExtraction  = extraction(mydb,sql,iterations,exportLocation,"heavyExtractionPrfm.csv")


### HEAVY TRANSFORMATION ###
##################################################################################################################
# Preform heavy transformation workload.  Item, order_line, and orderHeader will be joined.  All item descriptions 
# with "Blue" in them are changed to "Navy."  Any clothing items with "Tee" will be modified to "CottonTee"
# Additionallay a computer column called extedned_price will be created by multiplying eaches and item_price.

column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]
heavyPrfm = pd.DataFrame(columns = column_names)

print("STARTING TRANSFORMATION...")
print("RUNNING...")

#Run n iterations to collect transformation df of performance metrics
transTimerStart= time.perf_counter() 
for sampleNoTransform in range(iterations):
    heavyTrans = heavyExtraction
    
    time.sleep(5)
    #collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent
    
    # Start Timer and progress tracker
    start = time.perf_counter()

    
    #Working Code
    heavyTrans.item_desc = heavyTrans.item_desc.str.replace('Blue', 'Navy', regex=True)
    heavyTrans.item_desc = heavyTrans.item_desc.str.replace('Tee', 'CottonTee', regex=True)
    heavyTrans['extended_price'] = heavyTrans.eaches_qty * heavyTrans.selling_price
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent
    
    #Stop timer  
    stop = time.perf_counter()
    
    
    heavyPrfm = heavyPrfm.append(pd.DataFrame({'base_CPU': baseCPU,
                                               'base_RAM': baseRAM,
                                               'CPU_utilization': sampleCPU,
                                               'RAM_utilization':  sampleRAM,
                                               'CPU_d': sampleCPU-baseCPU,
                                               'RAM_d': sampleRAM - baseRAM,
                                               'elapsed_time':stop - start},
                                              index=[1]), ignore_index=True)
    
    # Remove duplicate columns
    heavyTrans = heavyTrans.loc[:,~heavyTrans.columns.duplicated()]
transTimerEnd= time.perf_counter() 

#Export pandas performance to csv
heavyPrfm.to_csv (r'/home/jeff/heavyPrfm.csv', index = False, header=True)

print("Pandas transformation metrics captured.\n")
performance(heavyPrfm,iterations,transTimerStart,transTimerEnd)

### HEAVY cuDF TRANSFORMATION ###
##################################################################################################################
# Preform heavy transformation workload.  Item, order_line, and orderHeader will be joined.  All item descriptions 
# with "Blue" in them are changed to "Navy."  Any clothing items with "Tee" will be modified to "CottonTee"
# Additionallay a computer column called extedned_price will be created by multiplying eaches and item_price.

heavyTrans = heavyExtraction
column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]
heavyPrfmCU = pd.DataFrame(columns = column_names)
heavyTrans['request_date'].astype('datetime64')

    
print("STARTING cuDF TRANSFORMATION")
print("RUNNING...")

#Run 30 iterations to collect transformation df of performance metrics
transCuTimerStart = time.perf_counter() 
for sampleNoTransform in range(iterations):
    
    time.sleep(5)
    #collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent
    
    # Start Timer and progress tracker
    start = time.perf_counter()
    ## Transform pandas df to cuDF
    heavyTransCU = cudf.DataFrame.from_pandas(heavyTrans)
    
    #Working Code
    heavyTransCU.item_desc = heavyTransCU.item_desc.str.replace('Blue', 'Navy', regex=True)
    heavyTransCU.item_desc = heavyTransCU.item_desc.str.replace('Tee', 'CottonTee', regex=True)
    heavyTransCU['extended_price'] = heavyTransCU.eaches_qty * heavyTransCU.selling_price
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent
    
    #Stop timer  
    stop = time.perf_counter()
    
    
    heavyPrfmCU = heavyPrfmCU.append(pd.DataFrame({'base_CPU': baseCPU,
                                               'base_RAM': baseRAM,
                                               'CPU_utilization': sampleCPU,
                                               'RAM_utilization':  sampleRAM,
                                               'CPU_d': sampleCPU-baseCPU,
                                               'RAM_d': sampleRAM - baseRAM,
                                               'elapsed_time':stop - start},
                                              index=[1]), ignore_index=True)
transCuTimerEnd= time.perf_counter() 

print("cuDF medium load transformation metrics captured.")

#Export cuDF performance to csv
heavyPrfmCU.to_csv (r'/home/jeff/heavyPrfmCU.csv', index = False, header=True)

print("cuDF performance metrics captured loading complete.\n")
performance(heavyPrfmCU,iterations,transCuTimerStart,transCuTimerEnd)

### HEAVY LOADING ###
##################################################################################################################
loading(heavyTrans,'heavyTransformation',iterations, exportLocation, "heavyLoadPrfm.csv")

etlTimerEnd = time.perf_counter() 
print('ETL is complete')
print('Elapsed ETL time is: ', (etlTimerEnd-etlTimerStart)/60, ' minutes')

Your current DB connection is with:  <mysql.connector.connection_cext.CMySQLConnection object at 0x7ff4af2a2390>

STARTING EXTRACTION...
RUNNING...
Data frame loading complete.

Iterations performed:  31
TOTAL Process time:  191.88926637700933 s
Average iteration time:  1.1850192627100642 s
Average BASE CPU:  2.8612903225806448
Average BASE RAM:  2.8612903225806448
Average CPU Performance  5.958064516129033
Average RAM Performance  -0.003225806451612949
Average CPU utilization:  8.819354838709677
Average RAM utilization:  29.70322580645163 


STARTING TRANSFORMATION...
RUNNING...
Pandas transformation metrics captured.

Iterations performed:  31
TOTAL Process time:  164.5493087950017 s
Average iteration time:  0.28146363019280424 s
Average BASE CPU:  2.761290322580646
Average BASE RAM:  2.761290322580646
Average CPU Performance  4.5451612903225795
Average RAM Performance  0.003225806451612949
Average CPU utilization:  7.306451612903226
Average RAM utilization:  29.712903225806468 


ST