In [1]:
# Import packages for use
import os
import time

import cudf
import mysql.connector
import pandas as pd
import psutil
from sqlalchemy import create_engine

from extract_load_functions import extraction, loading

# Directory that receives the exported performance CSV files
exportLocation = r'/home/jeff/'

# Number of benchmark runs to perform for each ETL stage
iterations = 31

# Connect to the MySQL database.
# The password is read from the environment so that no credential is stored in
# the source file; host, user and database can be overridden the same way and
# otherwise keep their previous values.
_mysqlPassword = os.environ.get("MYSQL_PASSWORD")
if _mysqlPassword is None:
    raise RuntimeError(
        "Set the MYSQL_PASSWORD environment variable before running this script."
    )

mydb = mysql.connector.connect(host=os.environ.get("MYSQL_HOST", "127.0.0.1"),
                               user=os.environ.get("MYSQL_USER", "root"),
                               passwd=_mysqlPassword,
                               database=os.environ.get("MYSQL_DATABASE", "project1"))

print("Your current DB connection is with: ",mydb)


# Start the clock for the whole ETL run; it is read again after the loading stage.
etlTimerStart = time.perf_counter()

### MEDIUM EXTRACTION ###
##################################################################################################################
# Query that inner-joins order_line to item on their orderedItem columns.
sql = "select * from order_line INNER JOIN item ON item.orderedItem =order_line.orderedItem"

# Hand the connection, query, run count and CSV destination to the extraction
# helper and keep its result for the transformation stages.
# NOTE(review): the helper lives in extract_load_functions; presumably it
# returns the joined rows as a pandas DataFrame and writes its own performance
# metrics to the named CSV -- confirm against that module.
mediumExtraction = extraction(
    mydb,
    sql,
    iterations,
    exportLocation,
    "mediumExtractionPrfm.csv",
)


### MEDIUM TRANSFORMATION ###
##################################################################################################################
# Perform a light transformation workload with pandas: replace "Blue" with
# "Navy" in the item descriptions, derive an extended price column, and drop
# duplicated column names. CPU, RAM and elapsed time are recorded per iteration.

column_names = ["base CPU","base RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]

print("STARTING TRANSFORMATION...")
print("RUNNING...")

# One metrics record per iteration. The records are turned into a DataFrame
# once after the loop: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
medTransRows = []

# Run `iterations` passes to collect the transformation performance metrics
transTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):
    # Work on a private copy so every iteration starts from the untouched
    # extraction result. A plain assignment would alias mediumExtraction, and
    # the first pass would then rewrite the data that later passes measure.
    # The copy is made before the baseline readings, so it is not timed.
    medTrans = mediumExtraction.copy()

    # Let the machine settle before sampling the baseline
    time.sleep(5)
    # Collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent

    # Start timer
    start = time.perf_counter()

    # Working code
    medTrans.item_desc = medTrans.item_desc.str.replace('Blue', 'Navy', regex=True)
    medTrans['extended_price'] = medTrans.selling_price * medTrans.eaches_qty
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent

    # Remove duplicate columns (keeps the first occurrence of each column name)
    medTrans = medTrans.loc[:, ~medTrans.columns.duplicated()]

    # Stop timer
    stop = time.perf_counter()

    medTransRows.append({'base CPU': baseCPU,
                         'base RAM': baseRAM,
                         'CPU_utilization': sampleCPU,
                         'RAM_utilization': sampleRAM,
                         'CPU_d': sampleCPU - baseCPU,
                         'RAM_d': sampleRAM - baseRAM,
                         'elapsed_time': stop - start})
transTimerEnd = time.perf_counter()

# Passing the column list keeps the expected layout even when iterations is 0
medTransPrfm = pd.DataFrame(medTransRows, columns=column_names)

print("Pandas medium load transformation metrics captured.")

# Export the metrics to CSV in the configured export directory
medTransPrfm.to_csv(os.path.join(exportLocation, "medTransPrfm.csv"), index=False, header=True)

print("Pandas transformation metrics captured.\n")
print("Total TRANSFORMATION time: ", transTimerEnd - transTimerStart, "s")
print("Iterations performed: ", iterations)
print("Average TRANSFORMATION iteration time: ", medTransPrfm.elapsed_time.mean(), "s")
print("Average TRANSFORMATION CPU utilization: ", medTransPrfm.CPU_utilization.mean())
print("Average TRANSFORMATION RAM utilization: ", medTransPrfm.RAM_utilization.mean(), "\n\n")


### MEDIUM cuDF TRANSFORMATION ###
##################################################################################################################
# Perform the same light transformation workload with cuDF: replace "Blue" with
# "Navy" in the item descriptions and derive an extended price column. CPU, RAM
# and elapsed time are recorded per iteration.
# NOTE(review): medTrans is the output of the pandas stage, where "Blue" has
# already been replaced, so this run may not perform the same amount of work as
# the pandas run -- confirm whether it should start from untransformed data.
column_names = ["base CPU","base RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]

print("STARTING cuDF TRANSFORMATION")
print("RUNNING...")

# One metrics record per iteration. The records are turned into a DataFrame
# once after the loop: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
medTransCURows = []

# Run `iterations` passes to collect the transformation performance metrics
transCuTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):
    # Convert the pandas frame to cuDF. This is done before the timer starts,
    # so the conversion is not part of the measured time.
    medTransCU = cudf.DataFrame.from_pandas(medTrans)

    # Let the machine settle before sampling the baseline
    time.sleep(5)
    # Collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent

    # Start timer
    start = time.perf_counter()

    # Working code
    medTransCU.item_desc = medTransCU.item_desc.str.replace('Blue', 'Navy', regex=True)
    medTransCU['extended_price'] = medTransCU.selling_price * medTransCU.eaches_qty
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent

    # Stop timer
    stop = time.perf_counter()

    medTransCURows.append({'base CPU': baseCPU,
                           'base RAM': baseRAM,
                           'CPU_utilization': sampleCPU,
                           'RAM_utilization': sampleRAM,
                           'CPU_d': sampleCPU - baseCPU,
                           'RAM_d': sampleRAM - baseRAM,
                           'elapsed_time': stop - start})
transCuTimerEnd = time.perf_counter()

# Passing the column list keeps the expected layout even when iterations is 0
medTransPrfmCU = pd.DataFrame(medTransCURows, columns=column_names)

print("cuDF medium load transformation metrics captured.")

# Export the cuDF metrics to CSV in the configured export directory
medTransPrfmCU.to_csv(os.path.join(exportLocation, "medTransPrfmCU.csv"), index=False, header=True)

print("cuDF performance metrics captured loading complete.\n")
print("Total cuDF TRANSFORMATION time: ", transCuTimerEnd - transCuTimerStart, "s")
print("Iterations performed: ", iterations)
print("Average cuDF TRANSFORMATION iteration time: ", medTransPrfmCU.elapsed_time.mean(), "s")
print("Average cuDF TRANSFORMATION CPU utilization: ", medTransPrfmCU.CPU_utilization.mean())
print("Average cuDF TRANSFORMATION RAM utilization: ", medTransPrfmCU.RAM_utilization.mean(),"\n\n")

### MEDIUM LOADING ###
###################################################################################################################
# Pass the transformed frame to the loading helper, then report the total time
# elapsed since etlTimerStart.
# NOTE(review): the helper lives in extract_load_functions; presumably it
# writes the frame to the 'mediumTransformation' table and records its own
# metrics in the named CSV -- confirm against that module.
try:
    loading(medTrans,'mediumTransformation',iterations, exportLocation, "mediumLoadPrfm.csv")

    etlTimerEnd = time.perf_counter()
    print('ETL is complete')
    print('Elapsed ETL time is: ', (etlTimerEnd-etlTimerStart)/60, ' minutes')
finally:
    # Release the MySQL connection opened at the start of the run; it is not
    # passed to loading() and nothing after this point uses it.
    mydb.close()

Your current DB connection is with:  <mysql.connector.connection_cext.CMySQLConnection object at 0x7f7f1d96c610>

STARTING EXTRACTION...
Base EXTRACTION CPU utilization:  6.1
Base EXTRACTION RAM utilization:  20.0
RUNNING...
Data frame loading complete.

Total EXTRACTION time:  178.49906674199883 s
Iterations performed:  31
Average EXTRACTION iteration time:  0.7526186312902756 s
Average CPU utilization:  8.003225806451614
Average RAM utilization:  19.954838709677414 


STARTING TRANSFORMATION...
Base CPU utilization:  8.3
Base RAM utilization:  20.1
RUNNING...
Pandas medium load transformation metrics captured.
Pandas transformation metrics captured.

Total TRANSFORMATION time:  159.51823179200437 s
Iterations performed:  31
Average TRANSFORMATION iteration time:  0.13462985841972347 s
Average TRANSFORMATION CPU utilization:  6.358064516129033
Average TRANSFORMATION RAM utilization:  20.19677419354839 


STARTING cuDF TRANSFORMATION
Base CPU utilization:  6.2
Base RAM utilization:  20