In [1]:
# Import necessary packages
import pandas as pd
from pymongo import MongoClient
import time
import psutil
import cudf

In [3]:
# Open a client against the local MongoDB server, then select the
# "ds7330" database and its "item" collection (dictionary-style access).
myclient = MongoClient("mongodb://localhost:27017/")
db = myclient["ds7330"]
item = db["item"]

In [22]:
# LIGHT EXTRACTION #
### Pull the item collection into a pandas DataFrame and capture CPU, RAM and
### elapsed time for the operation.

### Number of timed extraction runs. The loop currently performs a single run;
### raise this (e.g. to 30, as the transformation benchmarks use) to collect a
### full sample of performance metrics.
N_EXTRACTION_SAMPLES = 1

### light extraction performance metrics
column_names = ["CPU_utilization", "RAM_utilization", "elapsed_time"]

### Target dtypes for the extracted columns (loop-invariant, so built once)
convert_dict = {'_id': str,
                'orderedItem': str,
                'item_desc': str,
                'selling_price': float
               }

### base metrics
# NOTE: psutil.cpu_percent() with no interval reports utilization since its
# previous call, so the very first call in a fresh kernel is not meaningful.
print("\nBase CPU utilization: ", psutil.cpu_percent())
print("\nBase RAM utilization: ", psutil.virtual_memory().percent)

### Collect one metrics record per run; the DataFrame is built once afterwards
### instead of being grown row by row with DataFrame.append (removed in pandas 2.0).
extraction_records = []
for sampleNo in range(N_EXTRACTION_SAMPLES):

    # Progress output happens before the timer starts so that console I/O
    # is not counted as part of the extraction time.
    print("Working on sample number: ", sampleNo+1,"\n")

    #Initiate timer for query
    start = time.perf_counter()

    #Load the full collection into a DataFrame and cast the column dtypes
    lightExtraction = pd.DataFrame(list(item.find()))
    lightExtraction = lightExtraction.astype(convert_dict)

    #Stop timer
    stop = time.perf_counter()

    #Record performance metrics for this run
    extraction_records.append({'CPU_utilization': psutil.cpu_percent(),
                               'RAM_utilization': psutil.virtual_memory().percent,
                               'elapsed_time': stop - start})

    # Pause between runs
    time.sleep(5)

lightExtractionPrfm = pd.DataFrame(extraction_records, columns = column_names)

print("Data frame loading complete.")


Base CPU utilization:  0.7

Base RAM utilization:  14.8
Working on sample number:  1 

Data frame loading complete.


In [None]:

# Show the dtype of every column in the extracted frame.
print(lightExtraction.dtypes) 

In [6]:
# Write the light-extraction performance metrics to a local CSV file
# (header row included, DataFrame index omitted).
lightExtractionPrfm.to_csv(
    r'/home/jeff/lightExtractionPrfm_Mongo.csv',
    header=True,
    index=False,
)

In [41]:
#### LIGHT TRANSFORMATION ####
###################################################################################################################
# Perform the light transformation workload with pandas. In this case, replace
# "Blue" with "Navy" in every item description, and record CPU, RAM and elapsed
# time for each run.

# Number of timed transformation runs
N_TRANSFORM_SAMPLES = 30

# Row label removed before the replacement.
# NOTE(review): presumably the trailing empty record of the extraction
# (it prints as None in the cuDF tail output) - confirm against the data.
JUNK_ROW_LABEL = 406

column_names = ["CPU_utilization", "RAM_utilization", "elapsed_time"]

# NOTE: psutil.cpu_percent() with no interval reports utilization since its
# previous call.
print("\nBase CPU utilization: ", psutil.cpu_percent())
print("\nBase RAM utilization: ", psutil.virtual_memory().percent)

# Collect one metrics record per run; the DataFrame is built once afterwards
# instead of being grown row by row with DataFrame.append (removed in pandas 2.0).
transform_records = []
for sampleNoTransform in range(N_TRANSFORM_SAMPLES):
    # Start every run from the extracted frame (done before the timer starts,
    # so it is not part of the measurement).
    lightTrans = pd.DataFrame(lightExtraction)

    # Progress output stays outside the timed region so console I/O does not
    # inflate the measured transformation time.
    print("Working on sample number: ", sampleNoTransform+1,"\n")

    # Start timer
    start = time.perf_counter()

    #Working Code: drop the first column, drop the junk row, then replace
    lightTrans.drop(lightTrans.columns[[0]], axis = 1, inplace = True)
    lightTrans.drop([JUNK_ROW_LABEL], axis = 0, inplace = True)
    lightTrans.item_desc = lightTrans.item_desc.str.replace('Blue', 'Navy', regex=True)

    #Stop timer
    stop = time.perf_counter()

    transform_records.append({'CPU_utilization': psutil.cpu_percent(),
                              'RAM_utilization': psutil.virtual_memory().percent,
                              'elapsed_time': stop - start})

    # Pause between runs
    time.sleep(5)

lightTransPrfm = pd.DataFrame(transform_records, columns = column_names)
print("Pandas transformation metrics captured.")


Base CPU utilization:  1.0

Base RAM utilization:  14.2
Working on sample number:  1 

Working on sample number:  2 

Working on sample number:  3 

Working on sample number:  4 

Working on sample number:  5 

Working on sample number:  6 

Working on sample number:  7 

Working on sample number:  8 

Working on sample number:  9 

Working on sample number:  10 

Working on sample number:  11 

Working on sample number:  12 

Working on sample number:  13 

Working on sample number:  14 

Working on sample number:  15 

Working on sample number:  16 

Working on sample number:  17 

Working on sample number:  18 

Working on sample number:  19 

Working on sample number:  20 

Working on sample number:  21 

Working on sample number:  22 

Working on sample number:  23 

Working on sample number:  24 

Working on sample number:  25 

Working on sample number:  26 

Working on sample number:  27 

Working on sample number:  28 

Working on sample number:  29 

Working on sample number

In [9]:
# Display the frame assigned by the light-transformation cell.
lightTrans

Unnamed: 0,orderedItem,item_desc,selling_price
0,023401P0,RedDressP0,87.87
1,023401P2,RedDressP2,87.87
2,023401P4,RedDressP4,87.87
3,023401P6,RedDressP6,87.87
4,023401P8,RedDressP8,87.87
...,...,...,...
401,024610P2,TanCottonTeeP2,16.22
402,024610P4,TanCottonTeeP4,16.22
403,024610P6,TanCottonTeeP6,16.22
404,024610P8,TanCottonTeeP8,16.22


In [42]:
# Write the pandas light-transformation performance metrics to a local CSV
# file (header row included, DataFrame index omitted).
lightTransPrfm.to_csv(
    r'/home/jeff/lightTransPrfm_mongodb.csv',
    header=True,
    index=False,
)

In [45]:
#### cuDF TRANSFORMATION ####
###################################################################################################################
# Perform the same light transformation workload with cuDF: replace "Blue" with
# "Navy" in every item description, and record CPU, RAM and elapsed time for
# each run.

# Number of timed transformation runs
N_CUDF_SAMPLES = 30

# Only rows [0, CUDF_ROW_LIMIT) are kept before the replacement.
# NOTE(review): presumably this excludes the trailing empty record at
# position 406 (it prints as None in the cuDF tail output) - confirm.
CUDF_ROW_LIMIT = 406

lightTrans2 = lightExtraction
column_names = ["CPU_utilization", "RAM_utilization", "elapsed_time"]

# NOTE: psutil.cpu_percent() with no interval reports utilization since its
# previous call.
print("\nBase CPU utilization: ", psutil.cpu_percent())
print("\nBase RAM utilization: ", psutil.virtual_memory().percent)

# Collect one metrics record per run in a list dedicated to the cuDF benchmark.
# The previous version re-assigned lightTransPrfmCU from the *pandas* metrics
# frame (lightTransPrfm) on every iteration, so the result contained the pandas
# samples plus only the final cuDF sample.
cudf_records = []
for sampleNoTransform in range(N_CUDF_SAMPLES):
    ## Transform pandas df to cuDF (outside the timed region)
    lightTransCU = cudf.DataFrame.from_pandas(lightTrans2)

    # Progress output stays outside the timed region so console I/O does not
    # inflate the measured transformation time.
    print("Working on sample number: ", sampleNoTransform+1,"\n")

    # Start timer
    start = time.perf_counter()

    #Working Code: drop the _id column, trim the rows, then replace
    # NOTE(review): drop_column belongs to older cuDF releases; newer ones
    # use drop(columns=...) - confirm against the installed version.
    lightTransCU.drop_column("_id")
    lightTransCU = lightTransCU[0:CUDF_ROW_LIMIT]
    lightTransCU.item_desc = lightTransCU.item_desc.str.replace('Blue', 'Navy', regex=True)

    #Stop timer
    stop = time.perf_counter()

    cudf_records.append({'CPU_utilization': psutil.cpu_percent(),
                         'RAM_utilization': psutil.virtual_memory().percent,
                         'elapsed_time': stop - start})

    # Pause between runs
    time.sleep(5)

lightTransPrfmCU = pd.DataFrame(cudf_records, columns = column_names)
print("cuDF performance metrics captured loading complete.")


Base CPU utilization:  0.7

Base RAM utilization:  14.3
Working on sample number:  1 

Working on sample number:  2 

Working on sample number:  3 

Working on sample number:  4 

Working on sample number:  5 

Working on sample number:  6 

Working on sample number:  7 

Working on sample number:  8 

Working on sample number:  9 

Working on sample number:  10 

Working on sample number:  11 

Working on sample number:  12 

Working on sample number:  13 

Working on sample number:  14 

Working on sample number:  15 

Working on sample number:  16 

Working on sample number:  17 

Working on sample number:  18 

Working on sample number:  19 

Working on sample number:  20 

Working on sample number:  21 

Working on sample number:  22 

Working on sample number:  23 

Working on sample number:  24 

Working on sample number:  25 

Working on sample number:  26 

Working on sample number:  27 

Working on sample number:  28 

Working on sample number:  29 

Working on sample number

In [46]:
# Write the cuDF light-transformation performance metrics to a local CSV
# file (header row included, DataFrame index omitted).
lightTransPrfmCU.to_csv(
    r'/home/jeff/lightTransPrfmCU_mongodb.csv',
    header=True,
    index=False,
)

In [38]:
# Print the last ten rows of the cuDF frame.
print(lightTransCU[-10:])

    orderedItem           item_desc  selling_price
397     024609P6   OrangeCottonTeeP6          16.22
398     024609P8   OrangeCottonTeeP8          16.22
399    024609P10  OrangeCottonTeeP10          16.22
400     024610P0      TanCottonTeeP0          16.22
401     024610P2      TanCottonTeeP2          16.22
402     024610P4      TanCottonTeeP4          16.22
403     024610P6      TanCottonTeeP6          16.22
404     024610P8      TanCottonTeeP8          16.22
405    024610P10     TanCottonTeeP10          16.22
406                             None               
