#Extracting Resources

##1. Import Required Libraries

In [1]:
# Import libraries
import pandas as pd
from numpy import mean, std, min, max
import matplotlib.pyplot as plt
from google.colab import files

##2. Read & Pre-Process Data

In [2]:
# Set WMS & workflow name
wms = "Luigi"
workflow = "BSM_Search"
# Load the text file into a pandas DataFrame with no header row
resources_df = pd.read_csv('/content/Luigi-BSM-Search-Resources.txt', delimiter=',', header=None)

In [3]:
# Define column names
COLUMN_NAMES = ["Date", "Core_Numbers", "Core_1_Usage", "Core_2_Usage", "Core_3_Usage", "Core_4_Usage", "CPU_Usage",
                "Total_Memory", "Used_Memory", "Memory_Usage",
                "Total_Disk", "Used_Disk", "Disk_Usage",
                "Sent_Bytes", "Received_Bytes"]

In [4]:
# Set column names
resources_df.columns = COLUMN_NAMES

In [5]:
# Insert duration column
resources_df.insert(1, "Duration (Seconds)", [second for second in range(1, len(resources_df)+1)])

##3. Calculate & Save Resources Percentages

In [6]:
# Extract useful columns
usage_percentages_df = resources_df[["Duration (Seconds)", "CPU_Usage", "Memory_Usage", "Disk_Usage", "Sent_Bytes", "Received_Bytes"]]

In [7]:
# Transforming the columns
usage_percentages_df["Disk_Usage"] = usage_percentages_df["Disk_Usage"] - usage_percentages_df["Disk_Usage"][0]
usage_percentages_df["Sent_Bytes"] = (usage_percentages_df["Sent_Bytes"] - usage_percentages_df["Sent_Bytes"][0]) / (1024**3)
usage_percentages_df["Received_Bytes"] = (usage_percentages_df["Received_Bytes"] - usage_percentages_df["Received_Bytes"][0]) / (1024**3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usage_percentages_df["Disk_Usage"] = usage_percentages_df["Disk_Usage"] - usage_percentages_df["Disk_Usage"][0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usage_percentages_df["Sent_Bytes"] = (usage_percentages_df["Sent_Bytes"] - usage_percentages_df["Sent_Bytes"][0]) / (1024**3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [8]:
# Define a function to calculate the average of the usages
def calculate_average(component, base_usage, target_usage, disk_base_usage, disk_target_usage):
  if component == "BASE":
    AVERAGE_CPU_USAGE = mean(target_usage["CPU_Usage"])
    AVERAGE_MEMORY_USAGE = mean(target_usage["Memory_Usage"])
    AVERAGE_DISK_USAGE = mean(disk_target_usage["Disk_Usage"])
  elif component == "SERVER":
    AVERAGE_CPU_USAGE =  mean(target_usage["CPU_Usage"]) - mean(base_usage["CPU_Usage"])
    AVERAGE_MEMORY_USAGE = mean(target_usage["Memory_Usage"]) - mean(base_usage["Memory_Usage"])
    AVERAGE_DISK_USAGE = mean(disk_target_usage["Disk_Usage"]) - mean(disk_base_usage["Disk_Usage"])
  elif component == "WORKFLOW":
    AVERAGE_CPU_USAGE =  mean(target_usage["CPU_Usage"]) - mean(base_usage["CPU_Usage"][:60])
    AVERAGE_MEMORY_USAGE = mean(target_usage["Memory_Usage"]) - mean(base_usage["Memory_Usage"][:60])
    AVERAGE_DISK_USAGE = mean(disk_target_usage["Disk_Usage"]) - mean(disk_base_usage["Disk_Usage"][:60])
  return AVERAGE_CPU_USAGE, AVERAGE_MEMORY_USAGE, AVERAGE_DISK_USAGE

In [9]:
# Define a function to calculate the STD of the usages
def calculate_std(component, base_usage, target_usage, disk_base_usage, disk_target_usage):
  if component == "BASE":
    STD_CPU_USAGE = std(target_usage["CPU_Usage"])
    STD_MEMORY_USAGE = std(target_usage["Memory_Usage"])
    STD_DISK_USAGE = std(disk_target_usage["Disk_Usage"])
  elif component == "SERVER":
    STD_CPU_USAGE = std(target_usage["CPU_Usage"] - mean(base_usage["CPU_Usage"]))
    STD_MEMORY_USAGE = std(target_usage["Memory_Usage"] - mean(base_usage["Memory_Usage"]))
    STD_DISK_USAGE = std(disk_target_usage["Disk_Usage"] - mean(disk_base_usage["Disk_Usage"]))
  elif component == "WORKFLOW":
    STD_CPU_USAGE = std(target_usage["CPU_Usage"] - mean(base_usage["CPU_Usage"][:60]))
    STD_MEMORY_USAGE = std(target_usage["Memory_Usage"] - mean(base_usage["Memory_Usage"][:60]))
    STD_DISK_USAGE = std(disk_target_usage["Disk_Usage"] - mean(disk_base_usage["Disk_Usage"][:60]))
  return STD_CPU_USAGE, STD_MEMORY_USAGE, STD_DISK_USAGE

In [10]:
# Define a function to calculate the CV of the usages
def calculate_cv(std, mean):
  if mean == 0:
    return 0
  else:
    return (std/mean)*100

In [11]:
# Define a function to return the minimum value for usage
def return_min(target_usage, disk_target_usage):
  return min(target_usage["CPU_Usage"]), min(target_usage["Memory_Usage"]), min(disk_target_usage["Disk_Usage"])

In [12]:
# Define a function to return the maximum value for usage
def return_max(target_usage, disk_target_usage):
  return max(target_usage["CPU_Usage"]), max(target_usage["Memory_Usage"]), max(disk_target_usage["Disk_Usage"])

In [13]:
# Define a function to calculate the usages
def caluculate_usage(component, base_usage, target_usage, disk_base_usage, disk_target_usage):
  results_dictionary = {}

  AVERAGE_CPU_USAGE, AVERAGE_MEMORY_USAGE, AVERAGE_DISK_USAGE = calculate_average(component, base_usage, target_usage, disk_base_usage, disk_target_usage)
  STD_CPU_USAGE, STD_MEMORY_USAGE, STD_DISK_USAGE = calculate_std(component, base_usage, target_usage, disk_base_usage, disk_target_usage)
  CV_CPU_USAGE, CV_MEMORY_USAGE, CV_DISK_USAGE = calculate_cv(STD_CPU_USAGE, AVERAGE_CPU_USAGE), calculate_cv(STD_MEMORY_USAGE, AVERAGE_MEMORY_USAGE), calculate_cv(STD_DISK_USAGE, AVERAGE_DISK_USAGE)
  
  MIN_CPU_USAGE, MIN_MEMORY_USAGE, MIN_DISK_USAGE =  return_min(target_usage, disk_target_usage)
  MAX_CPU_USAGE, MAX_MEMORY_USAGE, MAX_DISK_USAGE =  return_max(target_usage, disk_target_usage)

  results_dictionary["AVERAGE_CPU_USAGE"], results_dictionary["AVERAGE_MEMORY_USAGE"], results_dictionary["AVERAGE_DISK_USAGE"] = AVERAGE_CPU_USAGE, AVERAGE_MEMORY_USAGE, AVERAGE_DISK_USAGE
  results_dictionary["STD_CPU_USAGE"], results_dictionary["STD_MEMORY_USAGE"], results_dictionary["STD_DISK_USAGE"] = STD_CPU_USAGE, STD_MEMORY_USAGE, STD_DISK_USAGE
  results_dictionary["CV_CPU_USAGE"], results_dictionary["CV_MEMORY_USAGE"], results_dictionary["CV_DISK_USAGE"] = CV_CPU_USAGE, CV_MEMORY_USAGE, CV_DISK_USAGE
  results_dictionary["MIN_CPU_USAGE"], results_dictionary["MIN_MEMORY_USAGE"], results_dictionary["MIN_DISK_USAGE"] = MIN_CPU_USAGE, MIN_MEMORY_USAGE, MIN_DISK_USAGE
  results_dictionary["MAX_CPU_USAGE"], results_dictionary["MAX_MEMORY_USAGE"], results_dictionary["MAX_DISK_USAGE"] = MAX_CPU_USAGE, MAX_MEMORY_USAGE, MAX_DISK_USAGE
  results_dictionary["SENT_BYTES"] = disk_target_usage["Sent_Bytes"].values[-1]
  results_dictionary["RECEIVED_BYTES"] = disk_target_usage["Received_Bytes"].values[-1]

  return results_dictionary

In [14]:
# Save the calculation in dictionaries
RESOURCES_USAGE = {}
BASE_USAGE = caluculate_usage("BASE", None, usage_percentages_df[-60:], None, usage_percentages_df[:60])
SERVER_USAGE = caluculate_usage("SERVER", usage_percentages_df[-60:], usage_percentages_df[-120:-60], usage_percentages_df[:60], usage_percentages_df[60:120])
WORKFLOW_USAGE = caluculate_usage("WORKFLOW", usage_percentages_df[-60:], usage_percentages_df[120:-120], usage_percentages_df[:60], usage_percentages_df[120:-120])

In [15]:
TOTAL_USAGE = {
               "AVERAGE_CPU_USAGE":0, "AVERAGE_MEMORY_USAGE":0, "AVERAGE_DISK_USAGE":0,
               "STD_CPU_USAGE":0, "STD_MEMORY_USAGE":0, "STD_DISK_USAGE":0,
               "CV_CPU_USAGE":0, "CV_MEMORY_USAGE":0, "CV_DISK_USAGE":0,
               "MIN_CPU_USAGE":min(usage_percentages_df["CPU_Usage"]), "MIN_MEMORY_USAGE":min(usage_percentages_df["Memory_Usage"]), "MIN_DISK_USAGE":min(usage_percentages_df["Disk_Usage"]),
               "MAX_CPU_USAGE":max(usage_percentages_df["CPU_Usage"]), "MAX_MEMORY_USAGE":max(usage_percentages_df["Memory_Usage"]), "MAX_DISK_USAGE":max(usage_percentages_df["Disk_Usage"]),
               "SENT_BYTES": 0 , "RECEIVED_BYTES": 0
}

In [16]:
# Save the base, server & workflow usages to the resources usage dictionary
RESOURCES_USAGE = {"BASE_USAGE":BASE_USAGE, "SERVER_USAGE":SERVER_USAGE, "WORKFLOW_USAGE":WORKFLOW_USAGE}

In [17]:
# Calculate the total usage
for component in RESOURCES_USAGE.keys():
  for RESOURCES_TYPE in RESOURCES_USAGE[component].keys():
    if "MIN" in RESOURCES_TYPE or "MAX" in RESOURCES_TYPE:
      continue
    else:
      TOTAL_USAGE[RESOURCES_TYPE] += RESOURCES_USAGE[component][RESOURCES_TYPE]

In [18]:
# Save the total usage to the resources usage dictionary
RESOURCES_USAGE = {"TOTAL_USAGE":TOTAL_USAGE, "BASE_USAGE":BASE_USAGE, "SERVER_USAGE":SERVER_USAGE, "WORKFLOW_USAGE":WORKFLOW_USAGE}

In [19]:
# Create DataFrames of the results
RESULTS_DF = pd.DataFrame(RESOURCES_USAGE)
T_RESULTS_DF = RESULTS_DF.T
AVERAGE_DF = T_RESULTS_DF.iloc[:, [i for i in range (0,3)]]
STD_DF = T_RESULTS_DF.iloc[:, [i for i in range (3,6)]]
CV_DF = T_RESULTS_DF.iloc[:, [i for i in range (6,9)]]
MIN_DF = T_RESULTS_DF.iloc[:, [i for i in range (9,12)]]
MAX_DF = T_RESULTS_DF.iloc[:, [i for i in range (12,15)]]
CPU_DF = T_RESULTS_DF.iloc[:, [i for i in range (0,T_RESULTS_DF.shape[1]-2,3)]]
MEMORY_DF = T_RESULTS_DF.iloc[:, [i for i in range (1,T_RESULTS_DF.shape[1]-2,3)]]
DISK_DF = T_RESULTS_DF.iloc[:, [i for i in range (2,T_RESULTS_DF.shape[1]-2,3)]]
SENT_BYTES_DF = T_RESULTS_DF.iloc[:, [-2]]
RECEIVED_BYTES_DF = T_RESULTS_DF.iloc[:, [-1]]

In [20]:
# Save the results
RESULTS_DF.to_csv(f'{wms}_{workflow}_Results.csv')
T_RESULTS_DF.to_csv(f'{wms}_{workflow}_Transposed_Results.csv')
AVERAGE_DF.to_csv(f'{wms}_{workflow}_Average.csv')
STD_DF.to_csv(f'{wms}_{workflow}_STD.csv')
CV_DF.to_csv(f'{wms}_{workflow}_CV.csv')
MIN_DF.to_csv(f'{wms}_{workflow}_Min.csv')
MAX_DF.to_csv(f'{wms}_{workflow}_Max.csv')
CPU_DF.to_csv(f'{wms}_{workflow}_CPU.csv')
MEMORY_DF.to_csv(f'{wms}_{workflow}_Memory.csv')
DISK_DF.to_csv(f'{wms}_{workflow}_Disk.csv')
SENT_BYTES_DF.to_csv(f'{wms}_{workflow}_Sent_Bytes.csv')
RECEIVED_BYTES_DF.to_csv(f'{wms}_{workflow}_Received_Bytes.csv')

In [21]:
# Download CSV files to your local device
files.download(f'{wms}_{workflow}_Results.csv')
files.download(f'{wms}_{workflow}_Transposed_Results.csv')
files.download(f'{wms}_{workflow}_Average.csv')
files.download(f'{wms}_{workflow}_STD.csv')
files.download(f'{wms}_{workflow}_CV.csv')
files.download(f'{wms}_{workflow}_Min.csv')
files.download(f'{wms}_{workflow}_Max.csv')
files.download(f'{wms}_{workflow}_CPU.csv')
files.download(f'{wms}_{workflow}_Memory.csv')
files.download(f'{wms}_{workflow}_Disk.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
files.download(f'{wms}_{workflow}_Sent_Bytes.csv')
files.download(f'{wms}_{workflow}_Received_Bytes.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##4. Line Graph Data

In [23]:
# Extract useful columns
usage_data_df = resources_df[["Duration (Seconds)", "CPU_Usage", "Memory_Usage", "Disk_Usage"]]
usage_data_df["Disk_Usage"] = usage_data_df["Disk_Usage"] - usage_data_df["Disk_Usage"][0]
usage_data_df.to_csv(f'{wms}_{workflow}_Timeseries.csv', index = False)
files.download(f'{wms}_{workflow}_Timeseries.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usage_data_df["Disk_Usage"] = usage_data_df["Disk_Usage"] - usage_data_df["Disk_Usage"][0]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>