# Data Analysis Script

The purpose of this script is to generate descriptive statistics of the experiment. Tasks conducted include:
- Automatically combine experiment runs into a single dataframe
- Generate a descriptive statistic dataframe
- Visualise the dataframe

In [453]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import os

In [454]:
# Define paths
arrs = ["11", "21", "12"]
subjects = ["1", "2", "3", "4"]
measurements = [
    "node_cpu_seconds_total_data",
    "node_disk_read_bytes_total_data",
    "node_disk_written_bytes_total_data",
    "node_filesystem_free_bytes_data",
    "node_filesystem_size_bytes_data",
    "node_memory_MemAvailable_bytes_data",
    "node_memory_MemTotal_bytes_data",
    "node_network_receive_bytes_total_data",
    "node_network_transmit_bytes_total_data",
    "pijuice_battery_current_data",
    "pijuice_battery_level_data",
    "pijuice_battery_temperature_data",
    "pijuice_battery_voltage_data",
    "pijuice_io_current_data",
    "pijuice_io_voltage_data"
]
multi_indexes_list = [
    ["time", "cpu", "mode"],
    ["time", "device"],
    ["time", "device"],
    ["time", "mountpoint"],
    ["time", "mountpoint"],
    ["time"],
    ["time"],
    ["time", "device"],
    ["time", "device"],
    ["time"],
    ["time"],
    ["time"],
    ["time"],
    ["time"],
    ["time"]    
]

# path_export_datadir = "../Processed_Data/Physical-Arrangement-{}/SUB-{}/{}/".format(arr, subject, measurement)

In [455]:
def loadExpFiles(arr_idx, sub_idx, m_idx):
    # Get the file names of experiment runs and store them in a dictionary
    path = "../Processed_Data/Physical-Arrangement-{}/SUB-{}/{}".format(arrs[arr_idx], subjects[sub_idx], measurements[m_idx])

    data_files = {}
    for subdir, dirs, files in os.walk(path):
        for file in files:
            file_no_extension = file.split('.')[0]
            exp_run = file_no_extension.split('_')[0]
            instance = file_no_extension.split('_')[1]
            if instance in data_files.keys():
                data_files[instance].append(file)
            else:
                data_files[instance] = []
                data_files[instance].append(file)

    for key in data_files.keys():
        data_files[key].sort()

    # Load csv files
    dfs = {}
    for key in data_files.keys():
        dfs[key] = [pd.read_csv("{}/{}".format(path,file)) for file in data_files[key]]
    
    # dfs[gateway1] = dataframe
    return dfs

In [456]:
def saveDataframes(path, combined_dfs):
    # Get the file names of experiment runs and store them in a dictionary
    for scenario in range(len(combined_dfs)):
        dfs = combined_dfs[scenario]
        for instance in dfs.keys():
            df = dfs[instance]
            df.to_csv('{}/combined_{}_{}.csv'.format(path,scenario,instance))

In [457]:
# Helper function: Applying function to each dataframe in a set of scenario dataframes (each contain multiple experiment runs)
def applyToScenarioDFs(scenario_dfs, function, *args):
    for i in range(len(scenario_dfs)):
        scenario = scenario_dfs[i]
        for key in scenario:
            dfs = scenario[key]
            for j in range(len(dfs)):
                df = dfs[j]
                # Passing dataframe, scenario, instance, and iteration
                function(df, i, key, j, *args)

In [458]:
# Args[0]: Calculating diffs from the totals
# Args[1]: Additional calculations to be applied on datasets for transformation
# Args[2]: Gather metrics a combined dataframe
def analyseTotalMetrics(arr, measurement, *args):
    #Load dataframes measures
    scenario_dfs = [loadExpFiles(arr, i, measurement) for i in range(4)]
    
    # Calculate Diff Froms Totals
    applyToScenarioDFs(scenario_dfs, args[0])
    
    # Additional calculation steps
    applyToScenarioDFs(scenario_dfs, args[1])
    
    # Gathering results into a combined df
    combined_dfs = [
    {"gateway1": pd.DataFrame(), "gateway2" : pd.DataFrame()},
    {"gateway1": pd.DataFrame(), "gateway2" : pd.DataFrame()},
    {"gateway1": pd.DataFrame(), "gateway2" : pd.DataFrame()},
    {"gateway1": pd.DataFrame(), "gateway2" : pd.DataFrame()}
               ]
    applyToScenarioDFs(scenario_dfs, args[2], combined_dfs)
    
    # Generate combined statistics describing the entire subject from the dataset
    combined_metrics = pd.DataFrame({
    "gateway1" : [combined_dfs[i]['gateway1'].describe().loc['mean'].mean() for i in range (4)],
    "gateway2" : [combined_dfs[i]['gateway2'].describe().loc['mean'].mean() for i in range (4)],
    })
    
    # Write files to the disk
    path_export_datadir = "../Processed_Data/Physical-Arrangement-{}/Combined/{}".format(arrs[arr], measurements[measurement])
    os.makedirs(path_export_datadir)
    combined_metrics.to_csv("{}/combined_average.csv".format(path_export_datadir))
    
    return combined_metrics

# Analysing CPU Time

Finally figured out the CPU metrics. Node Exporter records the total cpu time since boot, thus I need to find the differences between these records if I want to find out how much time the CPU has spent on computing. After doing so, the result was perfectly as expected. 

In [379]:
# Analysing CPU

#Load CPU consumption measures
dfs_cpu_11_1 = loadExpFiles(0,0,0)
dfs_cpu_11_2 = loadExpFiles(0,1,0)
dfs_cpu_11_3 = loadExpFiles(0,2,0)
dfs_cpu_11_4 = loadExpFiles(0,3,0)

scenario_dfs = [dfs_cpu_11_1, dfs_cpu_11_2, dfs_cpu_11_3, dfs_cpu_11_4]

In [380]:
# CPU-specific functions applied on dataframes

# Calculate diffs between rows in the loaded dataframes
def calculateCPUDiffFromTotal(df, scenario, key, i):
    diffDF = pd.DataFrame()
    for cpu in range(4):
        tmpDF = df.query("cpu == {}".format(cpu)).drop(["time", 'cpu'], axis = 1).diff()
        diffDF = diffDF.append(tmpDF)
    df.iloc[:,2:]=diffDF

# Calculate CPU load
def calCPULoadSeries(df, scenario, key, i):
    df["total"] = df["idle"] + df["iowait"] + df ['irq'] + df['nice'] + df['softirq'] + df['steal'] + df['system'] + df['user']
    df["user_perc"] = df["user"] / df['total'] * 100
    df["load_perc"] = (df['total'] - df['idle'] - df['iowait']) / df['total'] * 100
    
# Gather CPU load series
def gatherCPULoadSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["load_perc_{}".format(i)] = df["load_perc"]

In [460]:
analyseTotalMetrics(1,0,calculateCPUDiffFromTotal,calCPULoadSeries,gatherCPULoadSeries)

Unnamed: 0,gateway1,gateway2
0,0.734167,0.712618
1,0.970477,1.060672
2,2.452875,2.545009
3,7.653357,7.579418


# Analysing disk write in bytes

Because this is a total metric, the analysis would be compatible to that of the CPU. Let's see if I can generalize the process

I skipped disk read because I saw no activity what soever there, which indicates that something is not quite right.

In [450]:
# Disk write-specific functions applied on dataframes

# Calculate diffs between rows in the loaded dataframes
def calculateDWDiffFromTotal(df, scenario, key, i):
    diffDF = df.drop(["time"], axis = 1).diff()
    df.iloc[:,1:]=diffDF

# Calculate Disk Write load (not relevant)
def calDWSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherDWSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["dw_{}".format(i)] = df["mmcblk0"]

In [452]:
analyseTotalMetrics(1,2,calculateDWDiffFromTotal,calDWSeries,gatherDWSeries)

Unnamed: 0,gateway1,gateway2
0,36646.48951,38390.452214
1,28868.923077,25989.668998
2,62405.519814,59539.692308
3,104585.846154,101436.270396


# Analysing System Free Bytes

This metrics does not require calculating differences.

In [467]:
# Disk system free bytes functions applied on dataframes

# Calculate diffs between rows in the loaded dataframes
def calculateFreeDiffFromTotal(df, scenario, key, i):
    pass

# Calculate Disk Write load (not relevant)
def calFreeSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherFreeSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["free_space_{}".format(i)] = df["/"]

In [468]:
analyseTotalMetrics(0,3,calculateFreeDiffFromTotal,calFreeSeries,gatherFreeSeries)
analyseTotalMetrics(1,3,calculateFreeDiffFromTotal,calFreeSeries,gatherFreeSeries)

                       time            /      /boot       /run  /run/lock  \
0  2020-02-24T12:29:22.444Z  12046438400  208601088  468467712    5238784   
1  2020-02-24T12:29:32.444Z  12046438400  208601088  468463616    5238784   
2  2020-02-24T12:29:42.444Z  12046438400  208601088  468463616    5238784   
3  2020-02-24T12:29:52.444Z  12046438400  208601088  468463616    5238784   
4  2020-02-24T12:30:02.444Z  12046438400  208601088  468463616    5238784   

   /run/user/1000  
0     101675008.0  
1     101675008.0  
2     101675008.0  
3     101675008.0  
4     101675008.0  
                       time            /      /boot       /run  /run/lock  \
0  2020-02-24T13:44:02.444Z  12047134720  208601088  461950976    5238784   
1  2020-02-24T13:44:12.444Z  12047126528  208601088  461946880    5238784   
2  2020-02-24T13:44:22.444Z  12047126528  208601088  461946880    5238784   
3  2020-02-24T13:44:32.444Z  12047126528  208601088  461946880    5238784   
4  2020-02-24T13:44:42.444Z  120

4       101675008  
                       time            /      /boot       /run  /run/lock  \
0   2020-02-24T09:07:57.06Z  12050726912  208601088  487821312    5238784   
1  2020-02-24T09:08:07.061Z  12050714624  208601088  487817216    5238784   
2  2020-02-24T09:08:17.061Z  12050690048  208601088  487817216    5238784   
3  2020-02-24T09:08:27.061Z  12050661376  208601088  487817216    5238784   
4  2020-02-24T09:08:37.061Z  12050644992  208601088  487817216    5238784   

   /run/user/1000  
0       101675008  
1       101675008  
2       101675008  
3       101675008  
4       101675008  
                       time            /      /boot       /run  /run/lock  \
0  2020-02-24T08:36:47.061Z  12051570688  208601088  487944192    5238784   
1  2020-02-24T08:36:57.061Z  12051558400  208601088  487940096    5238784   
2   2020-02-24T08:37:07.06Z  12051542016  208601088  487940096    5238784   
3   2020-02-24T08:37:17.06Z  12051517440  208601088  487940096    5238784   
4  2020-02-2

4       101675008  
                       time            /      /boot       /run  /run/lock  \
0  2020-02-23T08:32:22.444Z  12055851008  208601088  494612480    5238784   
1  2020-02-23T08:32:32.444Z  12055826432  208601088  494608384    5238784   
2  2020-02-23T08:32:42.444Z  12055818240  208601088  494608384    5238784   
3  2020-02-23T08:32:52.444Z  12055797760  208601088  494608384    5238784   
4  2020-02-23T08:33:02.444Z  12055769088  208601088  494608384    5238784   

   /run/user/1000  
0       101675008  
1       101675008  
2       101675008  
3       101675008  
4       101675008  
                       time            /      /boot       /run  /run/lock  \
0  2020-02-23T06:13:42.444Z  12059545600  208601088  501448704    5238784   
1  2020-02-23T06:13:52.444Z  12059537408  208601088  501444608    5238784   
2  2020-02-23T06:14:02.444Z  12059512832  208601088  501444608    5238784   
3  2020-02-23T06:14:12.444Z  12059484160  208601088  501444608    5238784   
4  2020-02-2

Unnamed: 0,gateway1,gateway2
0,12054300000.0,12052940000.0
1,12048240000.0,12048430000.0
2,12055990000.0,12054650000.0
3,12058620000.0,12057260000.0


# Analysing Available Memory

This metric does not require calculating differences

In [470]:
# Available Memory functions applied on dataframes

# Calculate diffs between rows in the loaded dataframes
def calculateMemDiffFromTotal(df, scenario, key, i):
    pass

# Calculate Disk Write load (not relevant)
def calMemSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherMemSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["free_space_{}".format(i)] = df["value"]

In [471]:
analyseTotalMetrics(0,5,calculateMemDiffFromTotal,calMemSeries,gatherMemSeries)
analyseTotalMetrics(1,5,calculateMemDiffFromTotal,calMemSeries,gatherMemSeries)

Unnamed: 0,gateway1,gateway2
0,784880900.0,785015900.0
1,790865200.0,784922300.0
2,722545700.0,725166600.0
3,688385900.0,686543200.0


# Analysing Network Received Bytes

This metric requires calculating differences

In [472]:
# Received Network Byte functions applied on dataframes
# Extract data from wlan0

# Calculate diffs between rows in the loaded dataframes
def calculateNetRecvDiffFromTotal(df, scenario, key, i):
    diffDF = df.drop(["time"], axis = 1).diff()
    df.iloc[:,1:]=diffDF

# Calculate Disk Write load (not relevant)
def calNetRecvSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherNetRecvSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["NetRecv_{}".format(i)] = df["wlan0"]

In [473]:
analyseTotalMetrics(0,7,calculateNetRecvDiffFromTotal,calNetRecvSeries,gatherNetRecvSeries)
analyseTotalMetrics(1,7,calculateNetRecvDiffFromTotal,calNetRecvSeries,gatherNetRecvSeries)

Unnamed: 0,gateway1,gateway2
0,9189.032925,7302.916667
1,8737.384615,11879.667978
2,17246.685533,16031.810096
3,36592.771635,38357.142264


# Analysing Network Sent Bytes

This metric requires calculating differences. 

In [474]:
# Sent Network Byte functions applied on dataframes
# Extract data from wlan0

# Calculate diffs between rows in the loaded dataframes
def calculateNetSentDiffFromTotal(df, scenario, key, i):
    diffDF = df.drop(["time"], axis = 1).diff()
    df.iloc[:,1:]=diffDF

# Calculate Disk Write load (not relevant)
def calNetSentSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherNetSentSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    combined_dfs[scenario][key]["NetRecv_{}".format(i)] = df["wlan0"]

In [475]:
analyseTotalMetrics(0,8,calculateNetSentDiffFromTotal,calNetSentSeries,gatherNetSentSeries)
analyseTotalMetrics(1,8,calculateNetSentDiffFromTotal,calNetSentSeries,gatherNetSentSeries)

Unnamed: 0,gateway1,gateway2
0,19518.25947,19014.951413
1,33667.088942,34288.803395
2,31741.988418,31011.588942
3,67701.209135,67329.248033


# Analysing Battery Information

This metric does not require calculating differences

It works, but not quite what I wanted. I do not need a summary statistic from this dataset. It would be faster to analyse manually.

In [478]:
# Battery level functions applied on dataframes
# Extract data from wlan0

# Calculate diffs between rows in the loaded dataframes
def calculateBattLvDiffFromTotal(df, scenario, key, i):
    pass

# Calculate Disk Write load (not relevant)
def calBattLvSeries(df, scenario, key, i):
    pass
    
# Gather Disk write series
def gatherBattLvSeries(df, scenario, key, i, *args):
    combined_dfs = args[0]
    print(df)
    combined_dfs[scenario][key]["BattLv_{}".format(i)] = df["value"]

In [479]:
analyseTotalMetrics(2,10,calculateBattLvDiffFromTotal,calBattLvSeries,gatherBattLvSeries)

     Unnamed: 0                      time  value
0             1  2020-02-18T03:48:35.789Z     98
1             3  2020-02-18T03:48:45.789Z     98
2             5  2020-02-18T03:48:55.789Z     98
3             7  2020-02-18T03:49:05.789Z     98
4             9  2020-02-18T03:49:15.789Z     98
5            11  2020-02-18T03:49:25.789Z     98
6            13  2020-02-18T03:49:35.789Z     98
7            15  2020-02-18T03:49:45.789Z     98
8            17  2020-02-18T03:49:55.789Z     98
9            19  2020-02-18T03:50:05.789Z     98
10           21  2020-02-18T03:50:15.789Z     98
11           23  2020-02-18T03:50:25.789Z     98
12           25  2020-02-18T03:50:35.789Z     98
13           27  2020-02-18T03:50:45.789Z     98
14           29  2020-02-18T03:50:55.789Z     98
15           31  2020-02-18T03:51:05.789Z     98
16           33  2020-02-18T03:51:15.789Z     98
17           35  2020-02-18T03:51:25.789Z     98
18           37  2020-02-18T03:51:35.789Z     98
19           39  202

FileExistsError: [Errno 17] File exists: '../Processed_Data/Physical-Arrangement-12/Combined/pijuice_battery_level_data'