# Data Preprocessing Script

The purpose of this script is to decompose the raw dataframe collected from the experiment into dataframes that are suitable for further analysis. Tasks conducted include:
- Break large data frame into smaller chunks corresponding to different experiment run
- Drop unnecessary columns
- Unstack the performance metrics column
- Save new dataframes as CSV for further processing

In [621]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import os

In [622]:
def extract_from_raw(path_raw_data, path_export_datadir, multi_indexes):
    df = pd.read_csv(path_raw_data)
    df.info()
    df.head()
    # Remove unused columns
    df = df.drop(["name", "job", "__name__"], axis = 1)
    # Find cutoff points between experiments
    cutoffs = list(df[df['time'] == "time"].index)
    cutoffs.insert(0,0)
    cutoffs.append(len(df.index))
    cutoffs
    # Find the deltas between cutoff points
    deltas = [cutoffs[i] - cutoffs[i-1] for i in range(1, len(cutoffs))]
    deltas
    # Loop through repetitions, generate dataframes for each
    for i in range(1, len(cutoffs)):
        if i == 1:
            sub_df = df[cutoffs[i-1]:cutoffs[i]]
        else:
            sub_df = df[cutoffs[i-1]+1:cutoffs[i]]
        # Break the data frame into smaller frames corresponding to instances
        instance_dfs = []
        print('{}{}'.format(i, sub_df["instance"].unique()))
        for instance in sub_df["instance"].unique():
            temp_df = sub_df[sub_df["instance"] == instance].drop("instance", axis = 1)
            # Drop the port name
            temp_df.instance_name = instance.split(":")[0]
            instance_dfs.append(temp_df)
        # Generate unstacked data frame containing system metrics and export to CSV
        for instance_df in instance_dfs:
            if len(multi_indexes) > 1:
                # Generate multi-index
                tuples = list(zip(*[instance_df[index] for index in multi_indexes]))
                index = pd.MultiIndex.from_tuples(tuples, names=multi_indexes)
                # Generate multi-indexed series
                s = pd.Series(list(instance_df["value"]), index = index)
                # Unstack
                unstacked = s.unstack()
                # Write to CSV
                unstacked.to_csv('{}{}_{}.csv'.format(path_export_datadir, i-1, instance_df.instance_name))
            else:
                instance_df.to_csv('{}{}_{}.csv'.format(path_export_datadir, i-1, instance_df.instance_name))

In [623]:
# Define paths
arrs = ["11", "21", "12"]
subjects = ["1", "2", "3", "4"]
measurements = [
    "node_cpu_seconds_total_data",
    "node_disk_read_bytes_total_data",
    "node_disk_written_bytes_total_data",
    "node_filesystem_free_bytes_data",
    "node_filesystem_size_bytes_data",
    "node_memory_MemAvailable_bytes_data",
    "node_memory_MemTotal_bytes_data",
    "node_network_receive_bytes_total_data",
    "node_network_transmit_bytes_total_data",
    "pijuice_battery_current_data",
    "pijuice_battery_level_data",
    "pijuice_battery_temperature_data",
    "pijuice_battery_voltage_data",
    "pijuice_io_current_data",
    "pijuice_io_voltage_data"
]
multi_indexes_list = [
    ["time", "cpu", "mode"],
    ["time", "device"],
    ["time", "device"],
    ["time", "mountpoint"],
    ["time", "mountpoint"],
    ["time"],
    ["time"],
    ["time", "device"],
    ["time", "device"],
    ["time"],
    ["time"],
    ["time"],
    ["time"],
    ["time"],
    ["time"]    
]

# path_raw_data = "../Raw_Data/Physical-Arrangement-{}/SUB-{}/csv_data/{}/mainflux-influxdb_phy{}_sub{}.csv".format(arr, subject, measurement,arr, subject)
# path_export_datadir = "../Processed_Data/Physical-Arrangement-{}/SUB-{}/{}/".format(arr, subject, measurement)

In [624]:
for arr in arrs:
    for subject in subjects:
        for i in range(len(measurements)):
            measurement = measurements[i]
            multi_indexes = multi_indexes_list[i]
            path_raw_data = "../Raw_Data/Physical-Arrangement-{}/SUB-{}/csv_data/{}/mainflux-influxdb_phy{}_sub{}.csv".format(arr, subject, measurement,arr, subject)
            path_export_datadir = "../Processed_Data/Physical-Arrangement-{}/SUB-{}/{}/".format(arr, subject, measurement)
            os.makedirs(path_export_datadir)
            print("Processing: {}".format(path_export_datadir))
            extract_from_raw(path_raw_data, path_export_datadir, multi_indexes)

Processing: ../Processed_Data/Physical-Arrangement-11/SUB-1/node_cpu_seconds_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33548 entries, 0 to 33547
Data columns (total 8 columns):
time        33548 non-null object
__name__    33548 non-null object
cpu         33548 non-null object
instance    33548 non-null object
job         33548 non-null object
mode        33548 non-null object
name        33548 non-null object
value       33548 non-null object
dtypes: object(8)
memory usage: 2.0+ MB
1['gateway2:9100' 'gateway1:9100']
2['gateway2:9100' 'gateway1:9100']
3['gateway2:9100' 'gateway1:9100']
4['gateway2:9100' 'gateway1:9100']
5['gateway1:9100' 'gateway2:9100']
6['gateway1:9100' 'gateway2:9100']
7['gateway1:9100' 'gateway2:9100']
8['gateway1:9100' 'gateway2:9100']
9['gateway1:9100' 'gateway2:9100']
10['gateway2:9100' 'gateway1:9100']
11['gateway2:9100' 'gateway1:9100']
12['gateway2:9100' 'gateway1:9100']
13['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Ph

13['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-1/pijuice_battery_level_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 6 columns):
time        1104 non-null object
__name__    1104 non-null object
instance    1104 non-null object
job         1104 non-null object
name        1104 non-null object
value       1104 non-null object
dtypes: object(6)
memory usage: 51.8+ KB
1['gateway2:8080' 'gateway1:8080']
2['gateway2:8080' 'gateway1:8080']
3['gateway2:8080' 'gateway1:8080']
4['gateway2:8080' 'gateway1:8080']
5['gateway1:8080' 'gateway2:8080']
6['gateway2:8080' 'gateway1:8080']
7['gateway1:8080' 'gateway2:8080']
8['gateway2:8080' 'gateway1:8080']
9['gateway1:8080' 'gateway2:8080']
10['gateway2:8080' 'gateway1:8080']
11['gateway2:8080' 'gateway1:8080']
12['gateway2:8080' 'gateway1:8080']
13['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-1/pijuice_batt

Processing: ../Processed_Data/Physical-Arrangement-11/SUB-2/node_filesystem_size_bytes_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5252 entries, 0 to 5251
Data columns (total 9 columns):
time          5252 non-null object
__name__      5252 non-null object
device        5252 non-null object
fstype        5252 non-null object
instance      5252 non-null object
job           5252 non-null object
mountpoint    5252 non-null object
name          5252 non-null object
value         5252 non-null object
dtypes: object(9)
memory usage: 369.4+ KB
1['gateway2:9100' 'gateway1:9100']
2['gateway1:9100' 'gateway2:9100']
3['gateway1:9100' 'gateway2:9100']
4['gateway2:9100' 'gateway1:9100']
5['gateway1:9100' 'gateway2:9100']
6['gateway2:9100' 'gateway1:9100']
7['gateway2:9100' 'gateway1:9100']
8['gateway1:9100' 'gateway2:9100']
9['gateway1:9100' 'gateway2:9100']
10['gateway1:9100' 'gateway2:9100']
11['gateway1:9100' 'gateway2:9100']
12['gateway1:9100' 'gateway2:9100']
13['gateway1:9100' 'g

12['gateway2:8080' 'gateway1:8080']
13['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-2/pijuice_io_voltage_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 6 columns):
time        1099 non-null object
__name__    1099 non-null object
instance    1099 non-null object
job         1099 non-null object
name        1099 non-null object
value       1099 non-null object
dtypes: object(6)
memory usage: 51.6+ KB
1['gateway2:8080' 'gateway1:8080']
2['gateway1:8080' 'gateway2:8080']
3['gateway1:8080' 'gateway2:8080']
4['gateway1:8080' 'gateway2:8080']
5['gateway1:8080' 'gateway2:8080']
6['gateway2:8080' 'gateway1:8080']
7['gateway2:8080' 'gateway1:8080']
8['gateway2:8080' 'gateway1:8080']
9['gateway2:8080' 'gateway1:8080']
10['gateway1:8080' 'gateway2:8080']
11['gateway1:8080' 'gateway2:8080']
12['gateway2:8080' 'gateway1:8080']
13['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-

11['gateway1:9100' 'gateway2:9100']
12['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-3/pijuice_battery_current_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1033 entries, 0 to 1032
Data columns (total 6 columns):
time        1033 non-null object
__name__    1033 non-null object
instance    1033 non-null object
job         1033 non-null object
name        1033 non-null object
value       1033 non-null object
dtypes: object(6)
memory usage: 48.5+ KB
1['gateway2:8080' 'gateway1:8080']
2['gateway2:8080' 'gateway1:8080']
3['gateway2:8080' 'gateway1:8080']
4['gateway2:8080' 'gateway1:8080']
5['gateway2:8080' 'gateway1:8080']
6['gateway1:8080' 'gateway2:8080']
7['gateway2:8080' 'gateway1:8080']
8['gateway1:8080' 'gateway2:8080']
9['gateway1:8080' 'gateway2:8080']
10['gateway1:8080' 'gateway2:8080']
11['gateway1:8080' 'gateway2:8080']
12['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-3/pijuice_ba

5['gateway2:9100' 'gateway1:9100']
6['gateway1:9100' 'gateway2:9100']
7['gateway2:9100' 'gateway1:9100']
8['gateway1:9100' 'gateway2:9100']
9['gateway2:9100' 'gateway1:9100']
10['gateway2:9100' 'gateway1:9100']
11['gateway2:9100' 'gateway1:9100']
12['gateway2:9100' 'gateway1:9100']
13['gateway2:9100' 'gateway1:9100']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-4/node_filesystem_size_bytes_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5267 entries, 0 to 5266
Data columns (total 9 columns):
time          5267 non-null object
__name__      5267 non-null object
device        5267 non-null object
fstype        5267 non-null object
instance      5267 non-null object
job           5267 non-null object
mountpoint    5267 non-null object
name          5267 non-null object
value         5267 non-null object
dtypes: object(9)
memory usage: 370.4+ KB
1['gateway1:9100' 'gateway2:9100']
2['gateway1:9100' 'gateway2:9100']
3['gateway2:9100' 'gateway1:9100']
4['gateway2:9100' 'g

9['gateway2:8080' 'gateway1:8080']
10['gateway2:8080' 'gateway1:8080']
11['gateway2:8080' 'gateway1:8080']
12['gateway2:8080' 'gateway1:8080']
13['gateway2:8080' 'gateway1:8080']
Processing: ../Processed_Data/Physical-Arrangement-11/SUB-4/pijuice_io_current_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 6 columns):
time        1104 non-null object
__name__    1104 non-null object
instance    1104 non-null object
job         1104 non-null object
name        1104 non-null object
value       1104 non-null object
dtypes: object(6)
memory usage: 51.8+ KB
1['gateway1:8080' 'gateway2:8080']
2['gateway1:8080' 'gateway2:8080']
3['gateway2:8080' 'gateway1:8080']
4['gateway2:8080' 'gateway1:8080']
5['gateway2:8080' 'gateway1:8080']
6['gateway2:8080' 'gateway1:8080']
7['gateway2:8080' 'gateway1:8080']
8['gateway1:8080' 'gateway2:8080']
9['gateway2:8080' 'gateway1:8080']
10['gateway2:8080' 'gateway1:8080']
11['gateway2:8080' 'gateway1:8080']
12['

4['gateway2:9100' 'gateway1:9100']
5['gateway2:9100' 'gateway1:9100']
6['gateway2:9100' 'gateway1:9100']
7['gateway1:9100' 'gateway2:9100']
8['gateway1:9100' 'gateway2:9100']
9['gateway1:9100' 'gateway2:9100']
10['gateway1:9100' 'gateway2:9100']
11['gateway2:9100' 'gateway1:9100']
12['gateway1:9100' 'gateway2:9100']
13['gateway2:9100' 'gateway1:9100']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-1/node_network_transmit_bytes_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6318 entries, 0 to 6317
Data columns (total 7 columns):
time        6318 non-null object
__name__    6318 non-null object
device      6318 non-null object
instance    6318 non-null object
job         6318 non-null object
name        6318 non-null object
value       6318 non-null object
dtypes: object(7)
memory usage: 345.6+ KB
1['gateway2:9100' 'gateway1:9100']
2['gateway2:9100' 'gateway1:9100']
3['gateway2:9100' 'gateway1:9100']
4['gateway2:9100' 'gateway1:9100']
5['gateway2:9100' 'gateway1

3['gateway2:9100' 'gateway1:9100']
4['gateway1:9100' 'gateway2:9100']
5['gateway2:9100' 'gateway1:9100']
6['gateway1:9100' 'gateway2:9100']
7['gateway2:9100' 'gateway1:9100']
8['gateway2:9100' 'gateway1:9100']
9['gateway2:9100' 'gateway1:9100']
10['gateway1:9100' 'gateway2:9100']
11['gateway1:9100' 'gateway2:9100']
12['gateway1:9100' 'gateway2:9100']
13['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-2/node_filesystem_free_bytes_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5262 entries, 0 to 5261
Data columns (total 9 columns):
time          5262 non-null object
__name__      5262 non-null object
device        5262 non-null object
fstype        5262 non-null object
instance      5262 non-null object
job           5262 non-null object
mountpoint    5262 non-null object
name          5262 non-null object
value         5262 non-null object
dtypes: object(9)
memory usage: 370.1+ KB
1['gateway2:9100' 'gateway1:9100']
2['gateway2:9100' 'g

6['gateway1:8080' 'gateway2:8080']
7['gateway2:8080' 'gateway1:8080']
8['gateway2:8080' 'gateway1:8080']
9['gateway2:8080' 'gateway1:8080']
10['gateway2:8080' 'gateway1:8080']
11['gateway1:8080' 'gateway2:8080']
12['gateway1:8080' 'gateway2:8080']
13['gateway2:8080' 'gateway1:8080']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-2/pijuice_io_current_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 6 columns):
time        1099 non-null object
__name__    1099 non-null object
instance    1099 non-null object
job         1099 non-null object
name        1099 non-null object
value       1099 non-null object
dtypes: object(6)
memory usage: 51.6+ KB
1['gateway2:8080' 'gateway1:8080']
2['gateway2:8080' 'gateway1:8080']
3['gateway2:8080' 'gateway1:8080']
4['gateway1:8080' 'gateway2:8080']
5['gateway2:8080' 'gateway1:8080']
6['gateway1:8080' 'gateway2:8080']
7['gateway2:8080' 'gateway1:8080']
8['gateway2:8080' 'gateway1:8080']
9['gat

11['gateway1:9100' 'gateway2:9100']
12['gateway2:9100' 'gateway1:9100']
13['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-3/node_network_transmit_bytes_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6294 entries, 0 to 6293
Data columns (total 7 columns):
time        6294 non-null object
__name__    6294 non-null object
device      6294 non-null object
instance    6294 non-null object
job         6294 non-null object
name        6294 non-null object
value       6294 non-null object
dtypes: object(7)
memory usage: 344.3+ KB
1['gateway1:9100' 'gateway2:9100']
2['gateway1:9100' 'gateway2:9100']
3['gateway2:9100' 'gateway1:9100']
4['gateway2:9100' 'gateway1:9100']
5['gateway1:9100' 'gateway2:9100']
6['gateway2:9100' 'gateway1:9100']
7['gateway2:9100' 'gateway1:9100']
8['gateway1:9100' 'gateway2:9100']
9['gateway1:9100' 'gateway2:9100']
10['gateway1:9100' 'gateway2:9100']
11['gateway1:9100' 'gateway2:9100']
12['gateway2:9100' 'gatewa

9['gateway2:9100' 'gateway1:9100']
10['gateway1:9100' 'gateway2:9100']
11['gateway2:9100' 'gateway1:9100']
12['gateway2:9100' 'gateway1:9100']
13['gateway2:9100' 'gateway1:9100']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-4/node_filesystem_free_bytes_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5267 entries, 0 to 5266
Data columns (total 9 columns):
time          5267 non-null object
__name__      5267 non-null object
device        5267 non-null object
fstype        5267 non-null object
instance      5267 non-null object
job           5267 non-null object
mountpoint    5267 non-null object
name          5267 non-null object
value         5267 non-null object
dtypes: object(9)
memory usage: 370.4+ KB
1['gateway2:9100' 'gateway1:9100']
2['gateway2:9100' 'gateway1:9100']
3['gateway1:9100' 'gateway2:9100']
4['gateway1:9100' 'gateway2:9100']
5['gateway2:9100' 'gateway1:9100']
6['gateway1:9100' 'gateway2:9100']
7['gateway1:9100' 'gateway2:9100']
8['gateway1:9100' 'g

10['gateway1:8080' 'gateway2:8080']
11['gateway2:8080' 'gateway1:8080']
12['gateway2:8080' 'gateway1:8080']
13['gateway1:8080' 'gateway2:8080']
Processing: ../Processed_Data/Physical-Arrangement-21/SUB-4/pijuice_battery_voltage_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103 entries, 0 to 1102
Data columns (total 6 columns):
time        1103 non-null object
__name__    1103 non-null object
instance    1103 non-null object
job         1103 non-null object
name        1103 non-null object
value       1103 non-null object
dtypes: object(6)
memory usage: 51.8+ KB
1['gateway2:8080' 'gateway1:8080']
2['gateway2:8080' 'gateway1:8080']
3['gateway2:8080' 'gateway1:8080']
4['gateway2:8080' 'gateway1:8080']
5['gateway2:8080' 'gateway1:8080']
6['gateway2:8080' 'gateway1:8080']
7['gateway1:8080' 'gateway2:8080']
8['gateway1:8080' 'gateway2:8080']
9['gateway2:8080' 'gateway1:8080']
10['gateway1:8080' 'gateway2:8080']
11['gateway2:8080' 'gateway1:8080']
12['gateway2:8080' 'gateway1:8080'

Processing: ../Processed_Data/Physical-Arrangement-12/SUB-2/node_disk_read_bytes_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 7 columns):
time        1113 non-null object
__name__    1113 non-null object
device      1113 non-null object
instance    1113 non-null object
job         1113 non-null object
name        1113 non-null object
value       1113 non-null int64
dtypes: int64(1), object(6)
memory usage: 60.9+ KB
1['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-12/SUB-2/node_disk_written_bytes_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 7 columns):
time        1113 non-null object
__name__    1113 non-null object
device      1113 non-null object
instance    1113 non-null object
job         1113 non-null object
name        1113 non-null object
value       1113 non-null int64
dtypes: int64(1), object(6)
memory usage: 60.9+ KB
1[

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 9 columns):
time          1850 non-null object
__name__      1850 non-null object
device        1850 non-null object
fstype        1850 non-null object
instance      1850 non-null object
job           1850 non-null object
mountpoint    1850 non-null object
name          1850 non-null object
value         1850 non-null int64
dtypes: int64(1), object(8)
memory usage: 130.2+ KB
1['gateway2:9100' 'gateway1:9100']
Processing: ../Processed_Data/Physical-Arrangement-12/SUB-3/node_filesystem_size_bytes_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 9 columns):
time          1850 non-null object
__name__      1850 non-null object
device        1850 non-null object
fstype        1850 non-null object
instance      1850 non-null object
job           1850 non-null object
mountpoint    1850 non-null object
name          1850 non-null object
value         

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2590 entries, 0 to 2589
Data columns (total 7 columns):
time        2590 non-null object
__name__    2590 non-null object
device      2590 non-null object
instance    2590 non-null object
job         2590 non-null object
name        2590 non-null object
value       2590 non-null int64
dtypes: int64(1), object(6)
memory usage: 141.7+ KB
1['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-12/SUB-4/node_network_transmit_bytes_total_data/
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2590 entries, 0 to 2589
Data columns (total 7 columns):
time        2590 non-null object
__name__    2590 non-null object
device      2590 non-null object
instance    2590 non-null object
job         2590 non-null object
name        2590 non-null object
value       2590 non-null int64
dtypes: int64(1), object(6)
memory usage: 141.7+ KB
1['gateway1:9100' 'gateway2:9100']
Processing: ../Processed_Data/Physical-Arrangement-12/

In [597]:
## These code are for testing and development


# Define paths
arr = "11"
subject = "1"
measurement = "node_filesystem_free_bytes_data"
path_raw_data = "../Raw_Data/Physical-Arrangement-{}/SUB-{}/csv_data/{}/mainflux-influxdb_phy{}_sub{}.csv".format(arr, subject, measurement,arr, subject)
path_export_datadir = "../Processed_Data/Physical-Arrangement-{}/SUB-{}/{}/".format(arr, subject, measurement)
multi_indexes = ["time", "mountpoint"]
os.makedirs(path_export_datadir)

FileExistsError: [Errno 17] File exists: '../Processed_Data/Physical-Arrangement-11/SUB-1/node_filesystem_free_bytes_data/'

In [598]:
df = pd.read_csv(path_raw_data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4396 entries, 0 to 4395
Data columns (total 9 columns):
time          4396 non-null object
__name__      4396 non-null object
device        4396 non-null object
fstype        4396 non-null object
instance      4396 non-null object
job           4396 non-null object
mountpoint    4396 non-null object
name          4396 non-null object
value         4396 non-null object
dtypes: object(9)
memory usage: 309.2+ KB


In [599]:
df.head()

Unnamed: 0,time,__name__,device,fstype,instance,job,mountpoint,name,value
0,2020-02-24T12:29:22.444Z,node_filesystem_free_bytes,/dev/mmcblk0p1,vfat,gateway2:9100,prometheus,/boot,gateway2_cpu,208601088
1,2020-02-24T12:29:22.444Z,node_filesystem_free_bytes,/dev/root,ext4,gateway2:9100,prometheus,/,gateway2_cpu,12046438400
2,2020-02-24T12:29:22.444Z,node_filesystem_free_bytes,tmpfs,tmpfs,gateway2:9100,prometheus,/run,gateway2_cpu,468467712
3,2020-02-24T12:29:22.444Z,node_filesystem_free_bytes,tmpfs,tmpfs,gateway2:9100,prometheus,/run/lock,gateway2_cpu,5238784
4,2020-02-24T12:29:22.444Z,node_filesystem_free_bytes,tmpfs,tmpfs,gateway2:9100,prometheus,/run/user/1000,gateway2_cpu,101675008


In [600]:
# Remove unused columns
df = df.drop(["name", "job", "__name__"], axis = 1)

In [601]:
# Find cutoff points between experiments
cutoffs = list(df[df['time'] == "time"].index)
cutoffs.insert(0,0)
cutoffs.append(len(df.index))
cutoffs

[0, 511, 799, 1315, 1831, 2114, 2398, 2681, 2970, 3253, 3537, 3825, 4113, 4396]

In [602]:
# Find the deltas between cutoff points
deltas = [cutoffs[i] - cutoffs[i-1] for i in range(1, len(cutoffs))]
deltas

[511, 288, 516, 516, 283, 284, 283, 289, 283, 284, 288, 288, 283]

In [603]:
# Loop through repetitions, generate dataframes for each
for i in range(1, len(cutoffs)):
    if i == 1:
        sub_df = df[cutoffs[i-1]:cutoffs[i]]
    else:
        sub_df = df[cutoffs[i-1]+1:cutoffs[i]]
    # Break the data frame into smaller frames corresponding to instances
    instance_dfs = []
    print('{}{}'.format(i, sub_df["instance"].unique()))
    for instance in sub_df["instance"].unique():
        temp_df = sub_df[sub_df["instance"] == instance].drop("instance", axis = 1)
        temp_df.instance_name = instance
        instance_dfs.append(temp_df)
    # Generate unstacked data frame containing system metrics and export to CSV
    for instance_df in instance_dfs:
#         print(instance_df.head(20))
        if len(multi_indexes) > 1:
            # Generate multi-index
            tuples = list(zip(*[instance_df[index] for index in multi_indexes]))
            index = pd.MultiIndex.from_tuples(tuples, names=multi_indexes)
            # Generate multi-indexed series
            s = pd.Series(list(instance_df["value"]), index = index)
            print(s.head(20))
            # Unstack
            unstacked = s.unstack()
            print(unstacked.head())
            # Write to CSV
#             unstacked.to_csv('{}{}_{}.csv'.format(path_export_datadir, i-1, instance_df.instance_name))
        else:
            instance_df.to_csv('{}{}_{}.csv'.format(path_export_datadir, i-1, instance_df.instance_name))

1['gateway2:9100' 'gateway1:9100']
time                      mountpoint    
2020-02-24T12:29:22.444Z  /boot               208601088
                          /                 12046438400
                          /run                468467712
                          /run/lock             5238784
                          /run/user/1000      101675008
2020-02-24T12:29:32.444Z  /boot               208601088
                          /                 12046438400
                          /run                468463616
                          /run/lock             5238784
                          /run/user/1000      101675008
2020-02-24T12:29:42.444Z  /boot               208601088
                          /                 12046438400
                          /run                468463616
                          /run/lock             5238784
                          /run/user/1000      101675008
2020-02-24T12:29:52.444Z  /boot               208601088
                          / 