In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import gc

import time
from datetime import datetime
from datetime import date

from pathlib import Path

In [27]:
# Get all files from all complete_results folders
def get_file_list(dirName):
    """Recursively collect the paths of all files below ``dirName``.

    Entries are visited in the order ``os.listdir`` reports them; the files of
    a sub-directory are spliced in at the position of that sub-directory.

    :param dirName: directory to search.
    :return: list of file paths, each built with ``os.path.join`` from ``dirName``.
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            # Anything that is not a directory is kept as a file path
            collected.append(candidate)
            continue
        # Directory: descend and add everything found underneath it
        collected.extend(get_file_list(candidate))

    return collected

In [28]:
# data_path = 'E:/source/repos/robot_dataset'
data_path = 'C:/Users/au614889/PycharmProjects/robot_dataset'
dirName = data_path + '/complete_results'

# Reference sample counts per label.
# The five class counts below (normal ... damaged_thread_hole) add up to total_samples.
total_samples = 1767
normal_samples = 1245
extra_assembly_samples = 181
damaged_screw_samples = 155
missing_screw_samples = 185
damaged_thread_hole_samples = 1
loosen_samples = 5000

# Get the list of all files in directory tree at given path
files_list = get_file_list(dirName)
# Skip HDF5 files. Compare the file extension only: a substring test on the
# whole path would also drop files that merely live under a directory whose
# name contains '.h5'.
files_list = [x for x in files_list if os.path.splitext(x)[1] != '.h5']

# Print the files
for elem in files_list:
    print(elem)

C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\horizontal_normal\1_2020-11-10_13-01-52.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\1_2020-11-11_14-44-31.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\1_2020-11-13_13-59-46.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\2_2020-11-11_14-57-07.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\2_2020-11-13_14-03-46.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\3_2020-11-12_15-21-13.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\3_2020-11-13_14-10-38.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\4_2020-11-12_16-05-57.csv
C:/Users/au614889/PycharmProjects/robot_dataset/complete_r

In [29]:
def label_samples(data, mode):
    """Rewrite the ``label`` / ``sample_nr`` columns of one recording.

    :param data: DataFrame with numeric ``sample_nr`` and ``label`` columns.
        The affected column is updated on this frame, which is also returned.
    :param mode: selects the rewrite:

        * ``'partial'`` - rows whose ``sample_nr`` is odd get label 5
          (the original comment calls these the unscrewing samples).
        * ``'relabel_full'`` - a row takes the label of the following row
          when that label is larger than its own.
        * ``'full'`` - every even ``sample_nr`` is replaced by the preceding
          odd number, then consecutive runs are renumbered 1, 2, 3, ...

        Any other value leaves ``data`` unchanged.
    :return: the relabelled DataFrame, with no helper columns added.
    """
    if mode == 'partial':
        data.loc[data['sample_nr'] % 2 != 0, ['label']] = 5
    elif mode == 'relabel_full':
        # Keep the shifted labels in a local Series instead of a helper column,
        # so nothing has to be dropped again (the original dropped it from a
        # copy that was never returned, leaking 'label_shifted' to the caller).
        next_label = data['label'].shift(-1)
        # The last row has no successor (NaN); the comparison is False there,
        # so that row keeps its own label and no NaN reaches the result.
        data['label'] = np.where(data['label'] < next_label,
                                 next_label,
                                 data['label'])
    elif mode == 'full':
        sample_nr = data['sample_nr']
        # Fold each even sample into the odd sample right before it
        data['sample_nr'] = np.where(sample_nr % 2 == 0,
                                     sample_nr - 1,
                                     sample_nr)

        # Renumber samples: a new number starts wherever the value changes
        data['sample_nr'] = (data['sample_nr'] !=
                             data['sample_nr'].shift(1)).astype(int).cumsum()

    # Row order is left exactly as received; the original's bare
    # ``data.sort_index()`` discarded its result and never reordered anything.
    return data

In [30]:
def smooth_samples(data):
    """Give all rows of a sample one common label.

    For every ``sample_nr`` group the ``label`` column is replaced by the
    group's mean label, rounded with ``np.round`` and converted to ``int``.
    ``data`` is updated in place and returned.
    """
    def _rounded_mean(labels):
        # One integer per group; transform broadcasts it to every row of the group
        return int(np.round(labels.mean(), decimals=0))

    labels_per_sample = data.groupby(['sample_nr'])['label']
    data['label'] = labels_per_sample.transform(_rounded_mean)

    return data

In [31]:
def amount_to_take(target, taken, n_samples):
    """Return how many of ``n_samples`` available samples may still be taken.

    :param target: total number of samples wanted.
    :param taken: number of samples already taken.
    :param n_samples: number of samples available right now.
    :return: 0 when ``taken`` already exceeds ``target``; otherwise the
        remaining amount ``target - taken``, capped at ``n_samples``.
    """
    remaining = target - taken
    if remaining < 0:
        return 0
    return remaining if remaining <= n_samples else n_samples

In [32]:
def change_data_types(data):
    """Downcast numeric columns to smaller dtypes.

    ``int64`` / ``int32`` columns are passed through ``pd.to_numeric`` with
    ``downcast='integer'`` and ``float64`` columns with ``downcast='float'``.
    Columns of any other dtype are left alone. ``data`` is modified in place
    and returned.
    """
    for name in data.columns:
        current = data[name].dtype
        if current == 'int64' or current == 'int32':
            target_kind = 'integer'
        elif current == 'float64':
            target_kind = 'float'
        else:
            # Not a dtype this helper shrinks
            continue
        data[name] = pd.to_numeric(data[name], errors='coerce', downcast=target_kind)

    return data

In [33]:
def load_data(normal_percentage=1, assembly_percentage=1, damaged_percentage=1, missing_percentage=1, hole_percentage=1,
              label_type='partial', data_type='continous', files_list=None, drop_columns=False):
    """Read, label and stack all recordings listed in ``files_list``.

    Each file is read with ``pd.read_csv``, relabelled with ``label_samples``
    (using ``label_type`` as its mode) and smoothed with ``smooth_samples``.
    The path and the per-file label counts are printed. Samples are then
    renumbered so that ``sample_nr`` runs consecutively from 1 over all files.

    :param normal_percentage: accepted for backward compatibility; not used.
    :param assembly_percentage: accepted for backward compatibility; not used.
    :param damaged_percentage: accepted for backward compatibility; not used.
    :param missing_percentage: accepted for backward compatibility; not used.
    :param hole_percentage: accepted for backward compatibility; not used.
    :param label_type: mode forwarded to ``label_samples``.
    :param data_type: accepted for backward compatibility; not used.
    :param files_list: iterable of CSV paths to load (required).
    :param drop_columns: accepted for backward compatibility; not used.
    :return: one DataFrame with a fresh integer index, ``sample_nr`` as the
        first column followed by the columns of the CSV files.
    :raises TypeError: if ``files_list`` is None.
    :raises ValueError: if ``files_list`` is empty.

    Every sample of every file is loaded: per-label subsampling, column
    dropping and HDF5 caching are not implemented here.
    """
    if files_list is None:
        raise TypeError('files_list is required: pass the CSV paths to load')
    files_list = list(files_list)
    if not files_list:
        raise ValueError('files_list is empty: there are no CSV files to load')

    dataframe_list = []
    # Highest sample number handed out so far, over all files processed
    sample_offset = 0

    for file in files_list:
        print(file)
        # Reading the file content to a DataFrame
        dfn = pd.read_csv(file, sep=',')

        # Label the samples and give each sample a single label
        dfn = label_samples(data=dfn, mode=label_type)
        dfn = smooth_samples(data=dfn)

        # Count the sample types (one label per sample number of this file)
        count_df = dfn.groupby(['sample_nr'])['label'].median()
        unique, counts = np.unique(count_df, return_counts=True)
        labels_count_dict = {A: B for A, B in zip(unique, counts)}
        print(f'Loaded labels: {labels_count_dict}')

        # Renumber samples per file and continue after the previous file's
        # last number. Doing this per file (instead of once on the stacked
        # frame) keeps the last sample of one file separate from the first
        # sample of the next even when both carry the same raw sample_nr.
        run_ids = (dfn['sample_nr'] != dfn['sample_nr'].shift(1)).astype(int).cumsum()
        dfn['sample_nr'] = run_ids + sample_offset
        if not dfn.empty:
            # Run ids only grow, so the last row holds the highest number
            sample_offset = int(dfn['sample_nr'].iloc[-1])

        dataframe_list.append(dfn.set_index(['sample_nr']))

    # Concat those dataframes; reset_index turns sample_nr back into a column
    complete_data = pd.concat(dataframe_list, ignore_index=False)
    complete_data = complete_data.reset_index()

    # Release the per-file frames before returning the stacked copy
    del dataframe_list
    gc.collect()

    return complete_data

In [34]:
# Read all files into pandas: one labelled DataFrame covering every recording
complete_data = load_data(
    files_list=list(files_list),  # hand over a copy of the file list
    label_type='partial',
    normal_percentage=1,
    assembly_percentage=1,
    damaged_percentage=1,
    missing_percentage=1,
    hole_percentage=1,
)

C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\horizontal_normal\1_2020-11-10_13-01-52.csv
{0: 65, 5: 66}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\1_2020-11-11_14-44-31.csv
{0: 53, 1: 9, 2: 7, 3: 4, 5: 73}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\1_2020-11-13_13-59-46.csv
{0: 71, 2: 18, 5: 89}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\2_2020-11-11_14-57-07.csv
{0: 51, 1: 23, 2: 11, 3: 6, 5: 91}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\2_2020-11-13_14-03-46.csv
{0: 110, 1: 5, 2: 30, 4: 2, 5: 147}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\3_2020-11-12_15-21-13.csv
{0: 47, 1: 15, 2: 23, 3: 6, 5: 91}
C:/Users/au614889/PycharmProjects/robot_dataset/complete_results\spiral_damaged_screw_thread\3_2020-11-13_14-10-38.csv
{

In [35]:
# Data statistics
# Number of total samples: the highest sample number in the data.
# Computed once with Series.max() instead of max(list(...)), which built a
# Python list of every row and was repeated for each label.
n_samples_total = complete_data['sample_nr'].max()
print('There are {n_samples} samples in total.'.format(n_samples=n_samples_total))

# Count the different types of labels: one entry per (label, sample_nr) pair
grouped_df = complete_data.groupby(['label', 'sample_nr'])['output_double_register_25'].count()

unique = list(grouped_df.index.get_level_values(0).unique())
# Share of samples per label in percent. The comprehension variable does not
# leak into the notebook namespace (the original loop rebound the builtin `id`).
count = [
    np.round(len(grouped_df.loc[label_id, :]) / n_samples_total * 100, decimals=2)
    for label_id in unique
]

count_dict = dict(zip(unique, count))

print('The types and counts of different labels as percentage of total data: \n {count_dict}'.format(count_dict=count_dict))

There are 4094 samples in total.
The types and counts of different labels as percentage of total data: 
 {0: 34.68, 1: 5.4, 2: 4.47, 3: 5.32, 4: 0.07, 5: 50.05}


In [36]:
# Save the data
# Build the target from data_path so that switching data_path to another
# machine also moves the output; with the current data_path this is the same
# file as before.
output_dir = data_path + '/created_dataset'
# Make sure the target directory exists before writing into it
os.makedirs(output_dir, exist_ok=True)
complete_data.to_hdf(output_dir + '/robot_data.h5',
                     key='complete_data')

In [37]:
%matplotlib qt

plot_data = complete_data[:]

plt.plot(plot_data['label'])
plt.plot(plot_data['sample_nr'] / 100)
plt.plot(plot_data['output_double_register_25'])

plt.show()

In [38]:
# experiment_data = complete_data[:]
#
# experiment_data['event'] = experiment_data.index
# experiment_data = experiment_data.set_index(['sample_nr', 'event'])
# experiment_data = experiment_data.reset_index('event', drop=True)
# experiment_data = experiment_data.set_index(experiment_data.groupby(level=0).cumcount().rename('event'), append=True)
# # experiment_data = experiment_data.drop(columns=['event'], axis=1)
# max_rows = experiment_data.index.get_level_values(1).max()
#
# print(experiment_data)

In [39]:
# from tqdm.notebook import tqdm as tqdm
#
# def pad_df(df):
#     # 1. compute the sizes of each sample_nr
#     sr_sizes = df.groupby(df.index.get_level_values(0)).size()
#     # compute max size and #sample_nr
#     max_size = sr_sizes.max()
#     n_sample_nrs = len(sr_sizes)
#
#     # 2. preallocate the output array and fill
#     arr = np.zeros((max_size * n_sample_nrs, 126))
#     idx_lv0 = df.index.get_level_values(0)  # get sample_nr
#     for i in tqdm(range(n_sample_nrs)):
#         row = i*max_size
#         arr[row:row + sr_sizes.iloc[i], :] =\
#             df[idx_lv0 == sr_sizes.index[i]].values
#
#     # 3. convert to dataframe
#     df_ans = pd.DataFrame(
#         data=arr,
#         index=pd.MultiIndex.from_product([sr_sizes.index, range(max_size)]),
#         columns=df.columns
#     ).rename_axis(df.index.names, axis=0)
#
#     return df_ans
#
# padded_df = pad_df(experiment_data)
# print(padded_df)
