Notebook for stitching separate experiment files into one

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re

import time
from datetime import datetime
from datetime import date

from pathlib import Path

In [32]:
def sort_human(l):
    """Sort a list of strings in place in natural ("human") order and return it.

    Runs of digits are compared as integers, so '10_x' sorts after '2_x'.
    Everything else is compared as plain text.

    Parameters
    ----------
    l : list of str
        The list to sort. It is modified in place.

    Returns
    -------
    list of str
        The same list object, now sorted.
    """
    def natural_key(text):
        # Splitting on a captured digit run always alternates text (even
        # positions) and digits (odd positions). Two keys therefore never
        # compare an int with a str, which is what made mixed file names
        # (e.g. '.DS_Store' next to '42_x.csv') raise TypeError before.
        chunks = re.split(r'(\d+)', text)
        return [int(chunk) if position % 2 else chunk
                for position, chunk in enumerate(chunks)]

    l.sort(key=natural_key)
    return l

In [33]:
# Input folder with the raw experiment files and output folder for the stitched result
folder_path = Path('../test_results/horiz')
save_path = Path('../complete_results/spiral_damaged_screw_thread')

# Integer codes written to the 'label' column, in this order:
#   1 = extra assembly component, 2 = damaged screw thread,
#   3 = missing screw,            4 = damaged thread hole
(extra_assembly_label,
 damaged_screw_label,
 missing_screw_label,
 damaged_thread_hole_label) = 1, 2, 3, 4

# Code applied to the anomaly points of the experiments loaded in this run
anomaly_label = damaged_screw_label

# Names (not full paths) of the regular files directly inside folder_path, in natural order
all_files = sort_human([entry.name for entry in folder_path.iterdir() if entry.is_file()])

In [35]:
# Load the files in the folder
# Decide which ones to take
# Here just based on manually entering the experiment numbers
selected_numbers = list(range(42, 44))

# TODO: it would be best to check the date of the last complete file, and then load all files newer than that

taken_files = []
anomaly_files = []
dataframe_list = []

for f in all_files:
    # File names are expected to start with '<number>_'. Anything else in the
    # folder is skipped instead of aborting the cell on the int() conversion.
    try:
        number = int(f.split('_')[0])
    except ValueError:
        continue
    if number not in selected_numbers:
        continue

    # Decide by the actual extension rather than by a substring that could
    # appear anywhere in the name.
    suffix = Path(f).suffix
    if suffix == '.csv':
        taken_files.append(f)
    elif suffix == '.txt':
        anomaly_files.append(f)

for file in taken_files:
    file_path = Path(folder_path) / file
    # Reading the file content to create a DataFrame (values are space-separated)
    dfn = pd.read_csv(file_path, sep=' ')
    dataframe_list.append(dfn)

print(taken_files)
print(len(taken_files))

['42_P_to_A_2020-11-17_16-21-18.csv', '43_P_to_A_2020-11-17_16-43-29.csv']
2


In [37]:
anomalies_list = []

# Read the anomaly points: one list of ints per anomaly file, in the same order
# as anomaly_files. Each file holds a comma-separated list, optionally wrapped
# in square brackets.
for file in anomaly_files:
    anomaly_path = Path(folder_path) / file

    # read_text opens and closes the file itself, so no handle is left open
    # (and the loop variable is no longer overwritten by a file object).
    anomalies_string = anomaly_path.read_text()
    anomalies_string = anomalies_string.replace("[", "").replace("]", "")

    # Blank tokens are skipped, so an empty file, "[]" or a trailing
    # newline/comma gives an empty list instead of int('') raising ValueError.
    anomalies = [int(x) for x in anomalies_string.split(',') if x.strip()]
    anomalies_list.append(anomalies)

print(anomalies_list)
print(len(anomalies_list))

[[3, 9, 12, 14, 20, 31, 39, 51, 52, 62], [3, 9, 12, 14, 20, 31, 39, 51, 52, 62]]
2


In [7]:
# Manual fix to some data
# dataframe_list[0] = dataframe_list[0].loc[dataframe_list[0]['output_int_register_25'] != 20]
# dataframe_list[1] = dataframe_list[1].loc[dataframe_list[1]['output_int_register_26'] != 101]
# print(len(dataframe_list[0].index))

In [38]:
def label_anomalies(data, to_anomalies, anomaly_label, from_pins, to_pins, label_loosen=True, debug=False):
    """Return a copy of ``data`` with an integer ``label`` column marking anomalous rows.

    If ``data`` already has a ``label`` column, it is only re-encoded:
    ``'normal'`` becomes 0 and every other value becomes ``anomaly_label``.
    Otherwise all rows start at 0 and the anomalous rows are set to
    ``anomaly_label`` as described below.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``output_int_register_25`` (and ``output_int_register_26``
        when ``label_loosen`` is True) unless it already has a ``label`` column.
    to_anomalies : list
        Values of ``output_int_register_25`` whose rows are anomalous.
    anomaly_label : int
        Value written into ``label`` for anomalous rows.
    from_pins, to_pins : list
        Pin sequences. ``from_pins[k]`` is treated as the partner of ``to_pins[k]``.
    label_loosen : bool, default True
        When True, only anomalies present in ``to_pins`` are used, and rows
        whose ``output_int_register_26`` equals the partner from-pin of an
        anomalous to-pin are labelled as well.
    debug : bool, default False
        Print the intermediate pin bookkeeping.

    Returns
    -------
    pandas.DataFrame
        A labelled copy. The frame passed in is left untouched.
    """
    # Work on a copy: callers pass filtered slices, and writing into those in
    # place triggers SettingWithCopyWarning and alters the caller's frame.
    data = data.copy()

    if 'label' in data.columns:
        data['label'] = np.where(data['label'] == 'normal', 0, anomaly_label)
        return data

    # If there are no labels already, add them
    data['label'] = 0

    if not label_loosen:
        data.loc[data['output_int_register_25'].isin(to_anomalies), "label"] = anomaly_label
        return data

    # Take only those to_anomalies which are in to_pins
    to_anomalies = [x for x in to_anomalies if x in to_pins]
    # Position of each anomaly point in the to-sequence (first occurrence)
    anomalies_index = sorted(to_pins.index(anomaly_point) for anomaly_point in to_anomalies)

    # Partner from-pins at the same positions. to_pins can be longer than
    # from_pins, so positions without a partner are skipped rather than
    # raising IndexError.
    from_anomalies = [from_pins[i] for i in anomalies_index if i < len(from_pins)]

    if debug:
        print(f'from_anomalies: {from_anomalies}')
        print(f'to_anomalies: {to_anomalies}')
        print(f'anomalies_index: {anomalies_index}')
        print(dict(zip(from_pins, to_pins)))
        # Distinct register values in order of first appearance. No aggregation
        # is needed just to list them.
        seen_from = data['output_int_register_26'].dropna().drop_duplicates().tolist()
        seen_to = data['output_int_register_25'].dropna().drop_duplicates().tolist()
        print(f'from_pins: {seen_from}')
        print(f'to_pins: {seen_to}')
        print('\n')

    # Label these points as anomalies
    data.loc[data['output_int_register_25'].isin(to_anomalies), "label"] = anomaly_label
    data.loc[data['output_int_register_26'].isin(from_anomalies), "label"] = anomaly_label

    return data

In [39]:
labeled_list = []
label_loosen = True


def _pin_sequence(register):
    """Return the non-zero values of ``register`` at the rows where its value changes, as ints."""
    changed = register.loc[register.shift() != register]
    return [int(x) for x in changed.loc[changed != 0].values]


# Anomaly lists are matched to the dataframes purely by position, so a missing
# or extra anomaly file would either crash later or label the wrong experiment.
if len(anomalies_list) > 0 and len(anomalies_list) != len(dataframe_list):
    raise ValueError(
        f'Found {len(anomalies_list)} anomaly lists for {len(dataframe_list)} data files; '
        'they are paired by position, so the counts must match.'
    )

for i, df in enumerate(dataframe_list):
    # Get the pin sequences from the two registers
    from_pins = _pin_sequence(df['output_int_register_26'])
    to_pins = _pin_sequence(df['output_int_register_25'])

    # Check if there's at least one completed correct motion
    if len(to_pins) < 2 or len(from_pins) < 2:
        print(f'Skipping dataframe {i}: fewer than two from/to pins found')
        continue

    # Delete the last incomplete point, so that we start from home position.
    # This removes every row carrying the final to-pin or the final from-pin.
    # .copy() gives the labelling step an independent frame to write into.
    keep = (df['output_int_register_25'] != to_pins[-1]) & (df['output_int_register_26'] != from_pins[-1])
    df = df[keep].copy()

    # NOTE(review): to_pins / from_pins still contain that final pin, exactly as
    # before (the old `[:]` slices copied the full lists) - confirm whether the
    # last entry should be dropped from the sequences as well.

    send_anomalies = anomalies_list[i] if len(anomalies_list) > 0 else []

    labeled_df = label_anomalies(data=df, to_anomalies=send_anomalies, from_pins=from_pins, to_pins=to_pins,
                                 anomaly_label=anomaly_label, label_loosen=label_loosen, debug=False)
    labeled_list.append(labeled_df)

In [40]:
# Join the dataframes
if len(labeled_list) == 0:
    raise ValueError('No labeled dataframes to join - check selected_numbers and the input files.')

# concat also handles a single frame: it returns a fresh frame with a
# continuous index, so the single-file case behaves like the multi-file case
# and the column added below never writes into the frame kept in labeled_list.
labeled_data = pd.concat(labeled_list, ignore_index=True)

# Number the samples: the counter increases by one every time
# output_int_register_25 changes value from one row to the next (first sample is 1)
to_register = labeled_data['output_int_register_25']
labeled_data['sample_nr'] = (to_register != to_register.shift(1)).astype(int).cumsum()

In [1]:
%matplotlib qt

plt.plot(labeled_data['label'] * 10)
plt.plot(labeled_data['sample_nr'])
plt.plot(labeled_data['output_int_register_25'])
# plt.plot(labeled_data['output_int_register_26'])
plt.plot(labeled_data['output_double_register_25'] * 10)

plt.show()

NameError: name 'plt' is not defined

In [41]:
# Tidy the data
# NOTE: this cell reassigns labeled_data, so running it twice removes one more
# "last sample" each time - re-run the join cell first if you need to repeat it.


def _relabel_pins(frame, to_pins, label, from_sample_offset=0):
    """Set ``label`` on the rows of the given to-pins and on their from-pin rows.

    frame              : DataFrame with 'output_int_register_25',
                         'output_int_register_26', 'sample_nr' and 'label'
                         columns. It is modified in place.
    to_pins            : values of 'output_int_register_25' to relabel.
                         Nothing happens when this is empty.
    label              : value written into the 'label' column.
    from_sample_offset : added to the sample numbers carrying ``label`` before
                         the from-pins are looked up (0 = the same samples,
                         -1 = the samples just before them).
    """
    if len(to_pins) == 0:
        return frame

    frame.loc[frame['output_int_register_25'].isin(to_pins), 'label'] = label

    # Sample numbers that carry this label, shifted by the requested offset
    samples = np.unique(frame.loc[frame['label'] == label, 'sample_nr']) + from_sample_offset

    # From-pins seen in those samples. 0 is excluded by filtering, so a missing
    # 0 no longer raises ValueError the way list.remove(0) did.
    from_values = np.unique(frame.loc[frame['sample_nr'].isin(samples), 'output_int_register_26'])
    from_values = [pin for pin in from_values if pin != 0]

    # Label those from_pins
    frame.loc[frame['output_int_register_26'].isin(from_values), 'label'] = label
    return frame


# Delete the label_plot column
if 'label_plot' in labeled_data.columns:
    labeled_data = labeled_data.drop(['label_plot'], axis=1)

# Delete the last sample, as it's just home position
# (.copy() so the label edits below write into an independent frame, not a slice)
labeled_data = labeled_data[labeled_data['sample_nr'] != labeled_data['sample_nr'].max()].copy()

# Delete wrong samples
wrong_to_samples = []
if len(wrong_to_samples) > 0:
    # wrong_from_samples = [x - 1 for x in wrong_to_samples]
    labeled_data = labeled_data[~labeled_data['sample_nr'].isin(wrong_to_samples)]
    labeled_data = labeled_data.reset_index(drop=True)

    # Renumber the remaining samples. Assigning the column replaces the old
    # numbering, so no separate drop is needed.
    labeled_data['sample_nr'] = (labeled_data['output_int_register_25'] != labeled_data['output_int_register_25'].shift(1)).astype(int).cumsum()

# Manually change specific labels
# NOTE(review): only the damaged-thread-hole case looks up its from-pins in the
# preceding sample (offset -1); the other two use the labelled samples
# themselves. This mirrors the original code - confirm the difference is intended.

# Extra assembly anomalies
assembly_anomalies = []
_relabel_pins(labeled_data, assembly_anomalies, extra_assembly_label)

# Missing screw anomalies
missing_anomalies = []
_relabel_pins(labeled_data, missing_anomalies, missing_screw_label)

# Damaged thread hole anomalies
damaged_thread_hole_anomalies = []
_relabel_pins(labeled_data, damaged_thread_hole_anomalies, damaged_thread_hole_label, from_sample_offset=-1)

In [13]:
# Sample length statistics: number of non-null 'label' entries per sample_nr
df_grouped = labeled_data.groupby(['sample_nr']).count()
sample_lengths = df_grouped['label']

mean_template = 'Mean sample length is {mean_value}.'
longest_template = 'Longest sample is sample id {s_id} of length {s_len}'
shortest_template = 'Shortest sample is sample id {s_id} of length {s_len}'

print(mean_template.format(mean_value=sample_lengths.mean()))

longest_len = sample_lengths.max()
longest_id = sample_lengths.idxmax()
print(longest_template.format(s_len=longest_len, s_id=longest_id))

shortest_len = sample_lengths.min()
shortest_id = sample_lengths.idxmin()
print(shortest_template.format(s_len=shortest_len, s_id=shortest_id))

Mean sample length is 1480.4.
Longest sample is sample id 28 of length 2313
Shortest sample is sample id 21 of length 639


In [43]:
def get_save_filename(folder):
    """Build the next numbered, timestamped CSV file name for ``folder``.

    The name has the form ``<n>_<YYYY-MM-DD>_<HH-MM-SS>.csv``. ``n`` is one
    more than the largest integer prefix (the text before the first ``_``)
    among the regular files already in ``folder``, or 1 if there is none.
    ``folder`` is created when it does not exist yet.

    Parameters
    ----------
    folder : str or path-like
        Directory the file is going to be saved in.

    Returns
    -------
    str
        The bare file name, without the directory.
    """
    # A single clock read for both parts, so the date and the time cannot end
    # up on different sides of midnight.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(folder, exist_ok=True)

    greatest = 0
    for f in os.listdir(folder):
        if not os.path.isfile(os.path.join(folder, f)):
            continue
        try:
            number = int(f.split('_')[0])
        except ValueError:
            # Files without a numeric prefix (hidden files, notes, ...) do not
            # take part in the numbering and must not abort the save.
            continue
        greatest = max(greatest, number)

    return str(greatest + 1) + "_" + timestamp + ".csv"

In [None]:
# Write the stitched, labeled data to a new numbered CSV inside save_path
target_name = get_save_filename(save_path)
dir_filename = save_path / target_name
labeled_data.to_csv(dir_filename, sep=",", index=False)


In [44]:
# Write the stitched, labeled data to a new numbered CSV inside save_path
# NOTE: the notebook has an identical save cell directly before this one;
# running both writes the same data to two consecutively numbered files.
target_name = get_save_filename(save_path)
dir_filename = save_path / target_name
labeled_data.to_csv(dir_filename, sep=",", index=False)
