# This notebook creates multiple datasets for Training a Keras AutoEncoder
---



This notebook will create a .csv for each scenario:
* Exactly 1 of the 6 motors drives a train (train 1, 2, or 3):
  * train1_running_on_drive1.csv
  * train1_running_on_drive5.csv
  * train1_running_on_drive6.csv
  * train2_running_on_drive4.csv
  * train2_running_on_drive5.csv
  * train2_running_on_drive6.csv
  * train3_running_on_drive3.csv
  * train3_running_on_drive4.csv
  * train3_running_on_drive5.csv
* Exactly 2 of the 6 motors drive a train (train 1, 2, or 3):
  * train1_running_on_drive1_and_drive6.csv
  * train1_running_on_drive5_and_drive6.csv
  * train2_running_on_drive4_and_drive5.csv
  * train2_running_on_drive5_and_drive6.csv
  * train3_running_on_drive2_and_drive3.csv
  * train3_running_on_drive3_and_drive4.csv
  * train3_running_on_drive4_and_drive5.csv

The 16 files are intended to be used as input-data for training the Keras AutoEncoder.

This notebook becomes more useful as more data becomes available.

## Preamble

In [2]:
# Mount Google Drive under /content/drive/ so the project files are reachable
# from this Colab session; force_remount=True is passed to request a fresh mount.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib import animation
from collections import Counter
from datetime import datetime, timezone

In [4]:
# Project folder on the mounted Google Drive, resolved relative to the
# current working directory.
project_root_dir = Path.cwd().joinpath(
    'drive', 'My Drive', 'applied-data-hackathon', 'predictive-maintenance'
)

In [5]:
def getNowTime():
    """Return the current UTC time as whole milliseconds since the Unix epoch."""
    seconds_since_epoch = datetime.now(tz=timezone.utc).timestamp()
    return int(seconds_since_epoch * 1000)


# Timestamp (ms) captured once when this cell is executed.
nowtime = getNowTime()

## Load data

Note: if more than one motor is assigned to a train, at least two motors must be running on it. In other words, `n_assigned > 1` together with `n_running < 2` is an anomaly.

In [6]:
# Load the combined recording; the first CSV column becomes the DataFrame index.
total_data_path = project_root_dir / "total_data.csv"
df = pd.read_csv(total_data_path, index_col=[0])

In [7]:
# Average spacing between consecutive samples: the total span of the "key"
# timestamps (last minus first) divided by the number of intervals.
key_series = df["key"]
key_span = key_series.iloc[-1] - key_series.iloc[0]
time_per_step = key_span / (len(key_series) - 1)
# Assuming "key" is in microseconds (unconfirmed): ~5000 us per step over
# ~100000 steps is 500000000 us, i.e. a signal of roughly 8.3 minutes.
# Dividing by 60 * 1e6 expresses the per-step duration in minutes.
time_min = time_per_step/(60*1e6)
#df["time(m)"] = df.apply(lambda x: x.name*time_min, axis=1)

# Compute Bollinger bands

In [9]:
# Rolling mean / standard deviation and Bollinger bands for every torque signal.
#
# The derived columns created below also contain "torque" in their names, so
# they are excluded explicitly. Without this filter, re-running the cell would
# compute statistics of statistics ("..._sma_sma", "..._std_bollinger_upper", ...).
BOLLINGER_WINDOW = 50  # number of samples in each rolling window
BOLLINGER_WIDTH = 2    # band half-width, in rolling standard deviations
derived_suffixes = ("_sma", "_std", "_bollinger_upper", "_bollinger_lower")
torque_columns = [
  name for name in df.columns
  if "torque" in name and not name.endswith(derived_suffixes)
]
for k in torque_columns:
  rolling_torque = df[k].rolling(BOLLINGER_WINDOW)  # built once, reused for mean and std
  df[f"{k}_sma"] = rolling_torque.mean()
  df[f"{k}_std"] = rolling_torque.std()
  df[f"{k}_bollinger_upper"] = df[f"{k}_sma"] + df[f"{k}_std"] * BOLLINGER_WIDTH   # calculate upper band
  df[f"{k}_bollinger_lower"] = df[f"{k}_sma"] - df[f"{k}_std"] * BOLLINGER_WIDTH   # calculate lower band


# Dormant state

In [10]:
# Derive a 0/1 "running" flag per drive from its velocity signal: a sample
# counts as running when the instantaneous velocity exceeds 0.03194.
# (Earlier experiments, kept for reference: a rolling(50).mean() > 10 test
# and a lower threshold of 0.00194.)
velocity_columns = [name for name in df.columns if "velocity" in name]
for velocity_column in velocity_columns:
  running_column = velocity_column.replace("velocity", "running")
  exceeds_threshold = df[velocity_column] > 0.03194
  df[running_column] = exceeds_threshold.astype(int)

In [11]:
# Gather the six per-drive running flags of each row into one list-valued column.
columns = [f"drive{drive_number}_running" for drive_number in range(1, 7)]
running_matrix = df[columns].values
df["running"] = running_matrix.tolist()

In [12]:
# Gather the six per-drive gear positions of each row into one list-valued column.
columns = [f"drive{drive_number}_gear" for drive_number in range(1, 7)]
gear_matrix = df[columns].values
df["drive_gears"] = gear_matrix.tolist()

In [13]:
# For every row, tally how many drives are assigned to each gear value
# (i.e. how many motors are assigned to a particular train).
def _count_gear_assignments(gears):
  """Return a plain dict mapping each gear value in `gears` to its occurrence count."""
  return dict(Counter(gears))

df["n_assigned"] = df["drive_gears"].apply(_count_gear_assignments)

In [14]:
# Determine which drives are actually running on which train at this time step.
def running_on(row):
  """Group the running drives of one row by the gear they are set to.

  `row` must provide a "running" sequence (one truthy/falsy flag per drive,
  drive 1 first) and a "drive<N>_gear" entry for every running drive.

  Returns a dict whose keys are gear values and whose values are lists of the
  "drive<N>_gear" column names of the drives running on that gear, in drive
  order. Note that the lists hold column names, not counts.
  """
  drives_by_gear = {}
  for drive_number, is_running in enumerate(row["running"], start=1):
    if not is_running:
      continue
    gear_column = f"drive{drive_number}_gear"
    gear = row[gear_column]
    # setdefault creates the list on first sight of a gear, then we extend it.
    drives_by_gear.setdefault(gear, []).append(gear_column)

  return drives_by_gear

In [15]:

def running_on(row):
  """Map each gear value to the "drive<N>_gear" columns of the drives running on it.

  `row` must provide a "running" sequence of per-drive flags (drive 1 first)
  and a "drive<N>_gear" entry for each running drive. Drives whose flag is
  falsy are skipped. The result is a dict of lists of column names.
  """
  # NOTE(review): this cell re-defines running_on exactly as the previous cell
  # does; one of the two definitions can be deleted.
  drives_per_gear = {}
  drive_number = 0
  for flag in row["running"]:
    drive_number += 1
    if flag:
      gear_column = f"drive{drive_number}_gear"
      gear = row[gear_column]
      if gear not in drives_per_gear:
        drives_per_gear[gear] = []
      drives_per_gear[gear].append(gear_column)

  return drives_per_gear

In [16]:
# Evaluate running_on once per row (axis=1 hands each row to the function).
df["running_on"] = df.apply(running_on, axis=1)

In [17]:
def _elapsed_since_startup(timestamps, running_flags):
    """Accumulate, per sample, the time elapsed since the drive last started.

    Parameters
    ----------
    timestamps : sequence of numbers
        Values of the "key" column, in row order.
    running_flags : sequence of 0/1 (or bool)
        Running flag of one drive, in row order.

    Returns
    -------
    numpy.ndarray of float
        Same length as `timestamps`. The first sample is always 0. A sample
        adds the step duration to the previous value when the drive is running
        or was running on the previous evaluated sample; otherwise it is 0.
        As in the original loop, the first idle sample after a run therefore
        still accumulates one step before the counter resets.
    """
    elapsed = np.zeros(len(timestamps), dtype=float)
    was_running = False
    for position in range(1, len(timestamps)):
        is_running = bool(running_flags[position])
        if is_running or was_running:
            was_running = is_running
            step = timestamps[position] - timestamps[position - 1]
            elapsed[position] = elapsed[position - 1] + step
    return elapsed


# Work on positional numpy arrays instead of df.at[label, ...]: the frame's
# index comes from the CSV (index_col=[0]) and is not guaranteed to be 0..n-1,
# and per-cell label lookups are far slower than array access.
startup_timestamps = df["key"].to_numpy()
for drive in range(1, 7):
    column_name = f"drive{drive}_time_since_motor_startup"
    drive_running = df[f"drive{drive}_running"].to_numpy()
    df[column_name] = _elapsed_since_startup(startup_timestamps, drive_running)

In [18]:
# Duration after which a motor is considered past its startup phase; the
# elapsed-time feature is scaled by this value and capped at 1.
# A value of 0 disables the startup-time feature entirely.
startup_phase_treshold = 515053 #ms  -- NOTE(review): confirm this matches the unit of "key"


def _model_columns(drives, include_startup_time):
    """Return the feature columns exported for a model driven by `drives`.

    `drives` is a list of "drive<N>_gear" column names. The result always
    starts with "key", followed per drive by its torque, torque_sma and
    torque_std columns and, when `include_startup_time` is true, its
    time_since_motor_startup column.
    """
    columns = ["key"]
    for drive in drives:
        columns.append(drive.replace("_gear","_torque"))
        columns.append(drive.replace("_gear","_torque_sma"))
        columns.append(drive.replace("_gear","_torque_std"))
        if include_startup_time:
            columns.append(drive.replace("_gear","_time_since_motor_startup"))
    return columns


# Split the recording into one dataset per (train, set of running drives).
# Every row is visited exactly once. The previous outer `for train in range(1,4)`
# loop was shadowed by the inner loop variable and only caused each row to be
# appended three times.
# Rows are collected in plain lists and turned into DataFrames once at the end,
# because DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
model_columns = {}
model_records = {}
for _, row in df.iterrows():
    for train, running_drives in row["running_on"].items():
        if not running_drives:
            continue
        drives = sorted(running_drives)  # sorted copy; leaves the list stored in df untouched
        modelname = "train"+str(int(train))+"_running_on_"+'_and_'.join(drives).replace("_gear","")

        if modelname not in model_records:
            print("init: " + modelname)
            model_columns[modelname] = _model_columns(drives, startup_phase_treshold != 0)
            model_records[modelname] = []

        new_row = {}
        for column in model_columns[modelname]:
            if "_time_since_motor_startup" in column:
                # Normalise to [0, 1]: fraction of the startup phase completed.
                new_row[column] = min(row[column]/startup_phase_treshold,1)
            else:
                new_row[column] = row[column]
        model_records[modelname].append(new_row)

# Model name -> DataFrame of its training rows, columns in export order.
input_data = {
    modelname: pd.DataFrame(records, columns=model_columns[modelname])
    for modelname, records in model_records.items()
}
                    

init: train3_running_on_drive2_and_drive3
init: train2_running_on_drive4
init: train1_running_on_drive5
init: train1_running_on_drive5_and_drive6
init: train3_running_on_drive3
init: train1_running_on_drive6
init: train2_running_on_drive4_and_drive5
init: train2_running_on_drive5
init: train3_running_on_drive3_and_drive4
init: train1_running_on_drive1_and_drive6
init: train3_running_on_drive4
init: train1_running_on_drive1
init: train2_running_on_drive5_and_drive6
init: train2_running_on_drive6
init: train3_running_on_drive4_and_drive5
init: train3_running_on_drive5


In [19]:
# Write one CSV per model into <project_root_dir>/cleaned_data3.
# The location is derived from project_root_dir (the same folder the input was
# read from) instead of repeating a hard-coded absolute path, and the folder is
# created if it does not exist yet.
cleaned_data_dir = project_root_dir / 'cleaned_data3'
cleaned_data_dir.mkdir(parents=True, exist_ok=True)
for filename in input_data:
    # `index` is a boolean switch; the previous index="key" only worked because
    # a non-empty string is truthy. index=True keeps the written file identical.
    # NOTE(review): if the row index is not wanted in the output, use index=False.
    input_data[filename].to_csv(cleaned_data_dir / f'{filename}.csv', index=True)

In [25]:

# Persist the enriched frame (including all derived columns) next to the input
# data, using project_root_dir rather than a second hard-coded absolute path.
# `index` is a boolean switch; the previous index="key" was only truthy, so
# index=True writes the same file.
df.to_csv(project_root_dir / 'total_data3.csv', index=True)