## Data preprocessing
This is a code file for preprocessing the measured data. During the preprocessing, a sliding window technique is used to calculate the statistical features.

### Calling all needed libraries:

In [1]:
%pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import glob
import os
import re

### Driver code
- *start* = the starting index
- *end* = number of data points
- *slide* = number of data points which the sliding window moves each step
- *win* = size of the sliding window (number of data points in each window)

The number of **overlapping** data points between each window is (win - slide)

In [4]:
def getnums(s,e,i):
    """Return the evenly spaced indices s, s+i, s+2*i, ... that are below e.

    s is the first index, e the exclusive upper bound and i the step.
    The result is a NumPy array.
    """
    window_starts = np.arange(s, e, i)
    return window_starts

# Sliding-window settings shared by the cells below.
start = 0      # index of the first window
end = 50000    # exclusive upper bound for the window start indices
slide = 10     # distance between the starts of two consecutive windows
win = 100      # number of data points in one window

# Start index of every window. The last start is below `end`, so a window that
# begins fewer than `win` points before the end of a signal is shorter than `win`.
var = getnums(start, end, slide)
var

array([    0,    10,    20, ..., 49970, 49980, 49990])

### Calculating features
Function preprocess:
- *v1* = one column of the data (one signal)
- *cn* = column name
- Calculates 16 statistical features from each window:
    - minimum, maximum, mean, median, standard deviation, variance, skewness, kurtosis, first location of minimum, first location of maximum, last location of minimum, last location of maximum, sum of values, absolute energy, absolute sum of changes, mean absolute change
    - Features can be easily added or removed if needed

In [6]:
def preprocess(v1, cn, starts=None, window=None):
    """Compute 16 statistical features for every sliding window of one signal.

    Parameters
    ----------
    v1 : pandas.Series, numpy.ndarray or sequence
        One signal (one column of the data). Each window is taken as
        ``v1[start:start + window]``.
    cn : str
        Prefix that is prepended verbatim to every feature column name
        (for example ``'U '`` gives ``'U minimum'``).
    starts : iterable of int, optional
        Start index of each window. When omitted, the notebook-level
        variable ``var`` is used.
    window : int, optional
        Number of data points per window. When omitted, the notebook-level
        variable ``win`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per window start and one column per feature, in this order:
        minimum, maximum, mean, median, standard_deviation, variance,
        skewness, kurtosis, first/last location of minimum and maximum,
        sum_of_values, abs_energy, abs_sum_of_changes, mean_abs_change.

    Notes
    -----
    A window that runs past the end of ``v1`` simply contains fewer points.
    The four location features are expressed relative to the length of the
    window actually obtained and are NaN when that window is empty.
    """
    # Fall back to the notebook-level settings so that existing two-argument
    # calls keep producing exactly the same result.
    if starts is None:
        starts = var
    if window is None:
        window = win

    feature_names = (
        'minimum', 'maximum', 'mean', 'median', 'standard_deviation',
        'variance', 'skewness', 'kurtosis', 'first_location_of_minimum',
        'first_location_of_maximum', 'last_location_of_minimum',
        'last_location_of_maximum', 'sum_of_values', 'abs_energy',
        'abs_sum_of_changes', 'mean_abs_change',
    )
    # One list of values per feature; features can be added or removed by
    # editing feature_names and the matching entry of `row` below.
    columns = [[] for _ in feature_names]

    for begin in starts:
        # Slice once and reuse the same window for every feature.
        segment = v1[begin:window + begin]
        n_points = len(segment)

        # skew/kurtosis are pandas methods, so they need a Series.
        as_series = segment if isinstance(segment, pd.Series) else pd.Series(segment)
        # Reversing with [::-1] is done on a plain array.
        as_array = np.asarray(segment)
        # argmin/argmax/dot are applied to a Series or ndarray as given;
        # any other sequence type is converted to an array first.
        native = segment if isinstance(segment, (np.ndarray, pd.Series)) else as_array

        if n_points > 0:
            first_min = np.argmin(native) / n_points
            first_max = np.argmax(native) / n_points
            # Searching the reversed window finds the last occurrence.
            last_min = 1.0 - np.argmin(as_array[::-1]) / n_points
            last_max = 1.0 - np.argmax(as_array[::-1]) / n_points
        else:
            # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
            first_min = first_max = last_min = last_max = np.nan

        # Absolute differences between consecutive points of the window.
        abs_steps = np.abs(np.diff(segment))

        row = (
            np.min(segment),
            np.max(segment),
            np.mean(segment),
            np.median(segment),
            np.std(segment),
            np.var(segment),
            as_series.skew(),
            as_series.kurtosis(),
            first_min,
            first_max,
            last_min,
            last_max,
            np.sum(segment),
            np.dot(native, native),
            np.sum(abs_steps),
            np.mean(abs_steps),
        )
        for column, value in zip(columns, row):
            column.append(value)

    mat = pd.concat([pd.Series(values) for values in columns], axis=1)
    mat.columns = [cn + name for name in feature_names]
    return mat

### Calling the preprocess function for each column (v2-v9) with the column names
After the preprocessing, the feature columns are merged to one table again.

In [8]:
def full(df):
    """Compute the sliding-window features of every signal column of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the signal columns named in ``signals`` below.

    Returns
    -------
    pandas.DataFrame
        The feature tables returned by ``preprocess`` for each signal,
        placed side by side (one block of feature columns per signal).
    """
    # (column name in df, prefix for the feature column names).
    # Check for the correct names (modify according to your data).
    signals = [
        ('U', 'U '),
        ('V', 'V '),
        ('W', 'W '),
        ('MRES', 'MRES '),
        ('MREF', 'MREF '),
        ('PSP', 'PSP '),
        ('RSP', 'RSP '),
        ('AP', 'AP '),
    ]
    # Read the signals from the argument instead of notebook-level variables,
    # so the result always belongs to the table that was passed in.
    feature_tables = [preprocess(df[column], prefix) for column, prefix in signals]
    mat_1 = pd.concat(feature_tables, axis=1)
    return mat_1

### Read data from files
First, define a customized sorting function (for sorting the files).

In [10]:
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Return a sort key that orders digit runs in value by numeric size.

    The string is split on runs of digits; the digit runs become ints and the
    text between them stays as strings, e.g. 'file10.csv' -> ['file', 10, '.csv'].
    """
    # Because the pattern has a capturing group, the split result alternates
    # between text (even positions) and digit runs (odd positions).
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(numbers.split(value))
    ]

Then, define the path to the folder where the files are located. Define which files to read.

In [None]:
# Folder that holds the raw measurement files.
path='C:/Users/username/Classification/Raw_data' # replace this with your file location

# Collect every file ending in ".csv" and order the names so that embedded
# numbers are compared numerically (see numericalSort).
all_files = glob.glob(f"{path}/*.csv")
all_files.sort(key=numericalSort)
all_files

In [13]:
# Give every raw file the same column names and write it back in place.
# NOTE(review): read_csv treats the first line of each file as a header, and
# the nine names below assume nine columns per file - confirm for your data.
for raw_file in all_files:

    df = pd.read_csv(raw_file, delimiter=',')

    # Rename columns depending on the signals utilized
    df.columns = ['t', 'U', 'V', 'W', 'MRES', 'MREF', 'PSP','RSP', 'AP']

    # Overwrite the original file with the renamed table (no index column)
    df.to_csv(raw_file, index=False)

Define labels to save the preprocessed data. (modify according to your data)

In [15]:
# One label per input file; the save loop pairs them with the sorted file list
# in order, so the order here must match the order of all_files.
labels = [
    'F1', 'F2', 'F3',
    'H',
    'L1', 'L2', 'L3',
    'SP1', 'SP2', 'SP3',
    'ST1', 'ST2', 'ST3',
]

### Code to implement function full and write all csv files with statistical parameters

In [None]:
save_path = 'C:/Users/username/Classification/Preprocessed_data/' # replace this with your own save location

# Pair each raw file with its label. zip stops at the shorter of the two
# sequences, so files without a label (or labels without a file) are skipped.
for raw_file, label in zip(all_files, labels):

    df = pd.read_csv(raw_file, delimiter=',')

    # Check for the correct column names (modify according to your data)
    df.columns = ['t', 'U', 'V', 'W', 'MRES', 'MREF', 'PSP','RSP', 'AP']

    # Keep every column available as a notebook-level variable
    # (v1..v9, in column order)
    v1, v2, v3, v4, v5, v6, v7, v8, v9 = (df[name] for name in df.columns)

    # Replace the raw table by its feature table
    df = full(df)

    # Save the preprocessed data under its fault label, slide and window size
    df.to_csv(f"{save_path}{label}_win {slide} {win}.csv", index=False)
    print(f'The current file is {raw_file}')

Combine all the data to one file

In [19]:
# Find the preprocessed files written for the current slide and window size
# (file names end in "<slide> <win>.csv") and sort them numerically.
all_files = glob.glob(f"{save_path}*{slide} {win}.csv")
all_files.sort(key=numericalSort)

# Stack the tables of all files below each other and save them as one file.
all_pp = pd.concat([pd.read_csv(feature_file) for feature_file in all_files])
all_pp.to_csv(f"{save_path}DATA.csv", index=False, encoding='utf-8-sig')

Check to see that the combined data looks right.

In [23]:
# Load the combined file again and display it as a visual check.
# NOTE(review): the path below is relative to the current working directory,
# while the combined file is written to save_path + "DATA.csv" - confirm that
# both refer to the same file.
# Alternative file name, kept for reference:
# data = pd.read_csv('DATA_combined_win '+str(slide)+' '+str(win)+'.csv')
data = pd.read_csv('Preprocessed_data/DATA.csv')
data

Unnamed: 0,U minimum,U maximum,U mean,U median,U standard_deviation,U variance,U skewness,U kurtosis,U first_location_of_minimum,U first_location_of_maximum,...,AP skewness,AP kurtosis,AP first_location_of_minimum,AP first_location_of_maximum,AP last_location_of_minimum,AP last_location_of_maximum,AP sum_of_values,AP abs_energy,AP abs_sum_of_changes,AP mean_abs_change
0,-1.074003,1.401329,0.107976,0.076986,0.448750,0.201376,0.005352,-0.047177,0.050000,0.750000,...,-0.795464,2.167309,0.260000,0.17,0.27,0.180000,-2.709623e+02,9.189493e+02,141.532681,1.429623
1,-0.950874,1.401329,0.135367,0.097404,0.445064,0.198082,0.206885,-0.041342,0.560000,0.650000,...,-0.762604,2.639673,0.160000,0.07,0.17,0.080000,-2.725766e+02,9.113794e+02,133.285893,1.346322
2,-0.950874,1.401329,0.119473,0.076986,0.461215,0.212720,0.202568,-0.130773,0.460000,0.550000,...,-0.857631,2.992274,0.060000,0.03,0.07,0.040000,-2.815356e+02,9.474619e+02,128.855772,1.301573
3,-0.950874,1.401329,0.167456,0.182239,0.459395,0.211043,0.043600,-0.193077,0.360000,0.450000,...,-0.214387,0.242632,0.290000,0.03,0.30,0.040000,-2.756218e+02,8.846839e+02,117.700292,1.188892
4,-0.950874,1.401329,0.210118,0.238501,0.475631,0.226225,0.050289,-0.244698,0.260000,0.350000,...,-0.204867,0.087845,0.190000,0.90,0.20,0.910000,-2.798503e+02,9.081475e+02,117.883741,1.190745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64995,-6.029806,5.351209,-0.253639,-0.837959,3.871992,14.992320,0.046084,-1.500649,0.823529,0.078431,...,-0.053737,-1.166721,0.980392,0.00,1.00,0.019608,6.330183e+06,7.857104e+11,276.165875,5.523318
64996,-6.029806,4.356964,-1.513797,-2.252815,3.242694,10.515062,0.329254,-1.192621,0.780488,0.000000,...,-0.069384,-1.175247,0.975610,0.00,1.00,0.024390,5.087847e+06,6.313706e+11,222.336944,5.558424
64997,-6.029806,1.522244,-2.953042,-3.285644,2.235113,4.995732,0.440622,-1.077786,0.709677,0.000000,...,-0.071792,-1.176486,0.967742,0.00,1.00,0.032258,3.846054e+06,4.771657e+11,168.854680,5.628489
64998,-6.029806,-2.252815,-4.302275,-4.717388,1.158068,1.341122,0.351677,-1.205240,0.571429,0.000000,...,-0.079587,-1.278470,0.952381,0.00,1.00,0.047619,2.604801e+06,3.230948e+11,114.240417,5.712021
