# Data Preprocessing - Noise Reduction and train-test Split


## Introduction

This code is part of Fuel leak detection and location.

## Imports and Global Definitions

In [1]:
import pandas as pd
import numpy as np
from scipy.signal import butter, lfilter, freqz
from sklearn.model_selection import train_test_split

In [2]:
fs = 50.0       # sample rate, Hz
T = 50.0         # seconds
n = int(T * fs) # total number of samples (2500)

In [3]:
label_colors = ["#F58C41", "#2FAFC6", "#800080", "#AF1946", 
                "#46A5E1", "#522A64", "#A3DB05", "#FC6514"]

In [4]:
def select_columns(data_frame, column_names):
    new_frame = data_frame.loc[:, column_names]
    return new_frame

## Data Loading

In [5]:
#read data file
df= pd.read_csv('PT_Data.csv')
df.head()

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,57,ASK3746,2017-01-07-04:01:22.140,0,1655,2833,2833,2833,2833,2832,...,1915.0,1915.0,1915.0,1915.0,1913.0,1913.0,1913.0,1914.0,1914.0,1914.0
1,2146,ASH3341,2019-07-12-08:17:59.370,0,1842,1504,1505,1506,1505,1504,...,1506.0,1505.0,1505.0,1505.0,1505.0,1504.0,1505.0,1506.0,1507.0,1507.0
2,2049,ASH3341,2019-06-04-13:24:48.530,0,1409,407,407,407,407,407,...,397.0,396.0,396.0,396.0,396.0,396.0,396.0,396.0,396.0,396.0
3,1478,HFA3286,2017-05-10-19:17:10.980,1,1754,2187,2186,2186,2186,2186,...,2181.0,2182.0,2181.0,2181.0,2182.0,2181.0,2180.0,2181.0,2181.0,2181.0
4,2593,GVT3040,2019-11-07-13:34:07.986,0,1844,1433,1433,1433,1433,1433,...,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0


In [6]:
df.shape

(2800, 2505)

In [7]:
neg, pos = np.bincount(df['Label'])
total = neg + pos
print('Total: {}\nPositive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total))

Total: 2800
Positive: 437 (15.61% of total)



## Noise reduction

In [8]:
def butter_lowpass(cutoff, fs, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=5):
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [9]:
# Filter requirements.
selected_columns = ['File','PT','StartTime','Label','LeakTime']
filter_val = df.drop(['File','PT','StartTime','Label','LeakTime'], axis = 1)
order = 1
cutoff = 0.1  # desired cutoff frequency of the filter, Hz
# Get the filter coefficients so we can check its frequency response.
b, a = butter_lowpass(cutoff, fs, order)  
    
for index, row in filter_val.iterrows():
    data = row
    y = butter_lowpass_filter(data, cutoff, fs, order)
    filter_val.iloc[index] = np.array(y)
filter_df = pd.concat([select_columns(df, selected_columns), filter_val], axis=1)
filter_df

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,57,ASK3746,2017-01-07-04:01:22.140,0,1655,17.689351,52.847148,87.565892,121.851065,155.701839,...,1937.072736,1936.797090,1936.524886,1936.256082,1935.978146,1935.691194,1935.407824,1935.134238,1934.870312,1934.609682
1,2146,ASH3341,2019-07-12-08:17:59.370,0,1842,9.391029,28.062054,46.512403,64.732342,82.712262,...,1506.465788,1506.453727,1506.435573,1506.417646,1506.399942,1506.376215,1506.352785,1506.342135,1506.344107,1506.352298
2,2049,ASH3341,2019-06-04-13:24:48.530,0,1409,2.541322,7.592231,12.580063,17.505607,22.369640,...,396.427527,396.428432,396.423082,396.417799,396.412581,396.407429,396.402341,396.397316,396.392355,396.387455
3,1478,HFA3286,2017-05-10-19:17:10.980,1,1754,13.655705,40.790337,67.579865,94.034844,120.159452,...,2181.411879,2181.412980,2181.414066,2181.408896,2181.410033,2181.411157,2181.399778,2181.388542,2181.383690,2181.378898
4,2593,GVT3040,2019-11-07-13:34:07.986,0,1844,8.947702,26.731367,44.292948,61.635219,78.760919,...,1423.537976,1423.506282,1423.474983,1423.444076,1423.413554,1423.383413,1423.353649,1423.324256,1423.295231,1423.266568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,2603,GVT3040,2019-11-07-16:57:05.562,0,800,0.018732,0.055962,0.092728,0.129034,0.164887,...,28.707227,28.729616,28.757968,28.779723,28.801206,28.828665,28.855780,28.882557,28.909000,28.935113
2796,1957,ASK3011,2019-04-16-13:12:50.880,0,483,12.800272,38.240965,63.370197,88.185613,112.684889,...,2037.289417,2037.360731,2037.431155,2037.500699,2037.575619,2037.643359,2037.697765,2037.757735,2037.823201,2037.887849
2797,1431,HDR3039,2017-05-10-03:03:49.770,0,1735,8.910238,26.619442,44.107493,61.377151,78.431145,...,1427.065791,1427.058725,1427.051748,1427.038613,1427.025643,1427.012835,1427.000186,1426.993940,1426.987772,1426.975436
2798,2092,ASH3341,2019-06-20-01:18:58.680,0,1326,3.078309,9.202730,15.263156,21.247899,27.157905,...,364.687750,364.635454,364.590054,364.545221,364.500948,364.450984,364.395400,364.340510,364.286305,364.232777


In [10]:
# Filter requirements.
from scipy.signal import savgol_filter
selected_columns = ['File','PT','StartTime','Label','LeakTime']
filter_val = df.drop(['File','PT','StartTime','Label','LeakTime'], axis = 1)

    
for index, row in filter_val.iterrows():
    data = row
    y = savgol_filter(data,49,1)
    filter_val.iloc[index] = np.array(y)
filter_df = pd.concat([select_columns(df, selected_columns), filter_val], axis=1)
filter_df

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,57,ASK3746,2017-01-07-04:01:22.140,0,1655,2833.802449,2833.808980,2833.815510,2833.822041,2833.828571,...,1915.294898,1915.058776,1914.822653,1914.586531,1914.350408,1914.114286,1913.878163,1913.642041,1913.405918,1913.169796
1,2146,ASH3341,2019-07-12-08:17:59.370,0,1842,1504.900408,1504.913061,1504.925714,1504.938367,1504.951020,...,1506.698469,1506.702857,1506.707245,1506.711633,1506.716020,1506.720408,1506.724796,1506.729184,1506.733571,1506.737959
2,2049,ASH3341,2019-06-04-13:24:48.530,0,1409,407.002449,406.992143,406.981837,406.971531,406.961224,...,396.305102,396.299592,396.294082,396.288571,396.283061,396.277551,396.272041,396.266531,396.261020,396.255510
3,1478,HFA3286,2017-05-10-19:17:10.980,1,1754,2186.031020,2186.023776,2186.016531,2186.009286,2186.002041,...,2181.022449,2181.021224,2181.020000,2181.018776,2181.017551,2181.016327,2181.015102,2181.013878,2181.012653,2181.011429
4,2593,GVT3040,2019-11-07-13:34:07.986,0,1844,1432.732245,1432.737449,1432.742653,1432.747857,1432.753061,...,1421.030102,1421.025306,1421.020510,1421.015714,1421.010918,1421.006122,1421.001327,1420.996531,1420.991735,1420.986939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,2603,GVT3040,2019-11-07-16:57:05.562,0,800,2.613878,2.648673,2.683469,2.718265,2.753061,...,30.758163,30.754286,30.750408,30.746531,30.742653,30.738776,30.734898,30.731020,30.727143,30.723265
2796,1957,ASK3011,2019-04-16-13:12:50.880,0,483,2050.105306,2050.195306,2050.285306,2050.375306,2050.465306,...,2042.812245,2042.814694,2042.817143,2042.819592,2042.822041,2042.824490,2042.826939,2042.829388,2042.831837,2042.834286
2797,1431,HDR3039,2017-05-10-03:03:49.770,0,1735,1426.920000,1426.921633,1426.923265,1426.924898,1426.926531,...,1426.159694,1426.160816,1426.161939,1426.163061,1426.164184,1426.165306,1426.166429,1426.167551,1426.168673,1426.169796
2798,2092,ASH3341,2019-06-20-01:18:58.680,0,1326,495.597551,495.608367,495.619184,495.630000,495.640816,...,360.665306,360.588571,360.511837,360.435102,360.358367,360.281633,360.204898,360.128163,360.051429,359.974694


## Split the data to train and test

In [11]:
train, test = train_test_split(filter_df, test_size=0.2,random_state=1)
print("Size of:")
print("* Training-set:\t\t{}".format(len(train)))
print("* Test-set:\t\t{}".format(len(test)))

Size of:
* Training-set:		2240
* Test-set:		560


In [12]:
# Save to file
test.to_csv('Test_Data.csv', encoding='utf-8', index=False)
train.to_csv('Train_Data.csv', encoding='utf-8', index=False)