# Data Preprocessing - Normalization


## Introduction

This code is part of the Fuel Leak Detection and Location Project.
It creates the normalized data files for the train data and for the test data.

## Imports and Global Definitions

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from io import StringIO
from IPython.display import clear_output
%matplotlib inline
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.model_selection import train_test_split


In [2]:
# Eight hex colour codes; presumably a per-label plotting palette
# (assumption from the name - TODO confirm against the plotting code).
label_colors = [
    "#F58C41",
    "#2FAFC6",
    "#800080",
    "#AF1946",
    "#46A5E1",
    "#522A64",
    "#A3DB05",
    "#FC6514",
]

In [3]:
def select_columns(data_frame, column_names):
    """Return the ``column_names`` columns of ``data_frame``, keeping every row.

    The selection is label-based (``.loc``); ``data_frame`` is only read.
    """
    return data_frame.loc[:, column_names]

## Data Loading

In [4]:
# Load the test split from CSV (the file name can be swapped for Train_Data)
# and preview its first rows.
test_df = pd.read_csv('Test_Data.csv')
test_df.head()

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,1602,ASK3746,2017-05-21-12:55:47.690,0,1865,995.191837,995.182143,995.172449,995.162755,995.153061,...,294.830102,294.451429,294.072755,293.694082,293.315408,292.936735,292.558061,292.179388,291.800714,291.422041
1,2790,ASH3041,2019-11-16-19:42:00.690,0,1877,1422.160816,1422.160918,1422.16102,1422.161122,1422.161224,...,1436.592347,1436.574694,1436.557041,1436.539388,1436.521735,1436.504082,1436.486429,1436.468776,1436.451122,1436.433469
2,1844,ASH3041,2018-11-19-16:08:16.460,0,2256,1464.364898,1464.362449,1464.36,1464.357551,1464.355102,...,1416.17449,1415.977959,1415.781429,1415.584898,1415.388367,1415.191837,1414.995306,1414.798776,1414.602245,1414.405714
3,2168,ASH3341,2019-07-22-18:20:50.510,0,270,788.8,788.944388,789.088776,789.233163,789.377551,...,778.605612,778.573878,778.542143,778.510408,778.478673,778.446939,778.415204,778.383469,778.351735,778.32
4,319,GLT3031,2017-01-27-17:24:03.400,0,672,1238.697143,1238.741224,1238.785306,1238.829388,1238.873469,...,1139.868367,1139.88,1139.891633,1139.903265,1139.914898,1139.926531,1139.938163,1139.949796,1139.961429,1139.973061


In [5]:
# Class balance of the test split.
# Assumes 'Label' holds only 0/1 so that bincount returns exactly two counts.
neg, pos = np.bincount(test_df['Label'])
total = pos + neg
print(
    'Total: {}\nPositive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total
    )
)

Total: 560
Positive: 86 (15.36% of total)



In [6]:
# Load the training split from CSV and preview its first rows.
train_df = pd.read_csv('Train_Data.csv')
train_df.head()

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,2559,HDR3039,2019-11-06-13:32:24.20,0,7,1118.682449,1118.52051,1118.358571,1118.196633,1118.034694,...,1092.985714,1093.014694,1093.043673,1093.072653,1093.101633,1093.130612,1093.159592,1093.188571,1093.217551,1093.246531
1,2714,GLT3023,2019-11-12-06:56:56.300,0,1168,1248.214694,1248.299286,1248.383878,1248.468469,1248.553061,...,1364.319388,1364.329796,1364.340204,1364.350612,1364.36102,1364.371429,1364.381837,1364.392245,1364.402653,1364.413061
2,766,HDR3039,2017-03-16-16:25:39.60,0,1206,842.676735,842.685102,842.693469,842.701837,842.710204,...,764.152041,764.148571,764.145102,764.141633,764.138163,764.134694,764.131224,764.127755,764.124286,764.120816
3,302,ESH3427,2017-01-26-11:53:42.530,0,1960,1413.068571,1413.203469,1413.338367,1413.473265,1413.608163,...,1483.441327,1483.440816,1483.440306,1483.439796,1483.439286,1483.438776,1483.438265,1483.437755,1483.437245,1483.436735
4,2024,ASK3746,2019-05-21-09:15:36.150,0,21,2427.726531,2427.537245,2427.347959,2427.158673,2426.969388,...,2456.765816,2456.750204,2456.734592,2456.71898,2456.703367,2456.687755,2456.672143,2456.656531,2456.640918,2456.625306


In [7]:
# Show the (rows, columns) dimensions of the training frame.
train_df.shape

(2240, 2505)

In [8]:
# Class balance of the training split.
# Assumes 'Label' holds only 0/1 so that bincount returns exactly two counts.
neg, pos = np.bincount(train_df['Label'])
total = pos + neg
print(
    'Total: {}\nPositive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total
    )
)

Total: 2240
Positive: 351 (15.67% of total)



## Normalize Data

In [9]:
# Normalize every signal (one row = one signal) to a 0-100 scale using that
# row's own minimum and maximum:  (value - row_min) / (row_max - row_min) * 100.
# The arithmetic is done on the whole frame at once instead of looping over
# rows and columns in Python, so the cell runs quickly and prints no progress
# counter.
signal_vals = train_df.drop(['File','PT','StartTime','Label','LeakTime'], axis = 1)
row_max = signal_vals.max(axis=1)
row_min = signal_vals.min(axis=1)
row_span = row_max - row_min

# axis=0 aligns the per-row statistics with the frame's row index.
# A flat signal has row_span == 0, so its values become NaN (0/0) here;
# NaN values are replaced with 0 when the final frame is assembled.
normal_val = signal_vals.sub(row_min, axis=0).div(row_span, axis=0).mul(100)

# Keep the per-row statistics as helper columns (same names and order as
# before); they are dropped again before the frame is written to disk.
normal_val = normal_val.assign(maxVal=row_max, minVal=row_min, h=row_span)

2491


In [10]:
selected_columns = ['File','PT','StartTime','Label','LeakTime']
# Put the metadata columns back in front of the normalized samples; concat with
# axis=1 aligns the two frames on their row index.
normal_df = pd.concat([select_columns(train_df, selected_columns), normal_val], axis=1)
# Replace infinite and NaN values with 0. `replace` returns a new frame unless
# inplace=True is given, so the result is assigned back to `normal_df`.
normal_df = normal_df.replace([np.inf, -np.inf], 0).replace(np.nan, 0)


In [11]:
# Remove the per-row helper statistics so only metadata and normalized samples
# remain, then preview the result.
normal_df = normal_df.drop(columns=['maxVal', 'minVal', 'h'])
normal_df.head()

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,2559,HDR3039,2019-11-06-13:32:24.20,0,7,78.986918,78.57169,78.156463,77.741235,77.326007,...,13.097855,13.172161,13.246468,13.320774,13.395081,13.469388,13.543694,13.618001,13.692308,13.766614
1,2714,GLT3023,2019-11-12-06:56:56.300,0,1168,0.0,0.0728,0.145599,0.218399,0.291198,...,99.919385,99.928342,99.937299,99.946257,99.955214,99.964171,99.973128,99.982086,99.991043,100.0
2,766,HDR3039,2017-03-16-16:25:39.60,0,1206,95.088042,95.09817,95.108298,95.118427,95.128555,...,0.037796,0.033596,0.029397,0.025197,0.020998,0.016798,0.012599,0.008399,0.0042,0.0
3,302,ESH3427,2017-01-26-11:53:42.530,0,1960,0.0,0.132741,0.265481,0.398222,0.530962,...,69.247275,69.246773,69.246271,69.245769,69.245267,69.244765,69.244263,69.243761,69.243259,69.242757
4,2024,ASK3746,2019-05-21-09:15:36.150,0,21,57.139665,56.880587,56.621508,56.36243,56.103352,...,96.886173,96.864804,96.843436,96.822067,96.800698,96.77933,96.757961,96.736592,96.715223,96.693855


In [12]:
# Write the normalized training frame to disk; index=False keeps the row index
# out of the file.
normal_df.to_csv(
    'Train_Data_Normal.csv',
    encoding='utf-8',
    index=False,
)

In [13]:
# Normalize every signal (one row = one signal) to a 0-100 scale using that
# row's own minimum and maximum:  (value - row_min) / (row_max - row_min) * 100.
# The arithmetic is done on the whole frame at once instead of looping over
# rows and columns in Python, so the cell runs quickly and prints no progress
# counter.
signal_vals = test_df.drop(['File','PT','StartTime','Label','LeakTime'], axis = 1)
row_max = signal_vals.max(axis=1)
row_min = signal_vals.min(axis=1)
row_span = row_max - row_min

# axis=0 aligns the per-row statistics with the frame's row index.
# A flat signal has row_span == 0, so its values become NaN (0/0) here;
# NaN values are replaced with 0 when the final frame is assembled.
normal_val = signal_vals.sub(row_min, axis=0).div(row_span, axis=0).mul(100)

# Keep the per-row statistics as helper columns (same names and order as
# before); they are dropped again before the frame is written to disk.
normal_val = normal_val.assign(maxVal=row_max, minVal=row_min, h=row_span)

2491


In [14]:
selected_columns = ['File','PT','StartTime','Label','LeakTime']
# Put the metadata columns back in front of the normalized samples; concat with
# axis=1 aligns the two frames on their row index.
normal_df = pd.concat([select_columns(test_df, selected_columns), normal_val], axis=1)
# Replace infinite and NaN values with 0. `replace` returns a new frame unless
# inplace=True is given, so the result is assigned back to `normal_df`.
normal_df = normal_df.replace([np.inf, -np.inf], 0).replace(np.nan, 0)


In [15]:
# Remove the per-row helper statistics so only metadata and normalized samples
# remain, then preview the result.
normal_df = normal_df.drop(columns=['maxVal', 'minVal', 'h'])
normal_df.head()

Unnamed: 0,File,PT,StartTime,Label,LeakTime,s0,s1,s2,s3,s4,...,s2490,s2491,s2492,s2493,s2494,s2495,s2496,s2497,s2498,s2499
0,1602,ASK3746,2017-05-21-12:55:47.690,0,1865,99.867942,99.866567,99.865191,99.863815,99.86244,...,0.483618,0.429883,0.376148,0.322412,0.268677,0.214942,0.161206,0.107471,0.053735,0.0
1,2790,ASH3041,2019-11-16-19:42:00.690,0,1877,22.353949,22.354186,22.354424,22.354662,22.3549,...,55.99548,55.954329,55.913178,55.872027,55.830875,55.789724,55.748573,55.707422,55.66627,55.625119
2,1844,ASH3041,2018-11-19-16:08:16.460,0,2256,98.744716,98.739875,98.735035,98.730195,98.725354,...,3.495999,3.107554,2.71911,2.330666,1.942221,1.553777,1.165333,0.776889,0.388444,0.0
3,2168,ASH3341,2019-07-22-18:20:50.510,0,270,49.399894,49.775624,50.151354,50.527084,50.902815,...,22.871747,22.789166,22.706585,22.624004,22.541423,22.458842,22.376261,22.29368,22.211099,22.128518
4,319,GLT3031,2017-01-27-17:24:03.400,0,672,92.930122,92.966757,93.003392,93.040027,93.076662,...,10.796303,10.80597,10.815638,10.825305,10.834973,10.84464,10.854308,10.863976,10.873643,10.883311


In [16]:
# Write the normalized test frame to disk; index=False keeps the row index out
# of the file.
normal_df.to_csv(
    'Test_Data_Normal.csv',
    encoding='utf-8',
    index=False,
)