# Importing libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from sklearn.model_selection import train_test_split

# Importing the dataset

In [4]:
data=pd.read_csv('gas_turbines.csv')
data.head() #retrieving the first 5 records

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,6.8594,1007.9,96.799,3.5,19.663,1059.2,550.0,114.7,10.605,3.1547,82.722
1,6.785,1008.4,97.118,3.4998,19.728,1059.3,550.0,114.72,10.598,3.2363,82.776
2,6.8977,1008.8,95.939,3.4824,19.779,1059.4,549.87,114.71,10.601,3.2012,82.468
3,7.0569,1009.2,95.249,3.4805,19.792,1059.6,549.99,114.72,10.606,3.1923,82.67
4,7.3978,1009.7,95.15,3.4976,19.765,1059.7,549.98,114.72,10.612,3.2484,82.311


# EDA

In [5]:
print('The dimension of the dataset is {}'.format(data.shape)) #no. of rows and columns in the dataset

The dimension of the dataset is (15039, 11)


## Checking for null values:

In [6]:
data.isnull().sum() #there are no null values

AT      0
AP      0
AH      0
AFDP    0
GTEP    0
TIT     0
TAT     0
TEY     0
CDP     0
CO      0
NOX     0
dtype: int64

In [7]:
data.info() #checking the data types of each attribute

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15039 entries, 0 to 15038
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      15039 non-null  float64
 1   AP      15039 non-null  float64
 2   AH      15039 non-null  float64
 3   AFDP    15039 non-null  float64
 4   GTEP    15039 non-null  float64
 5   TIT     15039 non-null  float64
 6   TAT     15039 non-null  float64
 7   TEY     15039 non-null  float64
 8   CDP     15039 non-null  float64
 9   CO      15039 non-null  float64
 10  NOX     15039 non-null  float64
dtypes: float64(11)
memory usage: 1.3 MB


In [8]:
data.describe() #all the statistics of te features

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
count,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0
mean,17.764381,1013.19924,79.124174,4.200294,25.419061,1083.79877,545.396183,134.188464,12.102353,1.972499,68.190934
std,7.574323,6.41076,13.793439,0.760197,4.173916,16.527806,7.866803,15.829717,1.103196,2.222206,10.470586
min,0.5223,985.85,30.344,2.0874,17.878,1000.8,512.45,100.17,9.9044,0.000388,27.765
25%,11.408,1008.9,69.75,3.7239,23.294,1079.6,542.17,127.985,11.622,0.858055,61.3035
50%,18.186,1012.8,82.266,4.1862,25.082,1088.7,549.89,133.78,12.025,1.3902,66.601
75%,23.8625,1016.9,90.0435,4.5509,27.184,1096.0,550.06,140.895,12.578,2.1604,73.9355
max,34.929,1034.2,100.2,7.6106,37.402,1100.8,550.61,174.61,15.081,44.103,119.89


###### I am using dtale library to obtain all the visualisation plots and statistics of the dataset

In [9]:
import dtale
dtale.show(data)



###### Additionally using sweetviz library to obtain the statistics.

In [10]:
import sweetviz as sv
report=sv.analyze(data)
report.show_html('sweet_report.html')

                                             |          | [  0%]   00:00 -> (? left)

Report sweet_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [11]:
X=data.drop('TEY',axis=1) #Getting all the independent features
y=data['TEY'] #Dependent feature

In [12]:
data.columns #Names of all the columns

Index(['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO',
       'NOX'],
      dtype='object')

# Feature Selection

##### The Artificial neural network assumes that the input features are independent. There is a possibility of  multicollinearity between the variables. Thus I will set a threshold and whichever has the correlation more than the threshold will be dropped.

In [13]:
plt.figure(figsize=(12,10))
corr=X.corr()
sns.heatmap(corr,annot=True,cmap=plt.cm.CMRmap_r)
plt.show()


Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.



In [14]:
# with the following function we can select highly correlated features
# it will remove the first feature that is correlated with anything other feature

def correlation(dataset,threshold):
    col_corr=set()
    corr_matrix=dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if(abs(corr_matrix.iloc[i,j]>threshold)):
                colname=corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

In [15]:
corr_features = correlation(X, 0.7) #threshold is 0.7
len(set(corr_features))

3

In [16]:
corr_features

{'CDP', 'GTEP', 'TIT'}

In [17]:
X.drop(corr_features,axis=1) #dropping the features which are highly correlated.

Unnamed: 0,AT,AP,AH,AFDP,TAT,CO,NOX
0,6.8594,1007.9,96.799,3.5000,550.00,3.1547,82.722
1,6.7850,1008.4,97.118,3.4998,550.00,3.2363,82.776
2,6.8977,1008.8,95.939,3.4824,549.87,3.2012,82.468
3,7.0569,1009.2,95.249,3.4805,549.99,3.1923,82.670
4,7.3978,1009.7,95.150,3.4976,549.98,3.2484,82.311
...,...,...,...,...,...,...,...
15034,9.0301,1005.6,98.460,3.5421,546.21,4.5186,79.559
15035,7.8879,1005.9,99.093,3.5059,543.22,4.8470,79.917
15036,7.2647,1006.3,99.496,3.4770,537.32,7.9632,90.912
15037,7.0060,1006.8,99.008,3.4486,541.24,6.2494,93.227


# Feature Scaling:

##### ### Performing Standardisation using sklearn MinmaxSclaer and fitting it to train and test dataset.Since the high magnitude features are dominant over the low magnitude ones, we perform standarisation of the features using the MinmaxSclaer. This will transform the values between 0 and 1.

In [18]:
from sklearn.preprocessing import MinMaxScaler
Scaler=MinMaxScaler()
X=Scaler.fit_transform(X)
y=y.to_numpy()
print(X)
print(y)

[[0.18418215 0.45604964 0.95131413 ... 0.1353398  0.07152212 0.59654817]
 [0.18201978 0.4663909  0.95588067 ... 0.13398756 0.07337235 0.59713433]
 [0.18529531 0.47466391 0.93900309 ... 0.13456709 0.07257648 0.59379104]
 ...
 [0.19596183 0.4229576  0.98992213 ... 0.11177221 0.18055195 0.68544912]
 [0.18844295 0.43329886 0.98293633 ... 0.12143106 0.14169257 0.71057802]
 [0.18617304 0.44157187 0.96182146 ... 0.1310899  0.11294597 0.70266486]]
[114.7  114.72 114.71 ... 110.19 110.74 111.58]


In [19]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=56) #splitting the train and test dataset with
#test size as 0.3 and random state as 56 for reproducibility.

# Building Model:

##### Building an Artificial neural network and performing hyperparameter tuning on the no. of layers and the neurons. Activation function is linear since this is a regression problem.

In [18]:
#defining the function build_model
def build_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(layers.Dense(units=hp.Int('units',
                                            min_value=32,
                                            max_value=512,
                                            step=32),
                               activation='relu'))
    model.add(layers.Dense(1, activation='linear'))
    model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='mean_absolute_error',
        metrics=['mean_absolute_error'])
    return model

In [22]:
tuner=RandomSearch(
   build_model,
   objective='val_mean_absolute_error',
   max_trials=5,
   executions_per_trial=3,
   directory='project',
   project_name='Gas Turbine_1')

In [23]:
tuner.search_space_summary()

In [24]:
tuner.search(x_train, y_train,
             epochs=5,
             validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


INFO:tensorflow:Oracle triggered exit


2021-09-03 16:58:54,094 - INFO     - Oracle triggered exit


In [25]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal no. of layers is {best_hps.get('num_layers')}The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


The hyperparameter search is complete. The optimal no. of layers is 15The optimal number of units in the first densely-connected
layer is 448 and the optimal learning rate for the optimizer
is 0.0001.



In [29]:
#Building the model with the best hyperparameters
hypermodel = tuner.hypermodel.build(best_hps)

In [32]:
hypermodel.fit(x_train, y_train, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1dbdd7fa340>

In [39]:
eval_result = hypermodel.evaluate(x_test, y_test)

test accuracy 97.53392505645752


In [40]:
print("test accuracy", (100-eval_result[0]))

test accuracy 97.53392505645752


##### We have got an accuracy of 97.533 which is great.