## A regression model using the Keras library to model data about concrete compressive strength
### by Ahmad Salih

### About the Data

<strong>The dataset is about the compressive strength of different samples of concrete based on the volumes of the different ingredients that were used to make them. Ingredients include:</strong>

<strong>1. Cement</strong>

<strong>2. Blast Furnace Slag</strong>

<strong>3. Fly Ash</strong>

<strong>4. Water</strong>

<strong>5. Superplasticizer</strong>

<strong>6. Coarse Aggregate</strong>

<strong>7. Fine Aggregate</strong>

In [4]:
!conda install pandas

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [6]:
import numpy as np
import pandas as pd

In [7]:
## data exploration

data = pd.read_csv('concrete_data.csv')
print(data.shape)
data.head()

(1030, 9)


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [8]:
data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [17]:
features = data[data.columns[data.columns != 'Strength']]
targets = data['Strength']

features.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0


### Part A: Build a baseline model

In [9]:
import keras

Using TensorFlow backend.


In [12]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

In [28]:
from sklearn.metrics import mean_squared_error

In [20]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=1)

X_train.shape, y_train.shape

((721, 8), (721,))

In [24]:
def baseline_model(input_shape):
    
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=input_shape))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [25]:
base_model = baseline_model(input_shape=(X_train.shape[1],))

base_model.fit(X_train, y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7fa1a3b1a828>

In [27]:
yhat = base_model.predict(X_test)

In [31]:
# print(yhat[:5])
# print(y_test[:5])

print('mean squared error : {}'.format(mean_squared_error(y_test, yhat)))

mean squared error : 419.4632861383443


In [49]:
ms_errors = np.zeros((50,)) # mean_squared_errors list

for i in range(len(ms_errors)):
    
    X_train,X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3) # random split in each loop
    
    base_model = baseline_model(input_shape=(X_train.shape[1],))
    base_model.fit(X_train, y_train, epochs=50, verbose=0)
    
    predictions = base_model.predict(X_test)
    
    ms_errors[i] = mean_squared_error(y_test, predictions)

In [50]:
print('The mean of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.mean(ms_errors), decimals=3)))
print('=============================================================>')
print('The standard deviation of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.std(ms_errors), decimals=3)))

The mean of the predicted values from the models after 50 iterations with random training and test sets : 368.552
The standard deviation of the predicted values from the models after 50 iterations with random training and test sets : 374.345


### Part B: Normalize the data

In [52]:
features_norm = (features - features.mean()) / features.std()
features_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [69]:
def get_n_mse(n, epochs, features, targets):
    
    """Get n mean squared errors
    
    Return: array(length=n) of mean squared errors after n iterations
    """
    
    ms_errors = np.zeros((n,)) # mean_squared_errors list

    for i in range(len(ms_errors)):

        X_train,X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3) # random split in each loop

        base_model = baseline_model(input_shape=(X_train.shape[1],)) # predefined model function
        base_model.fit(X_train, y_train, epochs=epochs, verbose=0)

        predictions = base_model.predict(X_test)

        ms_errors[i] = mean_squared_error(y_test, predictions) # get mse and add to list of ms_errors
    
    return ms_errors;

In [71]:
ms_errors_1 = get_n_mse(n=50, epochs=50, features=features_norm, targets=targets)

In [72]:
print('The mean of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.mean(ms_errors_1), decimals=3)))
print('=============================================================>')
print('The standard deviation of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.std(ms_errors_1), decimals=3)))

The mean of the predicted values from the models after 50 iterations with random training and test sets : 361.608
The standard deviation of the predicted values from the models after 50 iterations with random training and test sets : 88.518


### Part C: repeat part B, but increase epochs to 100

In [63]:
ms_errors_2 = get_n_mse(n=50, epochs=100, features=features_norm, targets=targets)

In [64]:
print('The mean of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.mean(ms_errors_2), decimals=3)))
print('=============================================================>')
print('The standard deviation of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.std(ms_errors_2), decimals=3)))

The mean of the predicted values from the models after 50 iterations with random training and test sets : 161.831
The standard deviation of the predicted values from the models after 50 iterations with random training and test sets : 16.05


### Part D: increase hidden layers to 3 each with 10 nodes

In [68]:
new_model = Sequential()

new_model.add(Dense(10, activation='relu', input_shape=(features.shape[1],)))
new_model.add(Dense(10, activation='relu'))
new_model.add(Dense(10, activation='relu'))
new_model.add(Dense(1))
    
new_model.compile(optimizer='adam', loss='mean_squared_error')

In [75]:
ms_errors_3 = np.zeros((50,)) # mean_squared_errors list

for i in range(len(ms_errors_3)):

    X_train,X_test, y_train, y_test = train_test_split(features_norm, targets, test_size=0.3) # random split in each loop

    base_model = new_model # predefined model function
    base_model.fit(X_train, y_train, epochs=50, verbose=0)

    predictions = base_model.predict(X_test)

    ms_errors_3[i] = mean_squared_error(y_test, predictions) # get mse and add to list of ms_errors

In [77]:
print('The mean of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.mean(ms_errors_3), decimals=3)))
print('=============================================================>')
print('The standard deviation of the predicted values from the models after 50 iterations with random \
training and test sets : {}'.format(np.around(np.std(ms_errors_3), decimals=3)))

The mean of the predicted values from the models after 50 iterations with random training and test sets : 15.507
The standard deviation of the predicted values from the models after 50 iterations with random training and test sets : 1.707
