In [1]:
###PART C###

#Importing general libraries

import pandas as pd
import numpy as np

#Loading the concrete data from a downloaded file

concrete_data_raw = pd.read_csv('concrete_data.csv')
concrete_data_raw.head()
concrete_data_raw.shape

#Checking and cleanign the dataset

concrete_data_raw.describe()
concrete_data_raw.isnull().sum()

#Age is not one of the predictors as per the assignment instructions, so it is deleted from the dataset.

concrete_data = concrete_data_raw.drop('Age', axis = 1)

#Separating predictors and target

concrete_data_columns = concrete_data.columns 
predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # All columns except Strength
target = concrete_data['Strength'] # Strength column

#Normalizing data 
predictors_norm = (predictors - predictors.mean()) / predictors.std()
n_cols = predictors_norm.shape[1]
predictors_norm.head()


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569


In [2]:
#Definind the neural network/model

import keras
from keras.models import Sequential
from keras.layers import Dense

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation = 'relu', input_shape = (n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    return model

In [3]:
#Training the model, performing prediction and creating the list of MSEs for Part B

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#Creating an empty list to store MSEs
MSE_B = []

#Performing a loop 50 times to train, test and evaluate the model, and update the MSEs
for counter in range(50):
    
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3) #Splitting the dataset as per instructions.
    model_B = regression_model()
    model_B.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 50, verbose = 0) #Training the model as per instructions.
    
    y_pred = model_B.predict(X_test) #Using the model to predict the test set
    MSE_B.append(mean_squared_error(y_test, y_pred)) #Updating mean squared error list with the new value
    
    counter += 1 

print(X_train.shape, y_train.shape) #Confirming the successful data splitting operation
print(X_test.shape, y_test.shape, y_pred.shape) #Confirming the successful data splitting operation
print(np.shape(MSE_B)) #Confirming the number of MSEs collected 
print(MSE_B)

print("\nThe Mean of the mean squared errors is: %.4f" % np.mean(MSE_B)) #Outputting the mean of MSEs 
print("\nThe Standard Deviation of the mean squared errors is: %.4f" % np.std(MSE_B)) #Outputting the mean of MSEs

(721, 7) (721,)
(309, 7) (309,) (309, 1)
(50,)
[506.46075852485995, 272.97270257488896, 355.85952551740195, 385.99926656915943, 353.81382241985284, 430.59888387082754, 528.2912976073734, 338.4577425323449, 253.9609010667547, 331.3108952754798, 326.47534074402785, 281.54003336286473, 318.67727371333365, 413.20543467597076, 372.3232208877368, 461.9646321788312, 338.5053035259378, 372.8546001650727, 522.7204011698879, 545.144769874707, 350.47625784882547, 340.0693447497731, 309.22120084140926, 427.9436640080619, 296.391725031313, 258.7692401397005, 483.7697265172345, 349.79195385022183, 281.4836704518158, 394.1637457388579, 330.63580340565034, 352.92590997086864, 220.92835856561092, 574.7211790213328, 235.35991807045679, 373.9247497752037, 465.25368606538046, 270.70668293481475, 278.2784664458693, 586.6643008225777, 482.80496129516644, 426.0697201310687, 282.1484917217831, 252.58145677331527, 354.29815331499384, 323.79685019983657, 269.877573922818, 326.27603891092645, 394.6160876941615, 

In [4]:
#Training the model, performing prediction and creating the list of MSEs for Part C

#Creating an empty list to store MSEs
MSE_C = []

#Performing a loop 50 times to train, test and evaluate the model, and update the MSEs
for counter in range(50):
    
    #X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3) #Not splitting the dataset, using the same split as in Part B.
    model_C = regression_model()
    model_C.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 100, verbose = 0) #Training the model as per instructions.
    
    y_pred = model_C.predict(X_test) #Using the model to predict the test set
    MSE_C.append(mean_squared_error(y_test, y_pred)) #Updating mean squared error list with the new value
    
    counter += 1 

print(X_train.shape, y_train.shape) #Confirming the successful data splitting operation
print(X_test.shape, y_test.shape, y_pred.shape) #Confirming the successful data splitting operation
print(np.shape(MSE_C)) #Confirming the number of MSEs collected 
print(MSE_C)

print("\nThe Mean of the mean squared errors is: %.4f" % np.mean(MSE_C)) #Outputting the mean of MSEs 
print("\nThe Standard Deviation of the mean squared errors is: %.4f" % np.std(MSE_C)) #Outputting the mean of MSEs

if np.mean(MSE_C) > np.mean(MSE_B):
    print("\nIncreasing the number of epochs has increased the Mean of the mean squared errors.")
elif np.mean(MSE_C) == np.mean(MSE_B):
    print("\nIncreasing the number of epochs has had no effect on the Mean of the mean squared errors.")
else: 
    print("\nIncreasing the number of epochs has decreased the Mean of the mean squared errors.")
    
if np.std(MSE_C) > np.std(MSE_B):
    print("\nIncreasing the number of epochs has increased the Standard Deviation of the mean squared errors.")
elif np.std(MSE_C) == np.std(MSE_B):
    print("\nIncreasing the number of epochs has had no effect on the Standard Deviation of the mean squared errors.")
else: 
    print("\nIncreasing the number of epochs has decreased the Standard Deviation of the mean squared errors.")

(721, 7) (721,)
(309, 7) (309,) (309, 1)
(50,)
[182.8192455969857, 194.8156436681752, 231.68224614614078, 198.5538257174044, 180.260235022132, 193.62835910185794, 167.06412712026838, 204.95329507679142, 187.34318948262379, 190.50604716010602, 188.84359691139053, 172.2038529530189, 195.81969405640496, 187.0716633731644, 191.02627604115744, 182.0719554865546, 196.19425102317663, 181.20936485367105, 182.27726384282997, 176.20173369908926, 184.8292896133672, 211.85015149025745, 195.47604412636846, 181.6420154471188, 181.11562531215236, 197.2049432483077, 194.2575121128852, 182.19674295776576, 204.2022455956072, 202.42300067185957, 203.742503102854, 187.50685136662975, 199.69157669504662, 208.07719218613073, 188.2320800704751, 180.2423777383003, 188.59207520252346, 182.00990604315132, 185.5889193859694, 179.8300172442967, 192.62194958243475, 186.48533970835368, 203.23218530778325, 189.3155495476914, 193.73527618171647, 217.98044927029372, 189.66115172765979, 192.0217483475203, 189.378528199

Discussion: 
    For this part since the intent is to compare and test the change in epoch numbers, the dataset has not been split for Part C and the same dataset
    as in Part B has been used in order to keep the datasets consistent. Increasing the epoch number has significantly decreased the mean and STD of
    MSEs.