In [1]:
import pandas as pd

In [2]:
# Load the Boston housing data from the CSV file that sits next to the notebook
df = pd.read_csv(filepath_or_buffer="1_boston_housing.csv")

In [3]:
# Display the loaded dataset as the cell output
# (for a long frame pandas renders only the first and last rows, with "..." in between)
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [4]:
# Column Names and Description :

# crim : per capita crime rate by town
# zn : proportion of residential land zoned for lots over 25,000 sq. ft.
# indus : proportion of non-retail business acres per town
# chas : Charles River dummy variable (1 if tract bounds river, 0 otherwise)
# nox : nitric oxides concentration (parts per 10 million)
# rm : average number of rooms per dwelling
# age : proportion of owner-occupied units built before 1940
# dis : weighted distances to 5 Boston employment centres
# rad : index of accessibility to radial highways
# tax : full-value property tax rate per $10,000
# ptratio : pupil-teacher ratio by town
# b : 1000 * (Bk - 0.63)^2, where Bk is the proportion of Black residents by town
# lstat : % lower status of the population
# medv : median value of owner-occupied homes in $1000s (target; the column is named MEDV in the dataframe)

In [5]:
# Count the missing values in every column
# (a column of zeros means there is nothing to impute or drop)

df.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
MEDV       0
dtype: int64

In [6]:
# Inspect the correlation between every column and the output variable MEDV
# A positive correlation means the two variables tend to increase together
# A negative correlation means one variable tends to decrease as the other increases

corr_matrix = df.corr()
corr_matrix['MEDV'].sort_values()

lstat     -0.737663
ptratio   -0.507787
indus     -0.483725
tax       -0.468536
nox       -0.427321
crim      -0.388305
rad       -0.381626
age       -0.376955
chas       0.175260
dis        0.249929
b          0.333461
zn         0.360445
rm         0.695360
MEDV       1.000000
Name: MEDV, dtype: float64

In [7]:
# lstat, rm and ptratio are the features with the strongest (absolute) correlation with the output variable MEDV
# However, the results obtained with only these 3 features are not drastically different from the results obtained with all features

In [8]:
# X = df.loc[:,['lstat', 'ptratio', 'rm']] in case you want to consider only the high impact features

In [9]:
from sklearn.model_selection import train_test_split

# Boolean mask that is True only for the target column
is_target = df.columns == 'MEDV'

# Features: every column except MEDV; target: MEDV kept as a one-column DataFrame
X = df.loc[:, ~is_target]
Y = df.loc[:, is_target]

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=123,
)

In [10]:
# MinMax Scaling is a technique used for Normalization,
# So that all the features fall in a specified range, typically from 0 to 1
# The formula is as follows :
# X = (X-Xmin)/(Xmax-Xmin)

# StandardScaling (standardization) is another feature-scaling technique,
# It scales the data such that the mean of each feature is 0, and the variance is 1
# Regression models trained by gradient descent (like the neural network used below) generally work well on standardized features,
# It is less sensitive to outliers than MinMax scaling (which depends only on the extreme values), and it preserves the shape of each feature's distribution,
# So StandardScaling might be the better scaling technique in our current use case for the given dataset and regression task

# The formula is as follows :
# Xscaled = (X-u)/σ
# Where u is the mean and σ is the standard deviation

# Hence, here we use StandardScaling

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split and scale it in one step
X_train = scaler.fit_transform(X_train)

# Re-use the statistics learned from the training split to scale the test split
X_test = scaler.transform(X_test)

# Why is the scaler fitted on X_train only and never on X_test?
# The scaling parameters are part of what the pipeline "learns", so they must come from training data alone
# The same learned parameters are then applied to both the training and the testing data
# Fitting on X_test would leak information about the test data and defeat its purpose as unseen data

In [12]:
from tensorflow.keras.models import Sequential
# A sequential model, where the output of one layer is the input for the next layer

from tensorflow.keras.layers import Dense
# Dense represents a fully connected layer,
# Where each neuron in the layer receives input from every neuron in the previous layer

from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators before any weights are created,
# So that weight initialisation and batch shuffling are repeatable when the notebook is re-run from the top
# (bit-exact repeatability on GPU may additionally depend on the TensorFlow determinism settings)
set_random_seed(123)

# Take the number of input features from the training data instead of hard-coding 13,
# So the model still builds when X is restricted to a subset of columns (e.g. only lstat, ptratio and rm)
n_features = X_train.shape[1]

model = Sequential()

model.add(Dense(128, input_shape=(n_features,), activation='relu', name='layer1'))
# Dense is the type of layer,
# Parameters are number of neurons, shape of the input features, activation function and name of the layer
# relu activation function -> rectified linear unit
# f(x) = max(0,x)
# Introduces non-linearity in the network, so it can learn complex relationships within the data

model.add(Dense(64, activation='relu', name='layer2'))
model.add(Dense(1, activation='linear', name='output_layer'))

# We have only 1 neuron in the output layer, with a linear activation function, for our regression task
# linear activation function -> f(x) = x
# It leaves the output unbounded and continuous, which is what a regression target needs

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# adam optimizer uses adaptive learning rates, i.e. the effective step size changes during training
# It also utilizes the 'momentum' concept to adjust the gradient descent
# The optimizer adjusts the weights such that the loss 'mse' is minimized during training
# The 'mae' metric will be reported at each training epoch

model.summary()





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer1 (Dense)              (None, 128)               1792      
                                                                 
 layer2 (Dense)              (None, 64)                8256      
                                                                 
 output_layer (Dense)        (None, 1)                 65        
                                                                 
Total params: 10113 (39.50 KB)
Trainable params: 10113 (39.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
# The above provides the model summary,
# Parameters indicate the total weights and biases in each layer
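# In layer 1,
# We have 13 (inputs) * 128 (neurons) weights + 128 biases (one per neuron) = 1792 parameters
# For layer 2,
# We have 128 (inputs) * 64 (neurons) weights + 64 biases (one per neuron) = 8256 parameters
# For the output layer,
# We have 64 (inputs) * 1 (neuron) weights + 1 bias = 65 parameters
# Recall that each of these is a fully connected dense layer.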

# Trainable parameters are the parameters that will be updated by backpropagation during training
# Non-trainable parameters are the ones that will not be updated.

# Now our model is defined and compiled,we have also given a summary of the model's architecture.
# It is ready to be trained on data

In [14]:
# Train the network; the returned History object records the loss and metric values of every epoch
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=100,
    validation_split=0.05,
    verbose=1,
)

# epochs is the number of complete passes (forward and backward) over the training data
# validation_split is the fraction of the training data held out for validation; it is taken from the end of the data
# For example, with 1000 samples and validation_split=0.05,
# the last 50 samples are used for validation and the remaining 950 for training
# verbose controls how much progress output is shown (1 shows a progress bar for each epoch)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 

In [15]:
# Run the trained model on the scaled test features to get the predicted MEDV values
Y_pred = model.predict(X_test)



In [16]:
# Show every prediction next to its true value, one test sample per line
actuals = Y_test.values
for pred, actual in zip(Y_pred, actuals):
    print(f"Predicted : {pred}, Actual : {actual}")

Predicted : [19.009132], Actual : [15.]
Predicted : [27.436731], Actual : [26.6]
Predicted : [43.518715], Actual : [45.4]
Predicted : [19.87408], Actual : [20.8]
Predicted : [31.250431], Actual : [34.9]
Predicted : [50.07557], Actual : [21.9]
Predicted : [26.019737], Actual : [28.7]
Predicted : [8.766456], Actual : [7.2]
Predicted : [19.561232], Actual : [20.]
Predicted : [33.071377], Actual : [32.2]
Predicted : [22.124668], Actual : [24.1]
Predicted : [20.0587], Actual : [18.5]
Predicted : [14.527017], Actual : [13.5]
Predicted : [29.707294], Actual : [27.]
Predicted : [18.018066], Actual : [23.1]
Predicted : [18.999386], Actual : [18.9]
Predicted : [19.395557], Actual : [24.5]
Predicted : [38.669365], Actual : [43.1]
Predicted : [17.025154], Actual : [19.8]
Predicted : [17.040472], Actual : [13.8]
Predicted : [12.748427], Actual : [15.6]
Predicted : [30.437801], Actual : [50.]
Predicted : [35.00664], Actual : [37.2]
Predicted : [39.58142], Actual : [46.]
Predicted : [42.844913], Actu

In [17]:
# evaluate() returns the loss followed by the compiled metrics,
# which for this model is [mse (the loss), mae (the only metric)]
test_scores = model.evaluate(X_test, Y_test)
mse, mae = test_scores

print("Mean Absolute Error in Prediction : ", mae)
print("Mean Squared Error in Prediction : ", mse)

Mean Absolute Error in Prediction :  2.5407729148864746
Mean Squared Error in Prediction :  17.589706420898438
