## Import required packages.

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier, MLPRegressor 
from sklearn.preprocessing import StandardScaler

#from mord import LogisticIT

from dmba import classificationSummary, regressionSummary

%matplotlib inline
import matplotlib.pylab as plt

### 1A. Create a boston_df data frame by uploading the original data set into Python. Determine and present in this report the data frame dimensions, i.e., number of rows and columns.

In [2]:
boston_df = pd.read_csv("BostonHousing.csv")
print("Number of Rows ",boston_df.shape[0])
print("Number of Columns: ",boston_df.shape[1])

Number of Rows  506
Number of Columns:  14


### 1B. Display in Python the column titles. If some of them contain two (or more) words, convert them into one-word titles, and present the modified titles in your report.

In [3]:
# We will strip trailing spaces and replace the remaining spaces with an underscore _. Instead of using 
# the `rename` method, we  create a modified copy of `columns` and assign to the `columns` field of the dataframe.

print('Modified column titles with no space and one word for titles:',"\n")
boston_df.columns = [s.strip().replace(' ', '_') for s in boston_df.columns]
print(boston_df.columns)

Modified column titles with no space and one word for titles: 

Index(['CRIME', 'ZONE', 'INDUST', 'CHAR_RIV', 'NIT_OXIDE', 'ROOMS', 'AGE',
       'DISTANCE', 'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'MVALUE',
       'C_MVALUE'],
      dtype='object')


### 1C. Display in Python column data types. If some of them are listed as “object’, convert them into dummy variables, and provide in your report the modified list of column titles with dummy variables.

In [4]:
print(boston_df.dtypes)

CRIME        float64
ZONE         float64
INDUST       float64
CHAR_RIV      object
NIT_OXIDE    float64
ROOMS        float64
AGE          float64
DISTANCE     float64
RADIAL         int64
TAX            int64
ST_RATIO     float64
LOW_STAT     float64
MVALUE       float64
C_MVALUE      object
dtype: object


In [5]:
# WE Can see only two columns have object data type:
print("Column with Object data type in the dataset are: ")
for i in boston_df.columns:
    if boston_df[i].dtype == "O":
        print(i)
            

# Lets change these variable type to 'category'.
boston_df.CHAR_RIV = boston_df.CHAR_RIV.astype('category')
boston_df.C_MVALUE = boston_df.C_MVALUE.astype('category')


# Display category levels (attributes) and category type FOR CHAR_RIV
print("")
print('Category levels and changed variable type of CHAR_RIV:')
print(boston_df.CHAR_RIV.cat.categories) 
print(boston_df.CHAR_RIV.dtype)


# Display category levels (attributes) and category type FOR C_MVALUE
print("")
print('Category levels and changed variable type of C_MVALUE:')
print(boston_df.C_MVALUE.cat.categories) 
print(boston_df.C_MVALUE.dtype)


Column with Object data type in the dataset are: 
CHAR_RIV
C_MVALUE

Category levels and changed variable type of CHAR_RIV:
Index(['N', 'Y'], dtype='object')
category

Category levels and changed variable type of C_MVALUE:
Index(['No', 'Yes'], dtype='object')
category


In [6]:
# Lets convert these categorical column to Dummy vraibales
boston_df = pd.get_dummies(boston_df, prefix_sep='_', drop_first=True)
boston_df.columns

Index(['CRIME', 'ZONE', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'AGE', 'DISTANCE',
       'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'MVALUE', 'CHAR_RIV_Y',
       'C_MVALUE_Yes'],
      dtype='object')

### 2A. Develop in Python the outcome and predictor variables, partition the data set (60% for training and 40% for validation partitions), display in Python and present in your report the first five records of the training partition. Then, using the StandardScaler() function, develop the scaled predictors for training and validation partitions. Display in Python and provide in your report the first five records of the scaled training partition. Present a brief explanation of what the scaled values mean and how they are calculated.

In [7]:
#Partitioning Data

# Identify predictors(col names) and outcome(target Name) of the regression model]\
# These are just names ( not data)
predictors = ['CRIME', 'ZONE', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'AGE', 'DISTANCE',
       'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'CHAR_RIV_Y',
       'C_MVALUE_Yes']

outcome = 'MVALUE'


# Identify X and y variables for regression and partition data
# using 60% of records for training and 40% for validation (test_size=0.4). 
X = boston_df[predictors]
Y = boston_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, Y, test_size=0.4, random_state=1)

print("First 5 records of Training Dataset are: ")
print(train_X.head(5),"\n"*3)

#Standardize All predictor value in train_x and valid_x with standard scaler
sc_X = StandardScaler()
sc_X.fit(train_X)
train_X_sc = pd.DataFrame(sc_X.transform(train_X), columns= train_X.columns)
valid_X_sc = pd.DataFrame(sc_X.transform(valid_X), columns= valid_X.columns)

print("First 5 records of Standardize Training Dataset are: ")
print(train_X_sc.head(5))

First 5 records of Training Dataset are: 
       CRIME  ZONE  INDUST  NIT_OXIDE  ROOMS   AGE  DISTANCE  RADIAL  TAX  \
452  5.09017   0.0   18.10      0.713  6.297  91.8    2.3682      24  666   
346  0.06162   0.0    4.39      0.442  5.898  52.3    8.0136       3  352   
295  0.12932   0.0   13.92      0.437  6.678  31.1    5.9604       4  289   
88   0.05660   0.0    3.41      0.489  7.007  86.3    3.4217       2  270   
322  0.35114   0.0    7.38      0.493  6.041  49.9    4.7211       5  287   

     ST_RATIO  LOW_STAT  CHAR_RIV_Y  C_MVALUE_Yes  
452      20.2     17.27           0             0  
346      18.8     12.67           0             0  
295      16.0      6.27           0             0  
88       17.8      5.50           0             0  
322      19.6      7.70           0             0   



First 5 records of Standardize Training Dataset are: 
      CRIME      ZONE    INDUST  NIT_OXIDE     ROOMS       AGE  DISTANCE  \
0  0.145983 -0.481603  1.005718   1.306155  0.083

### 2B. Train a neural network model using MLPRegressor() with the scaled training data set and the following parameters: hidden_layer_sizes=9, solver=’lbfgs’, max_iter=10000, and random_state=1. Identify and display in Python the final intercepts and network weights of this model. Provide these intercepts and weights in your report and briefly explain what the values of intercepts in the first and second arrays mean. Also, briefly explain what the values of weights in the first and second arrays mean.

In [8]:
boston_reg = MLPRegressor(hidden_layer_sizes=(9), 
                solver='lbfgs', max_iter=10000, random_state=1)

boston_reg.fit(train_X_sc, train_y)

# Display network structure with the final values of intercepts (Theta) and weights (W).
print('Final Intercepts for Boston Housing Neural Network Model')
print(boston_reg.intercepts_)

print()
print('Network Weights for Boston Housing Neural Network Model')
print(boston_reg.coefs_)

Final Intercepts for Boston Housing Neural Network Model
[array([ 2.26914474,  4.18675342, -1.50466092, -0.95080817,  0.32205673,
        0.49141424, -0.81237211,  0.89649621,  2.52709405]), array([-11.97658924])]

Network Weights for Boston Housing Neural Network Model
[array([[-0.36367279,  0.52687745, -0.00964011, -1.5129948 ,  1.31769872,
        -0.41850542, -1.53776369, -0.62480937, -1.90841567],
       [ 0.66134206, -1.23924591,  0.28794966,  2.11953871,  1.07434276,
         0.43220725, -4.11086338,  0.40334917, -0.08228966],
       [ 0.59676784,  0.58331396,  2.48943055,  0.90207757, -1.28237762,
         0.42433014, -1.95774694,  0.95234875,  1.23319412],
       [-2.20869382, -1.37160062, -0.98568213,  0.36329227, -0.6196563 ,
        -0.30782417,  1.05946611,  0.83457418,  2.71468783],
       [ 0.16998566, -0.09067205, -1.7444194 ,  0.11677319,  1.80120993,
         0.81352311, -0.01783085, -2.03768459,  0.23723986],
       [-0.27165058, -2.52251129, -1.29506367,  0.3880619 

### 2C.  Using the developed neural network model, make in Python predictions for the outcome variable (MVALUE) using the scaled validation predictors. Based on these predictions, develop and display in Python a table for the first five validation records that contain actual and predicted median prices (MVALUE), and their residuals. Present this table in your report.

In [9]:
# Make 'MVALUE' predictions for validation set using our Model
price_pred = np.round(boston_reg.predict(valid_X_sc), decimals=2)

price_pred_result = pd.DataFrame({'Actual': valid_y, 
                'Prediction': price_pred, 'Residual': valid_y-price_pred})

print('Predictions for MVALUE for Validation Partition')
print(price_pred_result.head(5))

Predictions for MVALUE for Validation Partition
     Actual  Prediction  Residual
307    28.2       29.63     -1.43
343    23.9       23.49      0.41
47     16.6       17.80     -1.20
67     22.0       18.73      3.27
362    20.8       25.30     -4.50


### 2D. Identify and display in Python the common accuracy measures for training and validation partitions. Provide and compare these accuracy measures in your report and assess a possibility of overfitting. Would you recommend applying this neural network model for predictions? Briefly explain.

In [10]:
# Neural network model accuracy measures for training and validation partitions. 
print('Accuracy Measures for Training Partition for Neural Network')
regressionSummary(train_y, boston_reg.predict(train_X_sc))

# Identify and display neural network accuracy measures for validation partition.
print()
print('Accuracy Measures for Validation Partition for Neural Network')
regressionSummary(valid_y, boston_reg.predict(valid_X_sc))

Accuracy Measures for Training Partition for Neural Network

Regression statistics

                      Mean Error (ME) : -0.0034
       Root Mean Squared Error (RMSE) : 1.5617
            Mean Absolute Error (MAE) : 1.1368
          Mean Percentage Error (MPE) : -0.8274
Mean Absolute Percentage Error (MAPE) : 6.0681

Accuracy Measures for Validation Partition for Neural Network

Regression statistics

                      Mean Error (ME) : -0.0912
       Root Mean Squared Error (RMSE) : 3.1675
            Mean Absolute Error (MAE) : 2.2668
          Mean Percentage Error (MPE) : -3.0502
Mean Absolute Percentage Error (MAPE) : 11.6748


### 3A. Use in Python GridSearchCV() function to identify the best number of nodes for the hidden layer in the Boston Housing neural network model. For that, consider the hidden_layer_sizes parameter in a range from 2 to 20. Provide in your report the best score and best parameter value.

In [11]:
# Identify grid search parameters. 
param_grid = {
    'hidden_layer_sizes': list(range(2, 21)), 
}

# Utilize GridSearchCV() to identify the best number of nodes in the hidden layer. 
gridSearch = GridSearchCV(MLPRegressor(solver='lbfgs', max_iter=10000, random_state=1), 
                          param_grid, cv=5, n_jobs=-1, return_train_score=True)
gridSearch.fit(train_X_sc, train_y)

# Display the best score and best parament value.
print(f'Best score:{gridSearch.best_score_:.4f}')
print('Best parameter: ', gridSearch.best_params_)

Best score:0.8759
Best parameter:  {'hidden_layer_sizes': 2}


### 3B. Train an improved neural network model using MLPRegressor() with the scaled training data set and the best identified value of the parameter from the previous question. The rest of the parameters remain the same as in model developed in 2b. Present in your report the final intercepts and network weights of the improved neural network model.

In [12]:
boston_reg_imp = MLPRegressor(hidden_layer_sizes=2, 
                solver='lbfgs', max_iter=10000, random_state=1)
boston_reg_imp.fit(train_X_sc, train_y)


print('Final Intercepts for improved Neural Network Model')
print(boston_reg_imp.intercepts_)

print()
print('Network Weights for improved Neural Network Model')
print(boston_reg_imp.coefs_)

Final Intercepts for improved Neural Network Model
[array([-5.59839276,  8.69124415]), array([6.60346469])]

Network Weights for improved Neural Network Model
[array([[-0.20769012, -1.78999663],
       [-0.31789029,  0.2530354 ],
       [ 3.68804841, -0.23745973],
       [-0.39979107, -0.36498108],
       [-1.54682674,  2.3322485 ],
       [ 0.12819544, -0.90563125],
       [ 0.31696069, -1.11355164],
       [ 3.16426975,  0.13759987],
       [ 1.58396291, -1.3793737 ],
       [-2.27344137, -0.55228335],
       [-1.32734466, -0.43485492],
       [-0.01351816,  0.17987294],
       [ 3.13443234,  1.31033442]]), array([[2.40608879],
       [1.58695947]])]


### 3C. Identify and display in Python the common accuracy measures for the training and validation partitions with the improved neural network model. Provide and compare these accuracy measures in your report and assess a possibility of overfitting. Would you recommend applying this neural network model for predictions? Briefly explain.

In [13]:
# Neural network model accuracy measures for training and validation partitions with improved Model 
print('Accuracy Measures for Training Partition for Neural Network')
regressionSummary(train_y, boston_reg_imp.predict(train_X_sc))

# Identify and display neural network accuracy measures for validation partition.
print()
print('Accuracy Measures for Validation Partition for Neural Network')
regressionSummary(valid_y, boston_reg_imp.predict(valid_X_sc))

Accuracy Measures for Training Partition for Neural Network

Regression statistics

                      Mean Error (ME) : 0.0013
       Root Mean Squared Error (RMSE) : 2.6108
            Mean Absolute Error (MAE) : 2.0053
          Mean Percentage Error (MPE) : -1.8353
Mean Absolute Percentage Error (MAPE) : 10.4838

Accuracy Measures for Validation Partition for Neural Network

Regression statistics

                      Mean Error (ME) : 0.0295
       Root Mean Squared Error (RMSE) : 3.0570
            Mean Absolute Error (MAE) : 2.2651
          Mean Percentage Error (MPE) : -2.3393
Mean Absolute Percentage Error (MAPE) : 11.4870
