# Predicting Boston House Prices
### Dataset Link: https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset

### Code Tools

In [2]:
import warnings; warnings.filterwarnings('ignore')
import pandas as pd,numpy as np,sqlite3,os
import seaborn as sn,pylab as pl
import keras as ks,tensorflow as tf
from IPython import display
from sklearn.model_selection import train_test_split
from sklearn import datasets,linear_model,svm
from sklearn.metrics import mean_squared_error,median_absolute_error,\
mean_absolute_error,r2_score,explained_variance_score
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor,\
AdaBoostRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor,RadiusNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,\
QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_decomposition import PLSRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel,RationalQuadratic,RBF
from sklearn.semi_supervised import LabelPropagation,LabelSpreading
from sklearn.isotonic import IsotonicRegression
from keras.datasets import boston_housing
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential,load_model
from keras.layers import Dense,LSTM,GlobalAveragePooling1D
from keras.layers import Activation,Flatten,Dropout,BatchNormalization
from keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D
from tensorflow.keras.layers import LeakyReLU, PReLU, ELU, ThresholdedReLU
# Checkpoint file shared by every ModelCheckpoint callback and load_weights call in this notebook.
fw='weights.boston.hdf5'

In [None]:
def connect_to_db(dbf):
    """Open a SQLite connection to the database file ``dbf``.

    Parameters
    ----------
    dbf : str
        Path handed to ``sqlite3.connect``.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when sqlite3 reports an error
        (the error is printed so it stays visible in the cell output).
    """
    try:
        return sqlite3.connect(dbf)
    # sqlite3.Error is the base class of the module's exceptions; the bare
    # name `Error` was never imported, so failures surfaced as NameError.
    except sqlite3.Error as err:
        print(err)
        # When connect() raises there is no connection object to close.
        return None
def history_plot(fit_history,n):
    """Draw the training curves stored in ``fit_history.history``.

    Two stacked panels are produced: loss on top, mean absolute error below.
    Each panel shows the training series and the validation series, both
    sliced with ``[n:]`` so the first ``n`` epochs are left out.
    """
    records=fit_history.history
    # (subplot position, train key, validation key, y-axis label, panel title)
    panels=((211,'loss','val_loss','Loss','Loss Function'),
            (212,'mae','val_mae','MAE','Mean Absolute Error'))
    pl.figure(figsize=(12,10))
    for position,train_key,valid_key,y_label,heading in panels:
        pl.subplot(position)
        pl.plot(records[train_key][n:],color='slategray',label='train')
        pl.plot(records[valid_key][n:],color='#348ABD',label='valid')
        pl.xlabel('Epochs')
        pl.ylabel(y_label)
        pl.legend()
        pl.title(heading)
    pl.show()
def nnpredict(y1,y2,y3,ti,y_true=None,n_points=None):
    """Plot MLP / CNN / RNN predictions against the real target values.

    Parameters
    ----------
    y1, y2, y3 : sequence
        Predictions drawn as the 'MLP', 'CNN' and 'RNN' lines respectively.
    ti : str
        Figure title.
    y_true : sequence, optional
        Real targets matching the predictions. When omitted the notebook-level
        ``y_test`` is used, which is only correct for test-set predictions;
        pass ``y_train`` / ``y_valid`` when plotting those sets.
    n_points : int, optional
        Number of leading points to draw. Defaults to the notebook-level ``n``.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if y_true is None:
        y_true=y_test
    if n_points is None:
        n_points=n
    real=y_true[:n_points]
    pl.figure(figsize=(12,6))
    # Size the x axis from the slice itself so a target array shorter than
    # n_points cannot cause an x/y length mismatch in scatter().
    pl.scatter(range(len(real)),real,marker='*',s=100,
               color='black',label='Real data')
    pl.plot(y1[:n_points],label='MLP')
    pl.plot(y2[:n_points],label='CNN')
    pl.plot(y3[:n_points],label='RNN')
    pl.xlabel("Data Points")
    pl.ylabel("Predicted and Real Target Values")
    pl.legend(); pl.title(ti); pl.show()

### Data Loading and Exploration

In [None]:
# Open (or create) the scratch SQLite file used by the SQL exploration cells.
connection=connect_to_db('boston.db')
if connection is None:
    # Everything below writes through `connection`; stop with a clear message
    # instead of failing later inside DataFrame.to_sql.
    raise RuntimeError("Could not open 'boston.db'; see the error printed above.")
cursor=connection.cursor()
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2, so this
# cell needs an older scikit-learn - confirm the pinned version.
boston_data=datasets.load_boston()
columns=boston_data.feature_names
boston_df=pd.DataFrame(boston_data.data,columns=columns)
# Append the target as the last column; the correlation cell relies on that position.
boston_df['MEDV']=boston_data.target
# Mirror the frame into table `main`, replacing any copy left by an earlier run.
boston_df.to_sql('main',con=connection,if_exists='replace')
boston_df.head()

In [None]:
# Pairwise Pearson correlations between all columns of the frame.
pearson=boston_df.corr(method='pearson')
# Last row holds the correlations of the final column (MEDV) with every
# column; drop its trailing self-correlation entry.
corr_with_prices=pearson.iloc[-1].iloc[:-1]
# Positions ordered from strongest to weakest absolute correlation.
strongest_first=corr_with_prices.abs().argsort().to_numpy()[::-1]
# Use .iloc explicitly: indexing a label-indexed Series with integer keys
# relied on a positional fallback that newer pandas versions deprecate.
pd.DataFrame(corr_with_prices.iloc[strongest_first])

In [None]:
# Per-ZN averages of selected columns read back from table `main`;
# the first seven groups are displayed, indexed by ZN.
pd.read_sql_query(
    sql='''
SELECT ZN,
       AVG(LSTAT),
       AVG(RM),
       AVG(PTRATIO),
       AVG(INDUS),
       AVG(TAX)
FROM main
GROUP BY ZN;
''',
    con=connection,
).set_index('ZN').head(7)

In [None]:
# Release the SQLite handle before deleting the scratch database file.
if connection is not None:
    connection.close()
if not os.path.exists('boston.db'):
    print('The file does not exist')
else:
    os.remove('boston.db')
# Show the working directory contents as the cell output.
os.listdir()

In [None]:
# Number of leading rows of the Keras test split that become the validation set.
n=51
(x_train,y_train),(x_test,y_test)=boston_housing.load_data()
# Both right-hand slices are taken from the freshly loaded arrays before rebinding.
x_valid,x_test=x_test[:n],x_test[n:]
y_valid,y_test=y_test[:n],y_test[n:]
# Label / shape pairs shown as a small summary table.
t=[[label,array.shape] for label,array in (
    ("Training feature's shape:",x_train),
    ("Training target's shape",y_train),
    ("Validating feature's shape:",x_valid),
    ("Validating target's shape",y_valid),
    ("Testing feature's shape:",x_test),
    ("Testing target's shape",y_test))]
pd.DataFrame(t)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; pick whichever this install provides,
# falling back to the original name so older installs behave as before.
style_name=next((candidate for candidate in
                 ('seaborn-v0_8-whitegrid','seaborn-whitegrid')
                 if candidate in pl.style.available),'seaborn-whitegrid')
pl.style.use(style_name)
pl.figure(1,figsize=(10,4))
# Left panel: raw training targets.
pl.subplot(121)
# NOTE(review): seaborn marks distplot as deprecated (histplot/displot are the
# replacements); kept here so the figure stays the same.
sn.distplot(y_train,color='#348ABD',bins=30)
pl.ylabel("Distribution"); pl.xlabel("Prices")
# Right panel: natural log of the training targets.
pl.subplot(122)
sn.distplot(np.log(y_train),color='#348ABD',bins=30)
pl.ylabel("Distribution"); pl.xlabel("Logarithmic Prices")
pl.suptitle('Boston Housing Data',fontsize=15)
pl.show()

### Building Neural Networks with Keras

In [None]:
def build_mlp_model():
    """Create and compile the fully connected regressor.

    Architecture: Dense(832) -> LeakyReLU(0.025) -> Dense(104) ->
    LeakyReLU(0.025) -> Dense(1), taking 13 input features.
    Compiled with MSE loss, the RMSprop optimizer and MAE as a metric.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model=Sequential()
    model.add(Dense(832,input_dim=13))
    model.add(LeakyReLU(alpha=.025))
    model.add(Dense(104))
    model.add(LeakyReLU(alpha=.025))
    model.add(Dense(1,kernel_initializer='normal'))
    model.compile(loss='mse',optimizer='rmsprop',metrics=['mae'])
    return model
# The builder has its own name so that assigning the model to `mlp_model`
# no longer overwrites the function that creates it.
mlp_model=build_mlp_model()
# Write to `fw` only when the monitored quantity improves (save_best_only=True).
checkpointer=ModelCheckpoint(filepath=fw,verbose=2,save_best_only=True)
# Multiply the learning rate by 0.75 after 10 epochs without val_loss improvement.
lr_reduction=ReduceLROnPlateau(monitor='val_loss',patience=10,
                               verbose=0,factor=.75)
# Stop training after 20 epochs without val_loss improvement.
estopping=EarlyStopping(monitor='val_loss',patience=20,verbose=2)
history=mlp_model.fit(x_train,y_train,batch_size=24,
                      validation_data=(x_valid,y_valid),
                      epochs=1000,verbose=2,
                      callbacks=[checkpointer,lr_reduction,estopping])

In [None]:
# Training curves with the first two epochs left out.
history_plot(history,2)
# Load the weights stored in the checkpoint file before scoring.
mlp_model.load_weights(fw)
y_train_mlp,y_valid_mlp,y_test_mlp=[
    mlp_model.predict(features) for features in (x_train,x_valid,x_test)]
# R2 of the predictions on each split.
score_train_mlp,score_valid_mlp,score_test_mlp=[
    r2_score(actual,predicted) for actual,predicted in (
        (y_train,y_train_mlp),(y_valid,y_valid_mlp),(y_test,y_test_mlp))]
pd.DataFrame([[label,score] for label,score in zip(
    ('Train R2 score:','Valid R2 score:','Test R2 score:'),
    (score_train_mlp,score_valid_mlp,score_test_mlp))])

In [None]:
def build_cnn_model():
    """Create and compile the 1-D convolutional regressor.

    Input shape is (13, 1). Layers: Conv1D(13 filters, kernel 5) ->
    LeakyReLU(0.025) -> MaxPooling1D(2) -> Conv1D(128 filters, kernel 3) ->
    LeakyReLU(0.025) -> MaxPooling1D(2) -> Flatten -> Dense(26, relu) ->
    Dropout(0.1) -> Dense(1).
    Compiled with MSE loss, the Nadam optimizer and MAE as a metric.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model=Sequential()
    model.add(Conv1D(13,5,padding='valid',
                     input_shape=(13,1)))
    model.add(LeakyReLU(alpha=.025))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(128,3,padding='valid'))
    model.add(LeakyReLU(alpha=.025))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(26,activation='relu',
                    kernel_initializer='normal'))
    model.add(Dropout(.1))
    model.add(Dense(1,kernel_initializer='normal'))
    model.compile(loss='mse',optimizer='nadam',metrics=['mae'])
    return model
# The builder has its own name so that assigning the model to `cnn_model`
# no longer overwrites the function that creates it.
cnn_model=build_cnn_model()
# Write to `fw` only when the monitored quantity improves (save_best_only=True).
checkpointer=ModelCheckpoint(filepath=fw,verbose=2,save_best_only=True)
# Multiply the learning rate by 0.75 after 10 epochs without val_loss improvement.
lr_reduction=ReduceLROnPlateau(monitor='val_loss',patience=10,
                               verbose=0,factor=.75)
# Stop training after 20 epochs without val_loss improvement.
estopping=EarlyStopping(monitor='val_loss',patience=20,verbose=2)
# Features are reshaped to (samples, 13, 1) to match the Conv1D input shape.
history=cnn_model.fit(x_train.reshape(-1,13,1),y_train,
                      validation_data=(x_valid.reshape(-1,13,1),y_valid),
                      epochs=1000,batch_size=14,verbose=2,
                      callbacks=[checkpointer,lr_reduction,estopping])

In [None]:
# Training curves with the first two epochs left out.
history_plot(history,2)
# Load the weights stored in the checkpoint file before scoring.
cnn_model.load_weights(fw)
# Each feature matrix is reshaped to (samples, 13, 1) before prediction.
y_train_cnn,y_valid_cnn,y_test_cnn=[
    cnn_model.predict(features.reshape(-1,13,1))
    for features in (x_train,x_valid,x_test)]
# R2 of the predictions on each split.
score_train_cnn,score_valid_cnn,score_test_cnn=[
    r2_score(actual,predicted) for actual,predicted in (
        (y_train,y_train_cnn),(y_valid,y_valid_cnn),(y_test,y_test_cnn))]
pd.DataFrame([[label,score] for label,score in zip(
    ('Train R2 score:','Valid R2 score:','Test R2 score:'),
    (score_train_cnn,score_valid_cnn,score_test_cnn))])

In [None]:
def build_rnn_model():
    """Create and compile the stacked-LSTM regressor.

    Input shape is (1, 13). Layers: three LSTM(104) layers (the first two
    return full sequences, the last returns only its final output) followed
    by Dense(1).
    Compiled with MSE loss, the RMSprop optimizer and MAE as a metric.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model=Sequential()
    model.add(LSTM(104,return_sequences=True,
                   input_shape=(1,13)))
    model.add(LSTM(104,return_sequences=True))
    model.add(LSTM(104,return_sequences=False))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop',loss='mse',metrics=['mae'])
    return model
# The builder has its own name so that assigning the model to `rnn_model`
# no longer overwrites the function that creates it.
rnn_model=build_rnn_model()
# Write to `fw` only when the monitored quantity improves (save_best_only=True).
checkpointer=ModelCheckpoint(filepath=fw,verbose=2,save_best_only=True)
# Multiply the learning rate by 0.75 after 10 epochs without val_loss improvement.
lr_reduction=ReduceLROnPlateau(monitor='val_loss',patience=10,
                               verbose=0,factor=.75)
# Stop training after 20 epochs without val_loss improvement.
estopping=EarlyStopping(monitor='val_loss',patience=20,verbose=2)
# Features are reshaped to (samples, 1, 13) to match the LSTM input shape.
history=rnn_model.fit(x_train.reshape(-1,1,13),y_train,
                      validation_data=(x_valid.reshape(-1,1,13),y_valid),
                      epochs=1000,batch_size=16,verbose=2,
                      callbacks=[checkpointer,lr_reduction,estopping])

In [None]:
# Training curves with the first two epochs left out.
history_plot(history,2)
# Load the weights stored in the checkpoint file before scoring.
rnn_model.load_weights(fw)
# Each feature matrix is reshaped to (samples, 1, 13) before prediction.
y_train_rnn,y_valid_rnn,y_test_rnn=[
    rnn_model.predict(features.reshape(-1,1,13))
    for features in (x_train,x_valid,x_test)]
# R2 of the predictions on each split.
score_train_rnn,score_valid_rnn,score_test_rnn=[
    r2_score(actual,predicted) for actual,predicted in (
        (y_train,y_train_rnn),(y_valid,y_valid_rnn),(y_test,y_test_rnn))]
pd.DataFrame([[label,score] for label,score in zip(
    ('Train R2 score:','Valid R2 score:','Test R2 score:'),
    (score_train_rnn,score_valid_rnn,score_test_rnn))])

In [None]:
# One comparison figure per split: title plus the (MLP, CNN, RNN) predictions.
# NOTE(review): called with four arguments, nnpredict draws the notebook-level
# y_test[:n] as "Real data" on every figure, including the train and
# validation ones - confirm that this is intended.
for ti,(y1,y2,y3) in (
        ("Train Set; Neural Network Predictions vs Real Data",
         (y_train_mlp,y_train_cnn,y_train_rnn)),
        ("Validation Set; Neural Network Predictions vs Real Data",
         (y_valid_mlp,y_valid_cnn,y_valid_rnn)),
        ("Test Set; Neural Network Predictions vs Real Data",
         (y_test_mlp,y_test_cnn,y_test_rnn))):
    nnpredict(y1,y2,y3,ti)