In [None]:
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from IPython.display import HTML

Step 1: Data Preprocessing

In [None]:
# Load the Boston housing dataset through scikit-learn and show its description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it.
boston = load_boston()
print(boston.DESCR)

In [None]:
# Put the feature matrix into a pandas DataFrame, one named column per feature.
features = pd.DataFrame(boston.data, columns=boston.feature_names)
# Preview the first rows with the rich notebook display instead of printing
# the whole frame as plain text.
features.head()

In [None]:
# Basic column selection: pull out the AGE feature as a Series.
features.loc[:, 'AGE']

In [None]:
# Put the target values into their own single-column DataFrame.
target = pd.DataFrame(boston.target, columns=['target'])
# Preview the first rows with the rich notebook display instead of printing
# the whole frame as plain text.
target.head()

In [None]:
# Let's find the minimum and maximum house price in the Boston data.
# Series.max() reduces inside pandas instead of iterating element by element
# with the Python builtin max().
target['target'].max()

In [None]:
# Series.min() reduces inside pandas instead of iterating element by element
# with the Python builtin min().
target['target'].min()

In [None]:
# Concatenate the features and the target into a single DataFrame;
# axis='columns' (the same as axis=1) places them side by side.
df = pd.concat((features, target), axis='columns')
df

In [None]:
# Summary statistics of the Boston housing data, rounded to two decimals.
df.describe().round(2)

In [None]:
# Pearson correlation between every pair of columns in the data.
corr = df.corr(method='pearson')

In [None]:
# Absolute correlation of each feature with the target, in feature-column order.
corrs = [abs(corr.loc['target', name]) for name in features.columns]

In [None]:
# Pair each correlation with its feature name: [(correlation, feature), ...]
l = [(strength, name) for strength, name in zip(corrs, features.columns)]

In [None]:
# Order the pairs from strongest to weakest correlation,
# using the correlation value (first element of each pair) as the sort key.
l = sorted(l, key=lambda pair: pair[0], reverse=True)

In [None]:
# Split the sorted pairs back into parallel tuples of values and labels.
# The star must sit directly in the call: the previous parenthesised form
# `zip((*l))` is rejected as a SyntaxError by current Python parsers.
corrs, labels = zip(*l)

In [None]:
# Plot each attribute's absolute correlation with the target as a bar graph.
index = np.arange(len(labels))
plt.figure(figsize=(15, 5))
plt.bar(index, corrs, width=0.5)
plt.xticks(index, labels)
plt.xlabel('Attributes')
plt.ylabel('correlation with the target variable')
plt.show()

In [None]:
# From the bar graph above, LSTAT and RM have the two highest absolute
# correlation values; LSTAT is taken as the single input feature.
# These are the raw arrays, before normalization.
X = df['LSTAT'].to_numpy()
Y = df['target'].to_numpy()

In [None]:
# First five target values before normalization.
print(Y[0:5])

In [None]:
# Perform min-max normalization of the LSTAT feature and the target.
# The scalers are fitted on the raw columns of df rather than on X/Y themselves,
# so re-running this cell cannot re-fit them on already-normalized values
# (which would break the later inverse_transform back to real units).
# NOTE(review): both scalers see the whole dataset, including rows that later
# end up in the test split.
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(df['LSTAT'].values.reshape(-1, 1))
y_scaler = MinMaxScaler()
Y = y_scaler.fit_transform(df['target'].values.reshape(-1, 1))

In [None]:
# First five target values after normalization.
print(Y[0:5])

Step 2: Define the error
Mean squared error

In [None]:
def error(m,x,c,t):
    """Return the halved mean squared error of the line ``m*x + c`` against ``t``.

    Computes ``sum(((m*x + c) - t)**2) / (2*N)`` with ``N = x.size``.

    Parameters
    ----------
    m, c : slope and intercept of the line.
    x, t : NumPy arrays of inputs and targets that broadcast against each other.

    Returns
    -------
    The error reduced over the first axis: a scalar for 1-D inputs, a
    length-1 array for ``(N, 1)`` column vectors.
    """
    N = x.size
    residuals = (m * x + c) - t
    # np.sum over axis 0 reduces inside NumPy instead of adding rows one by one
    # with the Python builtin sum(), and yields the same result shape.
    return np.sum(residuals ** 2, axis=0) / (2 * N)

Step 3: Split the Boston housing dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every number and plot below, reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
#define the update function
def update(m,x,c,t,learning_rate):
    """Take one gradient-descent step on the slope ``m`` and intercept ``c``.

    The gradients are those of the plain sum of squared residuals,
    ``sum(((m*x + c) - t)**2)``. That is ``2*N`` times the gradient of
    ``error`` (which divides by ``2*N``), so the effective step size grows
    with the number of samples; ``learning_rate`` has to absorb that factor.

    Parameters
    ----------
    m, c : current slope and intercept.
    x, t : NumPy arrays of inputs and targets that broadcast against each other.
    learning_rate : step size multiplying each gradient.

    Returns
    -------
    (m, c) : the updated slope and intercept, reduced over the first axis of
    the inputs (scalars for 1-D inputs, length-1 arrays for ``(N, 1)`` inputs).
    """
    residuals = (m * x + c) - t
    # np.sum over axis 0 reduces inside NumPy instead of adding rows one by one
    # with the Python builtin sum(), and yields the same result shape.
    grad_m = np.sum(2 * residuals * x, axis=0)
    grad_c = np.sum(2 * residuals, axis=0)
    # Both gradients are evaluated at the old (m, c) before either is changed.
    return m - grad_m * learning_rate, c - grad_c * learning_rate

In [None]:
#define the gradient descent function
def gradient_descent(init_m,init_c,x,t,learning_rate,iterations,error_threshold):
    """Fit a line by repeatedly applying ``update``, starting from ``init_m``/``init_c``.

    Each iteration first evaluates ``error`` for the current parameters. If it
    is below ``error_threshold`` a message is printed and the loop stops without
    recording that error; otherwise the error is recorded, the parameters are
    updated, and the new ``(m, c)`` pair is recorded. At most ``iterations``
    steps are taken.

    Returns
    -------
    (m, c, error_values, mc_values) : the final parameters, the list of errors
    measured before each step, and the list of ``(m, c)`` pairs after each step.
    """
    slope, intercept = init_m, init_c
    error_history = []
    param_history = []
    for _ in range(iterations):
        current_error = error(slope, x, intercept, t)
        if current_error < error_threshold:
            print("Error less than the threshold .stopping gradient descent")
            break
        error_history.append(current_error)
        slope, intercept = update(slope, x, intercept, t, learning_rate)
        param_history.append((slope, intercept))
    return slope, intercept, error_history, param_history

In [None]:
#Run gradient descent on the training split
#%%time
init_m = 0.9
init_c = 0
learning_rate = 0.001
iterations = 250
error_threshold = 0.001
m, c, error_values, mc_values = gradient_descent(
    init_m=init_m,
    init_c=init_c,
    x=xtrain,
    t=ytrain,
    learning_rate=learning_rate,
    iterations=iterations,
    error_threshold=error_threshold,
)


In [None]:
#Model training visualization: keep every 5th (m, c) pair as an animation frame.
#Slicing to the end of the list, rather than to a hard-coded 250, keeps this in
#step with however many iterations gradient descent actually ran.
mc_values_anim = mc_values[::5]

In [None]:
# Animate how the fitted line moves over the sampled gradient-descent steps,
# drawn on top of the test points.
fig, ax = plt.subplots()
# Static background, drawn once here rather than inside init(): FuncAnimation
# can call init_func more than once, and each call would stack another
# identical scatter on the axes.
ax.scatter(xtest, ytest, color='g')
ln, = ax.plot([], [], 'ro-', animated=True)

def init():
    """Fix the axes limits for the animation and return the artists to redraw."""
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 1.0)
    return ln,

def update_frame(frame):
    """Move the line to the (m, c) pair stored at index ``frame`` of mc_values_anim."""
    m, c = mc_values_anim[frame]
    # The end points lie outside the [0, 1] view, so the segment always spans the plot.
    x1, y1 = -0.5, m * -.5 + c
    x2, y2 = 1.5, m * 1.5 + c
    ln.set_data([x1, x2], [y1, y2])
    return ln,

anim = FuncAnimation(fig, update_frame, frames=range(len(mc_values_anim)), init_func=init, blit=True)
HTML(anim.to_html5_video())


In [None]:
#Error visualization
#Plot the regression line on top of the training dataset.
plt.scatter(xtrain, ytrain, color='b')
plt.plot(xtrain, m * xtrain + c, color='r')

In [None]:
# Error recorded before each gradient-descent step, against the step number.
plt.plot(np.arange(len(error_values)), error_values)
plt.ylabel('error')
plt.xlabel('Iteration')

Prediction


In [None]:
# Predict on the test set in one vectorized operation using the fitted line.
predicted = m * xtest + c

In [None]:
# Compute the MSE of the predictions on the testing set.
# Note this is the plain mean squared error, whereas error() above returns half of it.
mean_squared_error(y_true=ytest, y_pred=predicted)

In [None]:
# Tabulate the test inputs, targets and predictions side by side.
# The arrays are (N, 1) columns (they come from the reshape(-1, 1) scaling step),
# so flatten them first: zipping them row by row would fill the frame with
# one-element arrays in object-dtype columns instead of plain numbers.
p = pd.DataFrame({
    'x': np.ravel(xtest),
    'target_y': np.ravel(ytest),
    'predicted_y': np.ravel(predicted),
})
p.head()

In [None]:
# Plot the predicted values against the target values on the test set.
plt.scatter(xtest, ytest, color='b')
plt.plot(xtest, predicted, color='r')

Revert normalization to obtain the predicted price of the house in $1000s

In [None]:
# Make sure all three arrays are (N, 1) columns before handing them to the scalers.
predicted, xtest, ytest = (arr.reshape(-1, 1) for arr in (predicted, xtest, ytest))

# Undo the min-max scaling, then take the last (only) column of each result to
# drop the extra dimension.
xtest_scaled = x_scaler.inverse_transform(xtest)[:, -1]
ytest_scaled = y_scaler.inverse_transform(ytest)[:, -1]
predicted_scaled = y_scaler.inverse_transform(predicted)[:, -1]

# Side-by-side table of inputs, targets and predictions, rounded to two decimals.
p = pd.DataFrame(
    list(zip(xtest_scaled, ytest_scaled, predicted_scaled)),
    columns=['x', 'target_y', 'predicted_y'],
).round(decimals=2)
p.head()