In [None]:
# Importing the libraries 
import pandas as pd
import numpy as np
from sklearn import metrics

In [None]:
# Load the Boston housing data as an object exposing .data, .target and
# .feature_names (the three attributes the following cells read).
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import below raises ImportError. In that case the same arrays are rebuilt from
# the original StatLib file, following the recipe from scikit-learn's
# deprecation notice (this path needs network access).
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # In the raw file every record spans two physical lines: 11 values on the
    # first, then B, LSTAT and the target (MEDV) on the second.
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

In [None]:
# Wrap the raw feature matrix in a DataFrame (columns are still positional
# integers at this point; names are attached in a later cell).
df = pd.DataFrame(data=boston.data)

In [None]:
# Quick look at the first eight rows.
df.head(n=8)

In [None]:
# Label the columns with the dataset's feature names.
df.columns=boston.feature_names


In [None]:
# Shape before the target column (PRICE) is added, for comparison with the
# shape shown after it is appended.
df.shape

In [None]:
# Append the regression target as a new 'PRICE' column.
df['PRICE'] = boston.target 
# Median value of owner-occupied homes in $1000s
df.head()

In [None]:
# Shape after adding the target: one more column than before.
df.shape

In [None]:
# Inspect the dtype of every column to confirm each was parsed as expected.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

In [None]:
# Number of missing values per column; a column of zeros means nothing is missing.
df.isna().sum()

In [None]:
#checking outliers using boxplot
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# One boxplot per column on a 2 x 7 grid, to eyeball outliers.
fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for panel_idx, column_name in enumerate(df.columns):
    sns.boxplot(y=column_name, data=df, ax=axs[panel_idx])
plt.tight_layout(pad=0.5, w_pad=0.78, h_pad=4.0)


In [None]:
# Percentage of IQR-rule outliers in each column.
# A value counts as an outlier when it lies strictly beyond 1.5 * IQR from the
# quartiles. The comparison has to be strict: for a column whose IQR is 0 (for
# example a mostly-constant indicator) both fences collapse onto the quartile,
# and an inclusive test would flag every value sitting on it, reporting close
# to 100% outliers.
for tar, var in df.items():
    q1 = var.quantile(0.25)
    q3 = var.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    var_col = var[(var < lower_fence) | (var > upper_fence)]
    perc = len(var_col) * 100.0 / len(df)
    print("Column %s outliers = %.2f%%" % (tar, perc))
                  

In [None]:
# Pairwise Pearson correlation between all columns (features and target).
corr = df.corr(method='pearson')
corr

In [None]:
# Heatmap of the absolute correlations, so strong positive and strong negative
# relationships are shaded alike.
plt.figure(figsize=(20, 20))
abs_corr = corr.abs()
sns.heatmap(abs_corr, annot=True, cmap='Greens')

In [None]:
# Distribution of every column on a 2 x 7 grid, to judge skewness.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(24, 12))
axs = axs.flatten()
for panel_idx, (tar, var) in enumerate(df.items()):
    sns.distplot(var, ax=axs[panel_idx])
plt.tight_layout(pad=0.5, w_pad=0.6, h_pad=5.0)

In [None]:
# Separate the target (PRICE) from the independent variables.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

In [None]:
# Preview the feature matrix; head() avoids dumping the entire frame into the
# notebook output.
X.head()

In [None]:
# Hold out 40% of the rows as a test set for validating the models; the fixed
# random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=10,
)

<h1>1. LINEAR REGRESSION</h1>

In [None]:
# Ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

# Instantiate the regressor, then fit it on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Fitted intercept of the linear model (prediction when every feature is 0).
lm.intercept_

In [None]:
# Tabulate the fitted coefficient of each feature.
# Building the frame from a dict keeps 'Coefficients' numeric; stacking the two
# arrays as rows and transposing leaves both columns with dtype object.
coeffcients = pd.DataFrame({
    'Attribute': list(X_train.columns),
    'Coefficients': lm.coef_,
})
coeffcients

In [None]:
# In-sample fit of the linear model: predict the training rows and report the
# usual regression metrics.
y_pred = lm.predict(X_train)

n_train, n_features = len(y_train), X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)

print('R^2 =', train_r2)
# Adjusted R^2 penalises R^2 for the number of predictors.
print('Adjusted R^2 =', 1 - (1 - train_r2) * (n_train - 1) / (n_train - n_features - 1))
print('MAE =', metrics.mean_absolute_error(y_train, y_pred))
print('MSE =', train_mse)
print('RMSE =', np.sqrt(train_mse))

<h2>Model Validation</h2>

In [None]:
# Predicted vs. actual prices on the training set.
plt.scatter(x=y_train, y=y_pred)
plt.title("Actual Price vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# Left panel: actual vs. predicted prices on the training set.
import matplotlib.pyplot as plt 
import seaborn as sns
f = plt.figure(figsize=(14,5))
ax = f.add_subplot(121)
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.scatterplot(x=y_train, y=y_pred, ax=ax, color='r')
ax.set_title('Actual Vs Predicted value')

# Right panel: distribution of the residuals (actual - predicted) with their
# mean marked by a dashed line; a roughly symmetric shape centred on 0 is the
# desired outcome.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
ax = f.add_subplot(122)
a=(y_train - y_pred)
sns.distplot(a,ax=ax,color='b')
ax.axvline(a.mean(),color='k',linestyle='--')
ax.set_title('Check for Residual normality & mean: \n Residual eror');

In [None]:
# Computes 1/(1 - R^2) from the linear model's R^2 on the test set.
# NOTE(review): this has the shape of the variance-inflation-factor formula, but
# a VIF uses the R^2 obtained by regressing one feature on the other features.
# Here R^2 is the model's score against the target, so the value does not
# measure multicollinearity — confirm what was intended.
R_square = lm.score(X_test,y_test)
VIF_LR = 1/(1- R_square)
VIF_LR

<h2>Evaluating the model on test data</h2>

In [None]:
# Out-of-sample evaluation of the linear model on the held-out test split.
y_tpred = lm.predict(X_test)

n_test, n_features = len(y_test), X_test.shape[1]
tpred_linreg = metrics.r2_score(y_test, y_tpred)
test_mse = metrics.mean_squared_error(y_test, y_tpred)

print('R^2:', tpred_linreg)
print('Adjusted R^2:', 1 - (1 - tpred_linreg) * (n_test - 1) / (n_test - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_tpred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

<h1>2. SVM REGRESSION</h1>

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
# Note: X_train / X_test are rebound to the scaled arrays from here on.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)


In [None]:
# Support-vector regression with scikit-learn's default settings.
from sklearn import svm
reg = svm.SVR()

# Fit on the standardised training features.
reg.fit(X=X_train, y=y_train)

In [None]:
# Predict on the training data with the fitted SVR (in-sample predictions).
y_pred= reg.predict(X_train)

In [None]:
# In-sample regression metrics for the SVR predictions in y_pred.
n_train, n_features = len(y_train), X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)

print('R^2 =', train_r2)
# Adjusted R^2 penalises R^2 for the number of predictors.
print('Adjusted R^2 =', 1 - (1 - train_r2) * (n_train - 1) / (n_train - n_features - 1))
print('MAE =', metrics.mean_absolute_error(y_train, y_pred))
print('MSE =', train_mse)
print('RMSE =', np.sqrt(train_mse))

<h2>Model Validation</h2>

In [None]:
# Predicted vs. actual prices on the training set.
plt.scatter(x=y_train, y=y_pred)
plt.title("Actual Price vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# Left panel: actual vs. predicted prices on the training set.
import matplotlib.pyplot as plt 
import seaborn as sns
f = plt.figure(figsize=(14,5))
ax = f.add_subplot(121)
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.scatterplot(x=y_train, y=y_pred, ax=ax, color='r')
ax.set_title('Actual Vs Predicted value')

# Right panel: distribution of the residuals (actual - predicted) with their
# mean marked by a dashed line.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
ax = f.add_subplot(122)
a=(y_train - y_pred)
sns.distplot(a,ax=ax,color='b')
ax.axvline(a.mean(),color='k',linestyle='--')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title('Check for Residual normality & mean: \n Residual eror');

In [None]:
# Computes 1/(1 - R^2) from the SVR's R^2 on the test set.
# The score must come from reg (the SVR). lm is the linear model, which was
# fitted on the unscaled features, so scoring it on the standardised X_test
# would yield a meaningless number for this section.
# NOTE(review): a variance inflation factor uses the R^2 from regressing one
# feature on the others; this value is derived from the model's score against
# the target and does not measure multicollinearity — confirm what was intended.
R_square = reg.score(X_test, y_test)
VIF_SVR = 1/(1- R_square)
VIF_SVR

<h2>Evaluating the model on test data</h2>

In [None]:
# Out-of-sample evaluation of the SVR on the held-out (standardised) test split.
y_tpred = reg.predict(X_test)

n_test, n_features = len(y_test), X_test.shape[1]
tpred_svm = metrics.r2_score(y_test, y_tpred)
test_mse = metrics.mean_squared_error(y_test, y_tpred)

print('R^2:', tpred_svm)
print('Adjusted R^2:', 1 - (1 - tpred_svm) * (n_test - 1) / (n_test - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_tpred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

<h1>3. RANDOM FOREST REGRESSOR</h1>

In [None]:
# Random forest regressor fitted on the training split.
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state pins the bootstrap samples and feature draws, so the
# scores reported for this model are reproducible across kernel restarts.
rfr = RandomForestRegressor(random_state=10)
rfr.fit(X_train, y_train)

In [None]:
# Predict on the training data with the fitted random forest (in-sample).
y_pred=rfr.predict(X_train)

In [None]:
# In-sample regression metrics for the random-forest predictions in y_pred.
n_train, n_features = len(y_train), X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)

print('R^2:', train_r2)
# Adjusted R^2 penalises R^2 for the number of predictors.
print('Adjusted R^2:', 1 - (1 - train_r2) * (n_train - 1) / (n_train - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))

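The adjusted R^2 value is very good. Note that it is measured on the training data the forest was fitted to, so it should be confirmed against the test-set score below before drawing conclusions.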

<h2>Model Validation</h2>

In [None]:
# Predicted vs. actual prices on the training set.
plt.scatter(x=y_train, y=y_pred)
plt.title("Actual Price vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# Left panel: actual vs. predicted prices on the training set.
import matplotlib.pyplot as plt 
import seaborn as sns
f = plt.figure(figsize=(14,5))
ax = f.add_subplot(121)
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.scatterplot(x=y_train, y=y_pred, ax=ax, color='g')
ax.set_title('Actual Vs Predicted value')
# Right panel: distribution of the residuals (actual - predicted) with their
# mean marked by a dashed line.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
ax = f.add_subplot(122)
a=(y_train - y_pred)
sns.distplot(a,ax=ax,color='b')
ax.axvline(a.mean(),color='k',linestyle='--')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title('Check for Residual normality & mean: \n Residual eror');


In [None]:
# Computes 1/(1 - R^2) from the random forest's R^2 on the test set.
# NOTE(review): a variance inflation factor uses the R^2 from regressing one
# feature on the others; this value is derived from the model's score against
# the target and does not measure multicollinearity — confirm what was intended.
R_square = rfr.score(X_test, y_test)
unexplained_share = 1 - R_square
VIF_RFR = 1 / unexplained_share
VIF_RFR

<h2>Evaluating the model on test data</h2>

In [None]:
# Out-of-sample evaluation of the random forest on the held-out test split.
y_tpred = rfr.predict(X_test)

n_test, n_features = len(y_test), X_test.shape[1]
tpred_rfr = metrics.r2_score(y_test, y_tpred)
test_mse = metrics.mean_squared_error(y_test, y_tpred)

print('R^2:', tpred_rfr)
print('Adjusted R^2:', 1 - (1 - tpred_rfr) * (n_test - 1) / (n_test - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_tpred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

<h1>4. XGBOOST REGRESSOR</h1>

In [None]:
# Import the XGBoost regressor.
from xgboost import XGBRegressor
# Instantiate it with the library's default hyperparameters.
xgbr= XGBRegressor()
# Fit on the training split (X_train was standardised earlier in the notebook).
xgbr.fit(X_train, y_train)

In [None]:
# Predict on the training data with the fitted XGBoost model (in-sample).
y_pred=xgbr.predict(X_train)

In [None]:
# In-sample regression metrics for the XGBoost predictions in y_pred.
n_train, n_features = len(y_train), X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)

print('R^2 =', train_r2)
# Adjusted R^2 penalises R^2 for the number of predictors.
print('Adjusted R^2 =', 1 - (1 - train_r2) * (n_train - 1) / (n_train - n_features - 1))
print('MAE =', metrics.mean_absolute_error(y_train, y_pred))
print('MSE =', train_mse)
print('RMSE =', np.sqrt(train_mse))

<h2>Model Validation</h2>

In [None]:
# Predicted vs. actual prices on the training set.
plt.scatter(x=y_train, y=y_pred)
plt.title("Actual Price vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# Left panel: actual vs. predicted prices on the training set.
import matplotlib.pyplot as plt 
import seaborn as sns
f = plt.figure(figsize=(14,5))
ax = f.add_subplot(121)
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.scatterplot(x=y_train, y=y_pred, ax=ax, color='g')
ax.set_title('Actual Vs Predicted value')

# Right panel: distribution of the residuals (actual - predicted) with their
# mean marked by a dashed line.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
ax = f.add_subplot(122)
a=(y_train - y_pred)
sns.distplot(a,ax=ax,color='b')
ax.axvline(a.mean(),color='k',linestyle='--')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title('Check for Residual normality & mean: \n Residual eror');


In [None]:
# Computes 1/(1 - R^2) from the XGBoost model's R^2 on the test set.
# NOTE(review): a variance inflation factor uses the R^2 from regressing one
# feature on the others; this value is derived from the model's score against
# the target and does not measure multicollinearity — confirm what was intended.
R_square = xgbr.score(X_test, y_test)
unexplained_share = 1 - R_square
VIF_XGBR = 1 / unexplained_share
VIF_XGBR

<h2>Evaluating the model on test data</h2>

In [None]:
# Out-of-sample evaluation of the XGBoost model on the held-out test split.
y_tpred = xgbr.predict(X_test)

n_test, n_features = len(y_test), X_test.shape[1]
tpred_xgbr = metrics.r2_score(y_test, y_tpred)
test_mse = metrics.mean_squared_error(y_test, y_tpred)

print('R^2:', tpred_xgbr)
print('Adjusted R^2:', 1 - (1 - tpred_xgbr) * (n_test - 1) / (n_test - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_tpred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

<h1> Choosing the best model</h1>

In [None]:
# Rank the four models by their test-set R^2, expressed as a percentage.
test_r2_by_model = [
    ('Linear Regression', tpred_linreg),
    ('Random Forest', tpred_rfr),
    ('XGBoost', tpred_xgbr),
    ('Support Vector Machines', tpred_svm),
]
models = pd.DataFrame({
    'Model': [label for label, _ in test_r2_by_model],
    'R-squared Score': [score * 100 for _, score in test_r2_by_model],
})
models.sort_values(by='R-squared Score', ascending=False)