## Importing all the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
%matplotlib inline

## Load data

In [None]:
# Read the housing data; the path is resolved relative to the working directory.
csv_path = "BostonHousing.csv"
boston = pd.read_csv(csv_path)

## Dataset columns & descriptions

In [None]:
# Preview the first rows to check column names and value formats.
boston.head()

In [None]:
# Per-column summary statistics of the data as loaded.
boston.describe()

## Removing null values

In [None]:
# Discard every row that contains at least one missing value.
boston = boston.dropna()

In [None]:
# Summary statistics again, now that rows with nulls have been dropped.
boston.describe()

## Data Visualization

In [None]:
# Scatter of medv against rm with a fitted regression line.
rm_axes = sns.regplot(data=boston, x='rm', y='medv', fit_reg=True)
rm_axes.set_title("Relationship between rooms and price")
plt.show()

In [None]:
# Scatter of medv against lstat with a fitted regression line.
sns.regplot(x='lstat', y='medv', data=boston, fit_reg=True)
# Title the figure so it is self-describing when viewed on its own.
plt.title("Relationship between lstat and price")
plt.show()

In [None]:
# Scatter of medv against nox with a fitted regression line.
sns.regplot(x='nox', y='medv', data=boston, fit_reg=True)
# Title the figure so it is self-describing when viewed on its own.
plt.title("Relationship between nox and price")
plt.show()

In [None]:
# Scatter of medv against dis with a fitted regression line.
sns.regplot(x='dis', y='medv', data=boston, fit_reg=True)
# Title the figure so it is self-describing when viewed on its own.
plt.title("Relationship between dis and price")
plt.show()

In [None]:
# Scatter of medv against ptratio with a fitted regression line.
sns.regplot(x='ptratio', y='medv', data=boston, fit_reg=True)
# Title the figure so it is self-describing when viewed on its own.
plt.title("Relationship between ptratio and price")
plt.show()

In [None]:
# Scatter of medv against crim with a fitted regression line.
sns.regplot(x='crim', y='medv', data=boston, fit_reg=True)
# Title the figure so it is self-describing when viewed on its own.
plt.title("Relationship between crim and price")
plt.show()

In [None]:
# Distribution of log-transformed crim.
# NOTE(review): np.log gives -inf for zeros; assumes crim is strictly positive -- confirm.
plt.hist(np.log(boston['crim']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(crim)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Distribution of log-transformed rm.
plt.hist(np.log(boston['rm']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(rm)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Distribution of log-transformed lstat.
plt.hist(np.log(boston['lstat']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(lstat)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Distribution of log-transformed nox.
plt.hist(np.log(boston['nox']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(nox)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Distribution of log-transformed dis.
plt.hist(np.log(boston['dis']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(dis)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# Distribution of log-transformed ptratio.
plt.hist(np.log(boston['ptratio']))
# Label the axes so this histogram can be told apart from the others.
plt.xlabel("log(ptratio)")
plt.ylabel("count")
# Render explicitly so the cell does not echo the raw (counts, bins, patches) tuple.
plt.show()

In [None]:
# List the column labels available for feature selection.
boston.columns

In [None]:
# Predictors used by the regression formula in the model-building section.
# Selecting by label rather than by position keeps this correct if the CSV's
# column order changes, and .copy() makes the result an independent frame so
# the column assignments that follow do not raise SettingWithCopyWarning.
selected_columns = ['crim', 'nox', 'rm', 'dis', 'ptratio', 'lstat']
boston_selected_var_df = boston[selected_columns].copy()

In [None]:
# Preview the selected predictor columns.
boston_selected_var_df.head()

In [None]:
# Overwrite crim with its natural logarithm.
log_crim = np.log(boston_selected_var_df['crim'])
boston_selected_var_df['crim'] = log_crim

In [None]:
# Preview again to confirm crim now holds log-transformed values.
boston_selected_var_df.head()

In [None]:
# Add the target column: medv from the full frame, stored under the name 'price'.
boston_selected_var_df['price'] = boston['medv']

In [None]:
# Pairwise correlation matrix over the columns of the selected frame.
boston_selected_var_df.corr()

In [None]:
# Preview the frame that will be passed to the model.
boston_selected_var_df.head()

## Model Building

In [None]:
# Fit an ordinary-least-squares regression of price on the six predictors.
price_formula = 'price ~ crim + nox + rm + dis + ptratio + lstat'
model = ols(price_formula, data=boston_selected_var_df).fit()

In [None]:
# Print the fitted model's summary report.
print(model.summary())

In [None]:
# In-sample predictions: the fitted values of the model on its own training rows.
predicted_prices = model.fittedvalues

In [None]:
# Display the fitted values.
predicted_prices

In [None]:
# Observed prices (x) against the OLS model's fitted values (y).
plt.scatter(boston_selected_var_df['price'], predicted_prices)
# Label the axes; without them it is unclear which axis is the prediction.
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
# Render explicitly so the cell does not echo the PathCollection repr.
plt.show()

## Root Mean Squared Error (in-sample)

In [None]:
from sklearn.metrics import mean_squared_error

# Square root of the mean squared error between observed and fitted prices.
in_sample_mse = mean_squared_error(boston_selected_var_df['price'], predicted_prices)
error = np.sqrt(in_sample_mse)
error

## Splitting the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Target is price; features are every other column of the selected frame.
y = boston_selected_var_df['price']
x = boston_selected_var_df.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=3
)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split only.
LinReg = LinearRegression()
LinReg.fit(X=x_train, y=y_train)

In [None]:
# Predict prices for the held-out test features.
y_pred = LinReg.predict(X=x_test)

In [None]:
# Held-out observed prices (x) against the linear regression's predictions (y).
plt.scatter(y_test, y_pred)
# Label the axes; without them it is unclear which axis is the prediction.
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
# Render explicitly so the cell does not echo the PathCollection repr.
plt.show()

In [None]:
# Root mean squared error of the predictions on the held-out test split.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)