In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

# Loading the boston house pricing dataset

In [4]:
from sklearn.datasets import fetch_openml 
boston = fetch_openml(name='boston', version=1)

In [5]:
print(type(boston))

<class 'sklearn.utils._bunch.Bunch'>


In [6]:
## checking the decription of the dataset
print(boston.DESCR)

**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10

In [7]:
boston.data = boston.data.to_numpy()
print(boston.data)

[[0.00632 18.0 2.31 ... 15.3 396.9 4.98]
 [0.02731 0.0 7.07 ... 17.8 396.9 9.14]
 [0.02729 0.0 7.07 ... 17.8 392.83 4.03]
 ...
 [0.06076 0.0 11.93 ... 21.0 396.9 5.64]
 [0.10959 0.0 11.93 ... 21.0 393.45 6.48]
 [0.04741 0.0 11.93 ... 21.0 396.9 7.88]]


In [8]:
print(boston.target)

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64


In [9]:
print(boston.feature_names)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


# preparing the dataset

In [10]:
dataset = pd.DataFrame(boston.data)
# dataset.index = dataset.index.astype(str)

In [11]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [12]:
dataset['PRICE']=boston.target

In [13]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       506 non-null    object 
 1   1       506 non-null    object 
 2   2       506 non-null    object 
 3   3       506 non-null    object 
 4   4       506 non-null    object 
 5   5       506 non-null    object 
 6   6       506 non-null    object 
 7   7       506 non-null    object 
 8   8       506 non-null    object 
 9   9       506 non-null    object 
 10  10      506 non-null    object 
 11  11      506 non-null    object 
 12  12      506 non-null    object 
 13  PRICE   506 non-null    float64
dtypes: float64(1), object(13)
memory usage: 55.5+ KB


In [15]:
## Summarizing the data
dataset.describe()

Unnamed: 0,PRICE
count,506.0
mean,22.532806
std,9.197104
min,5.0
25%,17.025
50%,21.2
75%,25.0
max,50.0


In [16]:
## check the missing values
dataset.isnull().sum()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
PRICE    0
dtype: int64

# EDA

In [17]:
## correlation
dataset.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,PRICE
0,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
1,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
2,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
3,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
4,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
5,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
6,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
7,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
8,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
9,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [18]:
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [None]:
##sns.pairplot(dataset)

: 

In [None]:
# sns.regplot(x="RM",y="PRICE",data=dataset)

: 

In [None]:
# sns.regplot(x="LSTAT",y="PRICE",data=dataset)

: 

In [None]:
## Independent and dependent features
X=dataset.iloc[:,:-1]
y=dataset.iloc[:,-1]

: 

In [None]:
# train test split


: 

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

: 

In [None]:
## standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

: 

In [None]:
print(X_train)

: 

In [None]:
X_train=scaler.fit_transform(X_train)

: 

In [None]:
X_test=scaler.transform(X_test)

: 

# Model training

In [None]:
from sklearn.linear_model import LinearRegression

: 

In [None]:
reg = LinearRegression()

: 

In [None]:
reg.fit(X_train,y_train)

: 

In [None]:
print(reg.coef_)

: 

In [None]:
print(reg.intercept_)

: 

In [None]:
## on which parameters model is trained
reg.get_params()

: 

In [None]:
##prediction with test data
reg_pred = reg.predict(X_test)

: 

In [None]:
## ploat a scatter plot 
plt.scatter(y_test,reg_pred)

: 

In [None]:
residuals = y_test - reg_pred

: 

In [None]:
residuals


: 

In [None]:
## plot the residuals
sns.displot(residuals,kind="kde")

: 

In [None]:
# scatter plot with respect to prediction and residuals
#uniform distribution
plt.scatter(reg_pred,residuals)

: 

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

: 

In [None]:
print(mean_absolute_error(y_test,reg_pred))
print(mean_squared_error(y_test,reg_pred))


: 

# Rsquare and adjusted R square

R^2 = 1-SSR/sst

Adjusted R@ = 1-[(1-R2)*(n-1)/(n-k-1)]

In [None]:
from sklearn.metrics import r2_score

: 

In [None]:
score = r2_score(y_test,reg_pred)
print(score)

: 

In [None]:
1-(1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

: 

## New Data Prediction

In [None]:
boston.data[0].shape

: 

In [None]:
boston.data[0].reshape(1,-1)

: 

In [None]:
## transformation of new data
reg.predict(scaler.transform(boston.data[0].reshape(1,-1)))

: 

## pickling the model file for deployment

In [None]:
import pickle

: 

In [None]:
pickle.dump(reg,open('regmodel.pkl','wb'))

: 

In [None]:
pickled_model = pickle.load(open('regmodel.pkl','rb'))

: 

In [None]:
pickled_model.predict(scaler.transform(boston.data[0].reshape(1,-1)))

: 

: 