#### For the project, a training dataset with real estate prices in Boston was used

In [1]:
from sklearn.datasets import load_boston
import pandas as pd

In [2]:
data = load_boston()

In [12]:
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MEDV'] = data.target

In [18]:
#dataset information
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [14]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### checking for missing values

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


no missing values

### Split into train and test samples

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [21]:
# It is necessary to separate the columns with features from the column with the target variable
X = df[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT']]
y = df['MEDV']

In [24]:
# Split data into training part and validation part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
264,0.55007,20.0,3.97,0.0,0.6470,7.206,91.6,1.9301,5.0,264.0,13.0,387.89,8.10
167,1.80028,0.0,19.58,0.0,0.6050,5.877,79.2,2.4259,5.0,403.0,14.7,227.61,12.14
24,0.75026,0.0,8.14,0.0,0.5380,5.924,94.1,4.3996,4.0,307.0,21.0,394.33,16.30
432,6.44405,0.0,18.10,0.0,0.5840,6.425,74.8,2.2004,24.0,666.0,20.2,97.95,12.03
45,0.17142,0.0,6.91,0.0,0.4480,5.682,33.8,5.1004,3.0,233.0,17.9,396.90,10.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,1.13081,0.0,8.14,0.0,0.5380,5.713,94.1,4.2330,4.0,307.0,21.0,360.17,22.60
279,0.21038,20.0,3.33,0.0,0.4429,6.812,32.2,4.1007,5.0,216.0,14.9,396.90,4.85
246,0.33983,22.0,5.86,0.0,0.4310,6.108,34.9,8.0555,7.0,330.0,19.1,390.18,9.16
395,8.71675,0.0,18.10,0.0,0.6930,6.471,98.8,1.7257,24.0,666.0,20.2,391.98,17.12


In [26]:
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
303,0.10000,34.0,6.09,0.0,0.4330,6.982,17.7,5.4917,7.0,329.0,16.1,390.43,4.86
204,0.02009,95.0,2.68,0.0,0.4161,8.034,31.9,5.1180,4.0,224.0,14.7,390.55,2.88
186,0.05602,0.0,2.46,0.0,0.4880,7.831,53.6,3.1992,3.0,193.0,17.8,392.63,4.45
417,25.94060,0.0,18.10,0.0,0.6790,5.304,89.1,1.6475,24.0,666.0,20.2,127.36,26.64
253,0.36894,22.0,5.86,0.0,0.4310,8.259,8.4,8.9067,7.0,330.0,19.1,396.90,3.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,9.92485,0.0,18.10,0.0,0.7400,6.251,96.6,2.1980,24.0,666.0,20.2,388.52,16.44
15,0.62739,0.0,8.14,0.0,0.5380,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47
396,5.87205,0.0,18.10,0.0,0.6930,6.405,96.0,1.6768,24.0,666.0,20.2,396.90,19.37
169,2.44953,0.0,19.58,0.0,0.6050,6.402,95.2,2.2625,5.0,403.0,14.7,330.04,11.32


In [27]:
y_train

264    36.5
167    23.8
24     15.6
432    16.1
45     19.3
       ... 
30     12.7
279    35.1
246    24.3
395    13.1
371    50.0
Name: MEDV, Length: 404, dtype: float64

In [28]:
y_test

303    33.1
204    50.0
186    50.0
417    10.4
253    42.8
       ... 
447    12.6
15     19.9
396    12.5
169    22.3
131    19.6
Name: MEDV, Length: 102, dtype: float64

### Creating a model

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
lr = LinearRegression() 

In [31]:
lr.fit(X_train, y_train)

LinearRegression()

In [32]:
lr.coef_

array([-1.13418838e-01,  4.76085334e-02, -4.88128847e-03,  2.32530685e+00,
       -1.77465478e+01,  3.24645387e+00,  1.48094204e-02, -1.41432232e+00,
        3.27820337e-01, -1.24699213e-02, -9.10261201e-01,  8.59929825e-03,
       -5.67236504e-01])

### Let's evaluate the quality of the model

In [None]:
lr.predict(X_test)

array([ 0.39899112, 19.44100544, 27.26735339, 22.33877062,  2.83232403,
       18.13114053,  2.12777869, 38.87230242, 16.12977417, 10.44832866,
       -4.63413773, 13.23994787, 20.5016005 , 24.20562149, 13.10565331,
       18.68719105, 19.57493626, 25.13088912,  7.06461479, 20.84767088,
       20.24611447, 21.88773632, 24.2225279 , 20.99181051,  6.81169276,
        8.46252915, 25.15682264, 20.66515352, 23.18100734, 25.2376624 ,
        6.35747084, 23.88976507, 16.48379244, 17.65335157, 21.89812955,
       24.19613449, 28.95347145, 20.41073479, 11.77325734, 35.16485318,
       40.48083224, 25.68961217, 33.58520442, 19.16359751, 16.35443716,
       21.03186572, 22.93813266, 30.85270844, 20.64667529, 12.70129002,
       22.50966657, 13.31730907, 22.31388536, 13.53033627, 30.62158511,
       13.6141422 , 22.82922189, 21.84173041, 34.36415001, 18.17408605,
       16.46869469, 18.82085748, 18.12920185,  6.2614679 , 18.02776338,
       31.84183164, 37.19274391, 16.83069405, 22.57225236, 15.99

In [33]:
pd.DataFrame([y_test.values, lr.predict(X_test)]).T

Unnamed: 0,0,1
0,33.1,32.029150
1,50.0,42.137397
2,50.0,35.076007
3,10.4,7.124037
4,42.8,28.703180
...,...,...
97,12.6,18.296469
98,19.9,19.556993
99,12.5,19.228493
100,22.3,26.369932


In [34]:
## Evaluating with MSE
from sklearn.metrics import mean_squared_error

In [35]:
mean_squared_error(y_test, lr.predict(X_test))

25.727293651528836

In [36]:
mean_squared_error(y_train, lr.predict(X_train))

21.166075581906746