In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version before re-running this.
df = load_boston()

# List the entries available on the loaded dataset object.
df.keys()

# Show the dataset's own textual description.
print(df.DESCR)

# Wrap the feature values in a pandas DataFrame, labelling each column with
# its feature name, so the data is easier to inspect and manipulate.
boston = pd.DataFrame(data=df.data, columns=df.feature_names)
boston.head()  # Preview the first rows

# Append the target values as an extra 'MEDV' column next to the features.
boston["MEDV"] = df.target
boston.head()

# Cell-by-cell mask of missing values: True where a value is null.
boston.isnull()

# Number of missing values in each column.
boston.isnull().sum()

from sklearn.model_selection import train_test_split  # Splits data into training and testing subsets

# Features: every column except the target column 'MEDV'.
X = boston.drop(columns="MEDV")
# Target: the single 'MEDV' column (selecting one column yields a pandas Series).
Y = boston["MEDV"]
# X is used to predict Y.

# Hold out 15% of the rows for testing; the fixed random_state makes the
# random selection of rows reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=30,
)

# Print the shape of each piece, one per line.
# The feature frames report (rows, columns); the targets are one-dimensional,
# so their shape has a single entry followed by a trailing comma, e.g. (n,).
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

from sklearn.linear_model import LinearRegression  # The linear regression estimator
from sklearn.metrics import mean_squared_error  # Mean squared difference between expected and predicted values

# Create the model and train it on the training split
# (fit() hands back the fitted estimator, so the calls can be chained).
lin_model = LinearRegression().fit(X_train, Y_train)

# --- Performance on the training split ---
Y_train_predict = lin_model.predict(X_train)
# Root Mean Squared Error: square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(Y_train, Y_train_predict))

print("The model performance for training set")
print(f"RMSE is {rmse}")
print("\n")

# --- Performance on the held-out testing split ---
Y_test_predict = lin_model.predict(X_test)
# Note: this rebinds `rmse`, replacing the training-set value computed above.
rmse = np.sqrt(mean_squared_error(Y_test, Y_test_predict))

print("The model performance for testing set")
print(f"RMSE is {rmse}")

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu