In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline

In [2]:
from sklearn.datasets import load_boston

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [None]:
boston = load_boston()

In [None]:
boston.keys()

In [None]:
# Description of the boston dataset
print(boston.DESCR)

## Preparing the Dataset

In [None]:
df = pd.DataFrame(boston.data, columns = boston.feature_names)

In [None]:
df.sample(10)

In [None]:
df['Price'] = boston.target

In [None]:
df.sample(5)

In [None]:
df.info()
# Since there is no null values we are not handling it right now

In [None]:
## Summarizing the stats of the dataset
df.describe()

In [None]:
## Check the missing values
df.isnull().sum()

## Exploratory Data Analysis

In [None]:
## Correlation
df.corr()

In [None]:
# sns.pairplot(dataset)

In [None]:
plt.scatter(df['CRIM'], df['Price'])
plt.xlabel("Crime Rate")
plt.ylabel("House Price")

In [None]:
plt.scatter(df['RM'], df['Price'])
plt.xlabel("Average No. of Rooms")
plt.ylabel("House Price")

In [None]:
sns.regplot(x = "RM", y = "Price", data = df)

In [None]:
sns.regplot(x = "LSTAT", y = "Price", data = df)

In [None]:
## Independent and dependen features

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
X_train

In [None]:
# We are using the gradient descent algorithm internally and for it to converge faster we need the data to be on the same scale
## Standardizing the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import pickle
pickle.dump(scaler, open('scaling.pkl', 'wb'))

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
regression = LinearRegression()

In [None]:
regression.fit(X_train, y_train)

In [None]:
## printing the coefficient and the intercept
print(regression.coef_)

In [None]:
print(regression.intercept_)

In [None]:
## on which parameter the model has been trained 
regression.get_params()

In [None]:
reg_pred = regression.predict(X_test)

In [None]:
## Plot a scatter plot for the prediction
plt.scatter(reg_pred, y_test)

In [None]:
## Calculating the errors (Residuals)
residuals = y_test - reg_pred

In [None]:
## Plot this residuals
sns.displot(residuals, kind = 'kde')

In [None]:
## Since the above plot in not normally distributed we can tell that there are some outliers in our model 

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

print("Mean Absolute Error =", mean_absolute_error(y_test, reg_pred))
print("Mean Squared Error =", mean_squared_error(y_test, reg_pred))
print("Root Mean Squared Error =", np.sqrt(mean_squared_error(y_test, reg_pred)))

## R Squared and Adjusted R Squared

In [None]:
from sklearn.metrics import r2_score
score = r2_score(y_test, reg_pred)
print(score)

In [None]:
# display adjusted R-squared
1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1] - 1)

## New Data Prediction

In [None]:
boston.data[0].reshape(1, -1)

In [None]:
## Transformation of new data 
scaler.transform(boston.data[0].reshape(1, -1))

In [None]:
regression.predict(scaler.transform(boston.data[0].reshape(1, -1)))

## Pickeling the model

In [None]:
import pickle

In [None]:
pickle.dump(regression, open('regmodel.pkl', 'wb'))

In [None]:
pickle_model = pickle.load(open('regmodel.pkl', 'rb'))

In [None]:
pickle_model.predict(scaler.transform(boston.data[0].reshape(1, -1)))