In [0]:
# import statistical tools
import numpy as np
import pandas as pd
import sklearn
from statsmodels.formula.api import ols
import statsmodels as sm
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from statsmodels.stats.outliers_influence import summary_table

In [0]:
# import data visualisation tools
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# Load the Boston CSV through Spark, convert it to a pandas frame with every
# column cast to float, and index the rows by the `SlNo` column.
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/Boston.csv"
Boston = (
    spark.read
    .option("header", "true")   # first CSV row holds the column names
    .csv(url)
    .toPandas()
    .astype(float)              # cast happens before indexing, so SlNo is float too
    .set_index('SlNo')
)

In [0]:
# Quick look at the data: preview the first five rows.
Boston.head(n = 5)

In [0]:
# Pairwise correlation matrix of all columns (Pearson is the pandas default).
Boston.corr(method = "pearson")

In [0]:
# fit model through linear regression: regress `medv` on `lstat`.
# The formula names the DataFrame columns directly, so `data = Boston` is what
# actually supplies the variables (previously "Y~X" was resolved from the
# notebook namespace and `data` was effectively ignored). The slope is now
# labelled `lstat` in the results, and new data can be passed to
# `model.predict` using the real column name.
Y = Boston['medv']    # response, kept as a Series for the plotting cell
X = Boston['lstat']   # predictor, kept as a Series for the plotting cell
model = ols("medv ~ lstat", data = Boston).fit()

In [0]:
# Regression results table; alpha = 0.05 is the statsmodels default for the
# coefficient confidence intervals, spelled out here for the reader.
model.summary(alpha = 0.05)

In [0]:
# predict the model: in-sample fitted values with their interval bounds.
# alpha is the significance level, so 0.05 yields 95% intervals (the earlier
# value of 0.5 produced 50% intervals, which is almost never what is wanted).
dt = summary_table(model, alpha = 0.05)[1]   # element 1 is the raw numeric array

# Column positions follow statsmodels' summary_table layout: column 2 holds the
# predicted value, columns 6 and 7 the lower/upper bounds of the interval for
# individual predictions -- NOTE(review): confirm against the statsmodels docs.
Y_prd = dt[:, 2]
Yprd_ci_lower, Yprd_ci_upper = dt[:, 6:8].T

# Labelled preview instead of anonymous 0/1/2 column headers.
pd.DataFrame({
    "Y_prd": Y_prd,
    "Yprd_ci_lower": Yprd_ci_lower,
    "Yprd_ci_upper": Yprd_ci_upper,
}).head()

In [0]:
# plot graph with regression line
# Left panel: scatter of Y against X with seaborn's fitted regression line.
# Right panel: residuals of that linear fit with a LOWESS smoother overlaid.
plt.xkcd()   # hand-drawn look; note this changes matplotlib's rcParams globally

# One figure with two explicit axes, rather than two figures addressed by the
# hard-coded numbers 1 and 2 (which breaks as soon as another figure is open).
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize = (25, 10))

# x/y are passed by keyword: recent seaborn releases reject positional data
# vectors. `data` is omitted because X and Y are already Series (the fitted
# model object is not a valid `data` argument).
sns.regplot(x = X, y = Y, color = 'g', ax = ax_fit)
ax_fit.set_title("Linear Model")

sns.residplot(x = X, y = Y, lowess = True, color = 'r', ax = ax_resid)
ax_resid.set_title("Non-Linear Model");   # trailing ';' hides the Text repr