# Simple Regression

## Boston Housing Dataset

In [None]:
# Boston Housing Dataset: load the dataset bundled with scikit-learn.
# The returned object is dict-like; later cells read its DESCR, data, target
# and feature_names entries.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed scikit-learn version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()

In [None]:
# Show the free-text description that ships with the dataset.
print(boston.DESCR)

In [None]:
import pandas as pd

# Pull the feature matrix, the response vector and the feature names
# out of the loaded dataset.
X = boston.data
y = boston.target
columns = boston.feature_names

# Tabular view: one column per feature, with the response appended as 'target'.
df_boston = pd.DataFrame(data=X, columns=columns)
df_boston['target'] = y

# Preview the first rows.
df_boston.head()

In [None]:
# Persist the assembled table (features + 'target') as boston.csv in the
# current working directory; index=False leaves the row index out of the file.
df_boston.to_csv('boston.csv', index=False)

In [None]:
# visualize the relationship between the features and the response using scatterplots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#sns.pairplot(df_boston, x_vars=boston.feature_names, y_vars='target')

# One scatter panel per feature, each plotted against the response.
features = df_boston.columns[:-1]  # every column except the trailing 'target'
n_cols = 5
n_rows = -(-len(features) // n_cols)  # ceiling division: enough rows for every feature

fig, axes = plt.subplots(n_rows, n_cols, figsize=[15, 8], constrained_layout=True)
axes = axes.flatten()

# Pair each feature with its own Axes instead of tracking a manual counter.
for ax, x in zip(axes, features):
    ax.scatter(df_boston[x], df_boston.target)
    ax.set_xlabel(x)
    ax.set_ylabel("target")

# Hide the panels left over when the grid has more cells than features.
for ax in axes[len(features):]:
    ax.set_visible(False)

plt.show()


In [None]:
# Predicting house value from a single feature (RM).
# The double brackets keep both selections as one-column DataFrames (2-D)
# rather than Series; later cells rely on that, e.g. they call .describe() on
# the residuals and fit a StandardScaler on them.
# Note: this rebinds y, which previously held the raw boston.target array.
X_rooms = df_boston[['RM']]
y = df_boston[['target']]

In [None]:
#Let's plot house value as a function of the number of rooms using matplotlib's plt.scatter
%matplotlib inline
# Price against room count, drawn on the current Axes.
# Reading the plot: more rooms lead to higher prices.
rooms_ax = plt.gca()
rooms_ax.scatter(X_rooms, y)
rooms_ax.set_xlabel('Number of rooms')  # x-axis label
rooms_ax.set_ylabel('Value of house /1000 ($)')  # y-axis label
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X_rooms,y,
                                                    test_size=0.3,
                                                    random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression #Ordinary Least Squares

# Create linear regression object (all options left at their defaults)
regr = LinearRegression()

# Train the model using the training sets.
# As the last expression of the cell, the fitted estimator is also what the
# notebook displays.
# Note: no later cell visible in this notebook refers to `regr`; the analysis
# that follows uses the GridSearchCV object `gs` instead.
regr.fit(X_train, y_train)


In [None]:
# Define the regressor, the parameter grid and the grid search.
from sklearn.linear_model import LinearRegression  # ordinary least squares
from sklearn.model_selection import GridSearchCV
import numpy as np

# LinearRegression options (the defaults are used here):
#   fit_intercept : bool, default True
#       Whether to calculate the intercept for this model.
#   normalize : bool, default False
#       Ignored when fit_intercept is False; if True, the regressors X are
#       normalized before regression.
#       NOTE(review): this option was removed in scikit-learn 1.2 — confirm
#       against the installed version before adding it to the grid.
#   copy_X : bool, default True
#       If True, X is copied; otherwise it may be overwritten.
regressor = LinearRegression()

# An empty grid holds a single candidate (the default model), so the search
# reduces to a plain 3-fold cross-validation.
parameters = {}  # e.g. 'normalize': [True, False]

# fit() returns the search object itself, so construction and fitting chain.
gs = GridSearchCV(estimator=regressor, param_grid=parameters, cv=3).fit(X_train, y_train)


In [None]:
# Report the grid-search outcome: the best candidate first, then one line per
# candidate with its mean and standard deviation of the cross-validated score.
cv_table = gs.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']

print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
for candidate in zip(means, stds, params):
    # candidate is the (mean, std, params) triple consumed by the format string
    print("%f (%f) with: %r" % candidate)

In [None]:
# Score the grid search's best estimator on the held-out test set.
# No `scoring` was passed to GridSearchCV, so this is the regressor's own
# score: the coefficient of determination R^2 of the prediction
# (1.0 is a perfect prediction).
gs.score(X_test, y_test)

In [None]:
# Training points (blue), test points (green) and the fitted line (black),
# all drawn on the current Axes.
fit_ax = plt.gca()
fit_ax.scatter(X_train, y_train, color='blue')
fit_ax.scatter(X_test, y_test, color='green')
fit_ax.plot(X_test, gs.predict(X_test), color='black', linewidth=3)
plt.show()

In [None]:
# The model selected (and refitted) by the grid search.
best_model = gs.best_estimator_

# Estimated coefficient(s) of the linear regression.
print('Coefficients: ', best_model.coef_)

# Independent term (intercept) of the linear model.
print('Intercept: ', best_model.intercept_)

In [None]:
# Residuals, with the sign convention prediction - actual
# (a positive value means the model over-estimated the price).
# The targets come from the one-column DataFrame y, so the residuals are
# DataFrames as well, which is what makes .describe() available.
error_train=gs.predict(X_train)-y_train
error_test=gs.predict(X_test)-y_test

# Summary statistics of the training residuals.
error_train.describe()


In [None]:
# Summary statistics of the test-set residuals, for comparison with the
# training residuals summarised in the previous cell.
error_test.describe()


In [None]:
# Residuals against predicted values, training set in blue and test set in
# green, with a reference line at zero residual.
pred_train = gs.predict(X_train)
pred_test = gs.predict(X_test)

plt.scatter(pred_train, error_train, c="b", label="training data")
plt.scatter(pred_test, error_test, c="g", label="test data")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
# axhline spans the full width of the axes, so the zero line no longer depends
# on a hard-coded x-range matching this particular dataset's price scale.
plt.axhline(y=0, color="r")
plt.show()

## Normality Test

In [None]:
# Flatten the one-column residual table into a 1-D NumPy array; the histogram
# and the distribution fit in the following cells take this form.
nb_error_train = np.array(error_train).flatten()

In [None]:
# Standardise the training residuals and flatten the result to 1-D; the
# normality tests at the end of the notebook use this scaled array.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_error_train = scaler.fit_transform(error_train).flatten()

In [None]:
plt.hist(nb_error_train, bins='auto')  
plt.show()

In [None]:
import numpy as np
# Import the submodule explicitly: on older SciPy releases a bare
# `import scipy` does not load scipy.stats by itself.
import scipy.stats
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
from matplotlib import pyplot as plt

# We test a normal distribution: fit it to the training residuals.
# For SciPy continuous distributions, fit() returns (*shape_params, loc, scale);
# the normal has no shape parameters, so loc and scale are its mean and std.
dist = scipy.stats.norm
param = dist.fit(nb_error_train)
*shape_params, err_mean, err_std = param

# Draw a reference sample, as long as the residual sample, from the fitted
# distribution. The fixed seed keeps the plots below reproducible.
test_dist = dist.rvs(*shape_params, loc=err_mean, scale=err_std,
                     size=len(nb_error_train), random_state=42)
test_dist.sort()

# QQ-plot using statsmodels: simulated sample against the residuals.
qqplot_2samples(test_dist, nb_error_train, line='45')
plt.show()

# Hand-made QQ-plot: compare both samples at every 5th percentile.
percs = np.linspace(0, 100, 21)
q_a = np.percentile(nb_error_train, percs)
q_b = np.percentile(test_dist, percs)

plt.plot(q_a, q_b, ls="", marker="o")
plt.title("QQ plot")
# Identity line over the joint range: points on it mean equal quantiles.
x = np.linspace(min(q_a.min(), q_b.min()), max(q_a.max(), q_b.max()))
plt.plot(x, x, color="k", ls="--")
plt.show()

# Overlaid density histograms of the residuals and the simulated sample.
plt.hist(nb_error_train, alpha=.3, density=True, bins='auto')
plt.hist(test_dist, alpha=.3, density=True, bins='auto')
plt.show()

In [None]:
# Kolmogorov-Smirnov Test
#    Test the distribution G(x) against a given distribution F(x).
#    Under the null hypothesis the two distributions are identical, G(x)=F(x).
#    Here G is the empirical distribution of the standardised training
#    residuals and F is "norm" with SciPy's default parameters, i.e. the
#    standard normal.
#    NOTE(review): the residuals were standardised with their own mean and
#    standard deviation, so the reported p-value is presumably only
#    approximate (a Lilliefors-type correction may be needed) — confirm
#    whether this matters for the analysis.

from scipy import stats
stats.kstest(scaled_error_train,"norm")
# Alternative left disabled: compare the raw residuals with the simulated sample.
#stats.kstest(nb_error_train,test_dist)

In [None]:
# D’Agostino Test
#    Null hypothesis: the sample comes from a normal distribution.
#    Relies on `stats` having been imported from scipy in an earlier cell.

stats.normaltest(scaled_error_train)

In [None]:
# Shapiro test for normality
#    Null hypothesis: the sample was drawn from a normal distribution.
#    Applied to the standardised training residuals.
from scipy import stats
stats.shapiro(scaled_error_train)