#Project 3

Avik Bhattacharya

Some of the comments are similar to the comments of Project 2.

##Imports/Setups

In [1]:
#Graphical libraries
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
from IPython.display import Image
from IPython.display import display
plt.style.use('seaborn-white')

In [2]:
!pip install --upgrade --q scipy

In [3]:
#Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import scipy.stats as stats 
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, RegularGridInterpolator, griddata, LinearNDInterpolator, NearestNDInterpolator
from math import ceil
from scipy import linalg

#Used in scikit compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [4]:
#Will be used later.
scale = StandardScaler()

#Will be used in Question 3.
lm = LinearRegression()

In [5]:
#Datasets used
cars = pd.read_csv('drive/MyDrive/ADV Applied Machine Learning/Module 1/Day 3/cars.csv')
concrete = pd.read_csv('drive/MyDrive/ADV Applied Machine Learning/Module 1/Day 3/concrete.csv')
housing = pd.read_csv('drive/MyDrive/ADV Applied Machine Learning/Homework/housing.csv')

In [9]:
#Euclidean distance between all the observations in u and v.
#Used in the lowess
def dist(u,v):
  if len(v.shape)==1: #If v is one dimensional, it is forced into a row vector.
    v = v.reshape(1,-1)
  d = np.array([np.sqrt(np.sum((u-v[i])**2,axis=1)) for i in range(len(v))])
  return d

In [12]:
#Lowess from class with an input to pick the kernel for Question 2
def lw_ag_md(x, y, xnew,f=2/3,iter=3, intercept=True, kernel='Tricubic'):

  n = len(x) #Number of observations
  r = int(ceil(f * n)) #Calculating the size of the neighborhood
  yest = np.zeros(n)

  if len(y.shape)==1: #Here we make column vector
    y = y.reshape(-1,1)

  if len(x.shape)==1: #Also making column vectors.
    x = x.reshape(-1,1)
  
  if intercept:
    x1 = np.column_stack([np.ones((len(x),1)),x])
  else:
    x1 = x

  #Finds the difference between each value of x, sorts them, and only gives you the values in the neighberhood.
  h = [np.sort(np.sqrt(np.sum((x-x[i])**2,axis=1)))[r] for i in range(n)]
  
  #Calculates the Euclidean distance and makes sure the weights don't go above 1 or under 0.
  w = np.clip(dist(x,x) / np.array(h), 0.0, 1.0)
  
  #Creates the weight using a kernel of your choice. Default is tricubic.
  #Kernels are defined later on in Question 2.
  if(kernel=='Tricubic'):
    w = Tricubic(w)
  elif(kernel=='Epanechnikov'):
    w = Epanechnikov(w)
  elif(kernel=='Quartic'):
    w = Quartic(w)
  elif(kernel=='Gaussian'):
    w = Gaussian(w)
  else:
    w = Tricubic(w)

  #Looping through all X-points
  delta = np.ones(n)
  for iteration in range(iter):
    for i in range(n):
      #The multiplication of two diagonal matrices will create another diagonal matrix.
      W = np.diag(delta).dot(np.diag(w[i,:]))
      b = np.transpose(x1).dot(W).dot(y)
      A = np.transpose(x1).dot(W).dot(x1)
      ##
      A = A + 0.0001*np.eye(x1.shape[1]) #If we want L2 regularization for solving the system
      beta = linalg.solve(A, b)
      #beta, res, rnk, s = linalg.lstsq(A, b)
      yest[i] = np.dot(x1[i],beta.ravel())
      #The .ravel() method used above and below returns a flattened array.

    residuals = y.ravel() - yest
    s = np.median(np.abs(residuals))
    delta = np.clip(residuals / (6.0 * s), -1, 1) #Clips the very high and low outliers.
    delta = (1 - delta ** 2) ** 2 #Very low residuals get centered at a weight of 1.
    
  #Here we are making predictions for xnew by using an interpolation and the predictions we made for the train data
  if x.shape[1]==1:
    f = interp1d(x.flatten(),yest,fill_value='extrapolate')
    output = f(xnew)
  else:
    output = np.zeros(len(xnew))
    for i in range(len(xnew)):
      ind = np.argsort(np.sqrt(np.sum((x-xnew[i])**2,axis=1)))[:r]
      #Has Delauney triangulation work
      #Also prevents code from running too long.
      pca = PCA(n_components=3)
      x_pca = pca.fit_transform(x[ind])
      tri = Delaunay(x_pca,qhull_options='QJ Pp')
      f = LinearNDInterpolator(tri,yest[ind])
      output[i] = f(pca.transform(xnew[i].reshape(1,-1))) 
      #The output may have NaN's where the data points from xnew are outside the convex hull of X

  if sum(np.isnan(output))>0:
    g = NearestNDInterpolator(x,yest.ravel()) 
    # output[np.isnan(output)] = g(X[np.isnan(output)])
    output[np.isnan(output)] = g(xnew[np.isnan(output)])
  return output

In [13]:
#From Project 2 instead of the class example because it has the kernel variable included.
class Lowess_AG_MD:
    def __init__(self, f = 1/10, iter = 3, intercept=True, kernel='Tricubic'):
        self.f = f
        self.iter = iter
        self.intercept = intercept
        self.kernel= kernel
    
    def fit(self, x, y):
        f = self.f
        iter = self.iter
        kernel = self.kernel
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        f = self.f
        iter = self.iter
        intercept = self.intercept
        kernel = self.kernel
        return lw_ag_md(x, y, x_new, f, iter, intercept, kernel) #Version of lowess from above.

    def get_params(self, deep=True):
    #Suppose this estimator has parameters "f", "iter" , "intercept, and "kernel"
        return {"f": self.f, "iter": self.iter,"intercept":self.intercept,"kernel":self.kernel}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

##Question 1

Implement the Gradient Boosting algorithm with user defined choices for Regressor_1 and Regressor_2

In [16]:
#Added f1/f2, iter1/iter2, and kernel1/kernel2 to allow user defined regressors and allow the regressors to be different. 
def boosted_lwr(x, y, xnew, f1=1/3, f2=1/3, iter1=2, iter2=2, intercept=True, kernel1='Epanechnikov', kernel2='Epanechnikov'):
  #We need decision trees
  #For training the boosted method we use x and y
  model1 = Lowess_AG_MD(f=f1, iter=iter1, kernel=kernel1) #We need this for training the Decision Tree
  model1.fit(x,y)
  residuals1 = y - model1.predict(x)
  model2 = Lowess_AG_MD(f=f2, iter=iter2, kernel=kernel2)
  #model2 = RandomForestRegressor(n_estimators=200,max_depth=9)
  model2.fit(x, residuals1)
  output = model1.predict(xnew) + model2.predict(xnew)
  return output

Doing a quick test of the boosted on the concrete dataset and seeing if varying the regressors changes the accuracy.

In [18]:
#I use this structure throughout the project.
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function with identical regressors
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=25/len(xtrain), iter1=1, iter2=1)

mse(ytest,yhat) #To test accuracy.

57.73151261936397

In [19]:
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function with different regressors
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=343/len(xtrain), iter1=1, iter2=3)

mse(ytest,yhat) #To test accuracy.

59.80109374389222

Using different regressors seem to hurt accuracy.

##Question 2

Test the Boosted Locally Weighted Regressor with different choices of data (such as "cars.csv", "concrete.csv" and "housing.csv") and different choice of kernels, such as Gaussian, Tricubic, Epanechnikov and Quartic.

Quartic seems to be the best performing kernel across all datasets.

In [7]:
#Kernels as coded in class with my descriptions.

def Gaussian(w):
  #Gaussian kernels are flatter in comparison to the other kernels and don't peak nearly as high.
  #They also don't hit 0 as the inputs approach 1 to -1.
  return np.where(w>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

def Tricubic(w):
  #Tricubic kernels output flattens to 0 as the input approaches -1 and 1.
  #Also flattens to 70/81 for output as input is near 0.
  return np.where(w>1,0,70/81*(1-w**3)**3)

def Quartic(w):
  #Quartic kernels output slightly flattens to 0 as the input approaches -1 and 1.
  #Looks like a negative quadratic graph.
  return np.where(w>1,0,15/16*(1-w**2)**2)

def Epanechnikov(w):
  #Epanechnikov doesn't flatten out to 0 but just the curve continues to 0 as the input approaches -1 and 1.
  #Not as high peak as tricubic and quartic kernels.
  return np.where(w>1,0,3/4*(1-w**2))

###Cars Dataset

For the cars dataset it seems that Quartic kernel seems to do the best followed by Gaussian, Tricubic, and finally Epanechnikov.

In [25]:
x = cars.loc[:,'CYL':'WGT'].values
y = cars['MPG'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the cars dataset using a Gaussian kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Gaussian', kernel2='Gaussian')

mse(ytest,yhat) #To test accuracy.

17.171564080962355

In [26]:
x = cars.loc[:,'CYL':'WGT'].values
y = cars['MPG'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the cars dataset using a Tricubic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Tricubic', kernel2='Tricubic')

mse(ytest,yhat) #To test accuracy.

17.19057553846594

In [27]:
x = cars.loc[:,'CYL':'WGT'].values
y = cars['MPG'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the cars dataset using a Epanechnikov kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Epanechnikov', kernel2='Epanechnikov')

mse(ytest,yhat) #To test accuracy.

17.49105150231541

In [28]:
x = cars.loc[:,'CYL':'WGT'].values
y = cars['MPG'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the cars dataset using a Quartic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Quartic', kernel2='Quartic')

mse(ytest,yhat) #To test accuracy.

17.13904731796445

###Concrete Dataset

For the concrete dataset it seems that Quartic kernel seems to do the best followed by Epanechnikov, Tricubic, and finally Gaussian.

In [21]:
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the concrete dataset using a Gaussian kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=25/len(xtrain), iter1=1, iter2=1, kernel1='Gaussian', kernel2='Gaussian')

mse(ytest,yhat) #To test accuracy.

110.19334505529065

In [22]:
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the concrete dataset using a Tricubic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=25/len(xtrain), iter1=1, iter2=1, kernel1='Tricubic', kernel2='Tricubic')

mse(ytest,yhat) #To test accuracy.

58.071638124882085

In [23]:
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the concrete dataset using a Epanechnikov kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=25/len(xtrain), iter1=1, iter2=1, kernel1='Epanechnikov', kernel2='Epanechnikov')

mse(ytest,yhat) #To test accuracy.

57.73151261936397

In [24]:
x = concrete.loc[:,'cement':'age'].values
y = concrete['strength'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the concrete dataset using a Quartic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=25/len(xtrain), f2=25/len(xtrain), iter1=1, iter2=1, kernel1='Quartic', kernel2='Quartic')

mse(ytest,yhat) #To test accuracy.

57.609741816889

###Housing Dataset

For the housing dataset it seems that Quartic kernel seems to do the best followed by Tricubic, Epanechnikov, and finally Gaussian.

In [30]:
x = housing.drop('river',axis=1).loc[:,'crime':'lstat'].values
y = housing['cmedv'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the housing dataset using a Gaussian kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Gaussian', kernel2='Gaussian')

mse(ytest,yhat) #To test accuracy.

29.700252735630144

In [31]:
x = housing.drop('river',axis=1).loc[:,'crime':'lstat'].values
y = housing['cmedv'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the housing dataset using a Tricubic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Tricubic', kernel2='Tricubic')

mse(ytest,yhat) #To test accuracy.

19.775496874475312

In [32]:
x = housing.drop('river',axis=1).loc[:,'crime':'lstat'].values
y = housing['cmedv'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the housing dataset using a Epanechnikov kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Epanechnikov', kernel2='Epanechnikov')

mse(ytest,yhat) #To test accuracy.

20.222983884293082

In [33]:
x = housing.drop('river',axis=1).loc[:,'crime':'lstat'].values
y = housing['cmedv'].values

xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123) #Creating test and train data.
#Scaling Data
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

#Testing the boosted function on the housing dataset using a Quartic kernel.
yhat = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Quartic', kernel2='Quartic')

mse(ytest,yhat) #To test accuracy.

19.45746532573222

##Question 3

Use the complete K-Fold crossvalidations to compare with other regressors, such as RandomForest.

In [37]:
#Testing using cars dataset.

x = cars.loc[:,'CYL':'WGT'].values
y = cars['MPG'].values

mse_lwr = []
mse_rf = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = RandomForestRegressor(n_estimators=200,max_depth=5)

#Test Train Split
for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  #Locally Weighted Regression
  yhat_lw = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Quartic', kernel2='Quartic')
  
  #Random Forest
  model_rf.fit(xtrain,ytrain)
  yhat_rf = model_rf.predict(xtest)

  mse_lwr.append(mse(ytest,yhat_lw)) #To test accuracy.
  mse_rf.append(mse(ytest,yhat_rf)) #To test accuracy.
print('The Cross-validated Mean Squared Error for Locally Weighted Regression using a Quartic kernel is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for Random Forest is : '+str(np.mean(mse_rf)))

The Cross-validated Mean Squared Error for Locally Weighted Regression using a Quartic kernel is : 15.99975674599304
The Cross-validated Mean Squared Error for Random Forest is : 17.17940703572381


The Locally Weighted Regression seems to be more accurate then the Random Forest for the cars dataset.

In [38]:
#Testing using housing dataset.

x = housing.drop('river',axis=1).loc[:,'crime':'lstat'].values
y = housing['cmedv'].values

mse_lwr = []
mse_rf = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = RandomForestRegressor(n_estimators=200,max_depth=5)

#Test Train Split
for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  #Locally Weighted Regression
  yhat_lw = boosted_lwr(xtrain, ytrain, xtest, f1=1/3, f2=1/3, iter1=1, iter2=1, kernel1='Quartic', kernel2='Quartic')
  
  #Random Forest
  model_rf.fit(xtrain,ytrain)
  yhat_rf = model_rf.predict(xtest)

  mse_lwr.append(mse(ytest,yhat_lw)) #To test accuracy.
  mse_rf.append(mse(ytest,yhat_rf)) #To test accuracy.
print('The Cross-validated Mean Squared Error for Locally Weighted Regression using a Quartic kernel is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for Random Forest is : '+str(np.mean(mse_rf)))

The Cross-validated Mean Squared Error for Locally Weighted Regression using a Quartic kernel is : 17.666857991099906
The Cross-validated Mean Squared Error for Random Forest is : 14.244387713106562


The Locally Weighted Regression seems to be less accurate then the Random Forest for the housing dataset.