# Import Libraries

In [1]:
from pandas import read_csv
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Define Model functions

## Linear Regression (Ridge)

In [2]:
def LR(X,y):
    """Grid-search Ridge regression hyperparameters and print the best result.

    Every combination in the search space is scored with seeded, repeated
    10-fold cross-validation (3 repeats) using ``neg_mean_absolute_error``,
    i.e. the negated MAE, so scores closer to zero are better.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``GridSearchCV.fit``.
    y : array-like
        Target values, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    None
        The best score and best hyperparameters are printed, not returned.
    """
    # define model; seeded so the stochastic 'sag' solver gives repeatable fits
    model = Ridge(random_state=1)

    # define evaluation
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # define search space
    space = {
        'solver': ['svd', 'cholesky', 'lsqr', 'sag'],
        'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
        'fit_intercept': [True, False],
    }
    # `normalize` was removed from Ridge in scikit-learn 1.2; searching over it
    # there makes GridSearchCV reject the grid as an invalid parameter. Only
    # search it when the installed estimator still exposes it. On newer
    # versions, scale the features before calling this function instead.
    if 'normalize' in model.get_params():
        space['normalize'] = [True, False]

    # define search (n_jobs=-1 uses every available core)
    search = GridSearchCV(model, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)

    # execute search
    result = search.fit(X, y)

    # summarize result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)

## Decision Tree

In [3]:
def DT(X,y):
    """Grid-search decision-tree regressor hyperparameters and print the best result.

    Every combination in the search space is scored with seeded, repeated
    10-fold cross-validation (3 repeats) using ``neg_mean_absolute_error``,
    i.e. the negated MAE, so scores closer to zero are better.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``GridSearchCV.fit``.
    y : array-like
        Target values, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    None
        The best score and best hyperparameters are printed, not returned.
    """
    # define model; seeded because scikit-learn permutes the features at each
    # split, so ties between equally good splits could otherwise resolve
    # differently from run to run even though the CV splitter is seeded
    model = DecisionTreeRegressor(random_state=1)

    # define evaluation
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # define search space
    space = {
        'max_depth': list(range(5, 31, 5)),  # 5, 10, 15, 20, 25, 30
        'min_samples_split': [2, 5, 10, 15, 100],
        'min_samples_leaf': [1, 2, 5, 10],
    }

    # define search (n_jobs=-1 uses every available core)
    search = GridSearchCV(model, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)

    # execute search
    result = search.fit(X, y)

    # summarize result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)

# Call Model functions

## Scaled Dataset

In [5]:
# Read the scaled dataset; row 0 of the CSV supplies the column names.
url = 'scaled_data.csv'
dataframe = read_csv(url, header=0)

# Drop down to a plain array, then separate features from the target:
# every column except the last goes to X, the last column becomes y.
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

In [6]:
# Tune Ridge regression on the current X, y (the scaled dataset).
LR(X, y)

Best Score: -1037.2325191357693
Best Hyperparameters: {'alpha': 0.1, 'fit_intercept': True, 'normalize': True, 'solver': 'lsqr'}


In [7]:
# Tune the decision tree on the current X, y (the scaled dataset).
DT(X, y)

Best Score: -1063.5192792592593
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 100}


## Unscaled Dataset

In [11]:
# Read the unscaled dataset; row 0 of the CSV supplies the column names.
url = 'unscaled_data.csv'
dataframe = read_csv(url, header=0)

# Drop down to a plain array, then separate features from the target:
# every column except the last goes to X, the last column becomes y.
# NOTE: this rebinds the X and y that held the scaled data.
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

In [12]:
# Tune Ridge regression on the current X, y (the unscaled dataset).
LR(X, y)

Best Score: -1037.2325191357693
Best Hyperparameters: {'alpha': 0.1, 'fit_intercept': True, 'normalize': True, 'solver': 'lsqr'}


In [13]:
# Tune the decision tree on the current X, y (the unscaled dataset).
DT(X, y)

Best Score: -1063.5192792592593
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 100}


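* Linear Regression (Ridge) performs better than the Decision Tree: its best cross-validated score (negative MAE) is about -1037.2 versus about -1063.5 for the tree, i.e. a lower mean absolute error. Both models score identically on the scaled and unscaled datasets.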