# Training LinearRegressionGD on the Breast Cancer dataset

The breast cancer dataset is imported from sklearn. It contains 569 samples. A summary of the data can be found below.

We want to compare model performance between the Linear Regression model and the Linear Regression GD model.

In [12]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

sc_y = StandardScaler()
sc.fit(y_train[:, np.newaxis])
y_train_std = sc.transform(y_train[:, np.newaxis]).flatten()
y_test_std = sc.transform(y_test[:, np.newaxis]).flatten()

##################

class LinearRegressionGD(object):

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (errors**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return self.net_input(X)
    
est = LinearRegressionGD()
est.fit(X_train_std, y_train_std)

##################

y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train_std, y_train_pred),
        mean_squared_error(y_test_std, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train_std, y_train_pred),
        r2_score(y_test_std, y_test_pred)))

MSE train: 8633007522077303898112.000, test: 7065331788890949812224.000
R^2 train: -8633007522077300752384.000, test: -7359732204517148590080.000


In [13]:
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [7]:
X_train_std

array([[ 2.45523767,  0.56480743,  2.3364474 , ...,  1.7504593 ,
         1.23570672,  0.4469218 ],
       [ 0.44175802,  0.98434205,  0.43397348, ..., -0.0638393 ,
        -0.10977641, -1.25478688],
       [ 0.1839675 ,  0.67243673,  0.37108178, ...,  1.94120463,
        -0.01145264,  1.83352098],
       ...,
       [ 0.80809192,  1.10954347,  0.84670026, ...,  1.08950455,
         0.75961269,  2.91322579],
       [-0.05211435,  0.39128265, -0.11279124, ..., -0.54099836,
        -1.1154388 , -0.47434807],
       [-0.32347279, -0.23033147, -0.39501774, ..., -0.97719892,
        -1.52770864, -1.32402881]])

In [9]:
y_train_std

array([-1.25830574, -1.25830574, -1.25830574,  0.79471941,  0.79471941,
       -1.25830574,  0.79471941, -1.25830574, -1.25830574, -1.25830574,
        0.79471941, -1.25830574,  0.79471941,  0.79471941,  0.79471941,
        0.79471941,  0.79471941, -1.25830574,  0.79471941,  0.79471941,
       -1.25830574, -1.25830574, -1.25830574, -1.25830574,  0.79471941,
       -1.25830574,  0.79471941,  0.79471941, -1.25830574,  0.79471941,
       -1.25830574,  0.79471941,  0.79471941,  0.79471941, -1.25830574,
        0.79471941,  0.79471941,  0.79471941, -1.25830574,  0.79471941,
        0.79471941,  0.79471941,  0.79471941, -1.25830574,  0.79471941,
        0.79471941, -1.25830574, -1.25830574, -1.25830574,  0.79471941,
        0.79471941, -1.25830574,  0.79471941,  0.79471941,  0.79471941,
        0.79471941,  0.79471941, -1.25830574,  0.79471941,  0.79471941,
        0.79471941, -1.25830574,  0.79471941, -1.25830574, -1.25830574,
       -1.25830574, -1.25830574,  0.79471941, -1.25830574, -1.25

# Training sklearn's LinearRegression on the Breast Cancer dataset

In [16]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

sc_y = StandardScaler()
sc.fit(y_train[:, np.newaxis])
y_train_std = sc.transform(y_train[:, np.newaxis]).flatten()
y_test_std = sc.transform(y_test[:, np.newaxis]).flatten()

##################

from sklearn.linear_model import LinearRegression

est = LinearRegression()
est.fit(X_train_std, y_train_std)

##################

y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train_std, y_train_pred),
        mean_squared_error(y_test_std, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train_std, y_train_pred),
        r2_score(y_test_std, y_test_pred)))

MSE train: 0.209, test: 0.277
R^2 train: 0.791, test: 0.712


Using Linear Regression gives higher $R^2$ values for both the training and testing sets than using Linear Regression GD model.

The MSE is significantly smaller when using Linear Regression than when using Linear Regression GD model.

This suggests that fitting the data with Linear Regression is better than using Linear Regression GD.