In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math

In [153]:
# This class manipulates data from a csv file
class ManipulateData:

    #initialize file
    def __init__(self, file):
        self.file = file

    #Reads the supplied file and shuffles the data
    def filter_data(self):
        data = pd.read_csv(self.file).dropna()
        data = data.sample(frac = 1, random_state=30) #shuffling and ensuring everytime output is same shuffled data
        return data

    #Filter with respect to column passed as an argument 
    def seperate_columns(self, data, columns):
        data = data[columns]
        return data

    #splits data to train and test data and returns train and test data as numpy array
    #only training percentage is passed as an argument 
    def split_data(self, data, percent):
        data_array = data.to_numpy()
        train_size = math.floor(len(data_array)*percent) #takes integer as training size
        #training and testing data are seperated as two numpy arrays inside tuple
        train_data, test_data = data_array[:train_size,:], data_array[train_size:,:] 

        
        train_length = train_data.shape[0]
        test_length = test_data.shape[0]
        print(f"total training rows: {train_length}\ntotal testing rows: {test_length}")
        
        return train_data, test_data

    #seperate features and target
    def seperate_target(self, data):
        rows = data.shape[0]
        features = []
        target = []
        for row in range(rows):
            features.append(data[row, :-1])
            target.append(data[row, -1])
        return np.array(features), np.array(target)

    # scale features using mean normalization and returns numpy array
    def scale_features(self, data):
        # calculate mean, minimum, and maximum for each column of the features
        feature_mean = np.mean(data, axis=0) #axis 0 refers to the column and axis 1 for row
        feature_max = np.max(data, axis=0)
        feature_min = np.min(data, axis=0)

        scaled_data = []

        # normalize and append each rows of features to new array 
        for row in data:
            scaled_row = []
            for i in range(len(row)):
                scaled_value = (row[i] - feature_mean[i]) / (feature_max[i] - feature_min[i])
                scaled_row.append(scaled_value)
            scaled_data.append(scaled_row)

        return np.array(scaled_data)
    

In [3]:
class MSE:
    # calculating cost using vectorization('np.dot()')
    def calculate_cost(self, X_train, Y_train, th0, thj):
            
            m = X_train.shape[0]

            # use vectorization to find dot product of two matrices (features and weights)
            y_predict = th0 + np.dot(X_train, thj)
        
            sq_difference = (y_predict - Y_train) ** 2
            total_cost = np.sum(sq_difference) / (2*m)
        
            return total_cost

In [108]:
# This class has all the methods required for gradient descent algorithm to work
# Pass hyperparameters while creating object
class LinearRegression(MSE):
    def __init__(self, alpha, iterations):
        super().__init__()
        self.alpha = alpha
        self.iterations = iterations

    #this method computes gradient using vectorization and utilizes the broadcasting power of numpy
    def calculate_gradient(self, X_train, Y_train, th0, thj):
        m = X_train.shape[0]
        
        y_predict = th0 + np.dot(X_train, thj) #get numpy array containing predicted y of each row

        difference = y_predict - Y_train # subtract actual y (broadcasting is used to do matrix manipulation)
        
        gradient_th0 = np.sum(difference) / m #difference is an array of differences so we need to sum all and divide my number of rows

        #vectorization used to find dot product and gets matrix with the same shape of weight
        #broadcasting is utilized on difference and m
        gradient_thj = np.dot(difference, X_train) / m 

        return gradient_th0, gradient_thj

    #same as the calculate_gradient function but uses regularization 
    def calculate_regularized_gradient(self, X_train, Y_train, th0, thj, lam):
        m = X_train.shape[0]
        
        y_predict = th0 + np.dot(X_train, thj)

        difference = y_predict - Y_train
        
        gradient_th0 = np.sum(difference) / m
        gradient_thj = np.dot(difference, X_train) / m

        # regularization
        gradient_thj += (lam / m) * thj

        return gradient_th0, gradient_thj

    # utilizes gradient descent formula  to get optimal weights and bias
    def gradient_descent(self, X_train, Y_train, th0, thj):
        
        for i in range(self.iterations):
            gradients = self.calculate_gradient(X_train, Y_train, th0, thj)
            th0 = th0 - (self.alpha * gradients[0])
            thj = thj - (self.alpha * gradients[1])

            # check cost over iteration
            # if i%10 == 0 or i == self.iterations - 1:
            #     current_cost = self.calculate_cost(X_train, Y_train, th0, thj)
            #     print(f"Iteration: {i}\t Cost: {current_cost:.3f}\t Bias: {th0:.3f}\t Weight: {np.round(thj, 3)}")
            
        return th0, thj

    #same as gradient_descent but calls calls calculate_regularized_gradient method
    def regularized_gradient_descent(self, X_train, Y_train, th0, thj, lam):
        
        for i in range(self.iterations):
            gradients = self.calculate_regularized_gradient(X_train, Y_train, th0, thj, lam)
            th0 = th0 - (self.alpha * gradients[0])
            thj = thj - (self.alpha * gradients[1])

            if i%10 == 0 or i == self.iterations - 1:
                current_cost = self.calculate_cost(X_train, Y_train, th0, thj)
                print(f"Iteration: {i}\t Cost: {current_cost:.3f}\t Bias: {th0:.3f}\t Weight: {np.round(thj, 3)}")
            
        return th0, thj
            
        

In [5]:
# Read file and clear out null values
file = 'Cancer_dataset.csv'

df = ManipulateData(file)
data = df.filter_data() #cleans out rows containing null values and shuffles the data

## Question 1

In [19]:
#first seperate columns by passing column names and after required columns are obtained from the data we split 80% of data as training data
#index of the columns defined will be the same index on the array as well
columns = ['mean_texture', 'tumor_size'] #the final column will be the target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)

# now seperate features and target as two seperate arrays with each features index corresponding to target index
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [79]:
#initialize weight, biases and hyperparameters
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50

#create object for linear regression class
fit = LinearRegression(learning_rate, iterations)

# compute final weights and biases using gradient descent
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 4.822	 Bias: 0.000	 Weight: [0.005]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12]


final bias = 0.0067610733604299865 	 final weights = [0.11953371]


##### Notice here: cost decreased over time and got stabalized

In [80]:
# Check mean squared error for both train and test
mse = MSE()

# calculate mean squared error using final predicted weights and bias
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.638982284944249
cost for test data = 3.5109876086243226


From mean squared error function, we can observe that the cost is low on train data and higher on the test data 
which means it performs well on the training data but struggles to perform well on unseen data. 
So there may be problem of overfitting.

## Question 2

Same pattern of code is used on this section as well

In [24]:
columns = ['mean_texture', 'lymph_node_status', 'tumor_size'] #the final column will be your target column when you use 'seperate_target(data) method'
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [77]:
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 3.155	 Bias: 0.003	 Weight: [0.047]
Iteration: 10	 Cost: 1.497	 Bias: 0.008	 Weight: [0.147]
Iteration: 20	 Cost: 1.496	 Bias: 0.009	 Weight: [0.149]
Iteration: 30	 Cost: 1.496	 Bias: 0.009	 Weight: [0.149]
Iteration: 40	 Cost: 1.496	 Bias: 0.009	 Weight: [0.149]
Iteration: 49	 Cost: 1.496	 Bias: 0.010	 Weight: [0.149]


final bias = 0.00969690009063465 	 final weights = [0.14947094]


In [78]:
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.4961777208903009
cost for test data = 2.918009092955125


## Question 3

Same pattern of code is used throughout this with change in column array section. Calculation of BIC is additional part.

### a) Forward Stepwise Regression

For the forward stepwise regression model I have choosen 5 of the random features. These features are: 
'mean_radius', 'mean_smoothness', 'mean_symmetry', 'mean_fractal_dimension', 'lymph_node_status'

BIC= ln(n)k – 2 ln(L) <br>
$k$ is the number of parameters in the model.<br>
$n$ is the number of data points.<br>
$L$ is the likelihood of the model.<br>


In [36]:
# Function to calculate BIC value
def calculate_bic(k, n, mse):
    ln_L = - (n/2) * (np.log(2 * np.pi * mse) + 1) # log likelihood equation
    bic = (np.log(n) * k) - (2 * ln_L) # BIC equation
    return bic

#### Step 1

In [37]:
columns = ['tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


Now calculating BIC for bias only. 
To calculate the best bias for the model we need a bias that minimizes the cost. So the best bias would be the average value of the target values.
Then calculate the MSE for the bias.

In [38]:
k_base = 1
n_base = target.shape[0]
bias_base = np.mean(target)
mn_sq_err = np.mean((bias_base - target) ** 2)
bic_base = calculate_bic(k_base, n_base, mn_sq_err)
print(bic_base)

579.6747148035079


#### Step 2

Now adding each of the five features to formulate five separate univariate models, and calculate AIC for each of the five models.

In [40]:
k = 2

In [76]:
columns = ['mean_radius', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")


n = target.shape[0]
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.822	 Bias: 0.000	 Weight: [0.005]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12]


final bias = 0.0067610733604299865 	 final weights = [0.11953371]
cost for train data = 1.638982284944249
cost for test data = 3.5109876086243226
BIC: 512.9935233150557


In [44]:
columns = ['mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")


n = target.shape[0]
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 5.041	 Bias: 0.000	 Weight: [0.]
Iteration: 10	 Cost: 5.034	 Bias: 0.003	 Weight: [0.]
Iteration: 20	 Cost: 5.027	 Bias: 0.006	 Weight: [0.001]
Iteration: 30	 Cost: 5.020	 Bias: 0.008	 Weight: [0.001]
Iteration: 40	 Cost: 5.013	 Bias: 0.011	 Weight: [0.001]
Iteration: 49	 Cost: 5.007	 Bias: 0.013	 Weight: [0.001]


final bias = 0.013198331688692522 	 final weights = [0.0013406]
cost for train data = 5.006975485591109
cost for test data = 8.675776732462515
BIC: 655.0162875239262


In [45]:
columns = ['mean_symmetry', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")


n = target.shape[0]
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 5.041	 Bias: 0.000	 Weight: [0.]
Iteration: 10	 Cost: 5.034	 Bias: 0.003	 Weight: [0.001]
Iteration: 20	 Cost: 5.027	 Bias: 0.006	 Weight: [0.001]
Iteration: 30	 Cost: 5.020	 Bias: 0.008	 Weight: [0.002]
Iteration: 40	 Cost: 5.013	 Bias: 0.011	 Weight: [0.002]
Iteration: 49	 Cost: 5.006	 Bias: 0.013	 Weight: [0.002]


final bias = 0.013197496381459216 	 final weights = [0.00248787]
cost for train data = 5.006101518706459
cost for test data = 8.674570800679586
BIC: 654.9909755846408


In [232]:
columns = ['mean_fractal_dimension', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")


n = target.shape[0]
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")



final bias = 0.014399037919676063 	 final weights = [0.00089368]
cost for train data = 5.921272362800858
cost for test data = 5.0772145374047986
BIC: 679.335588147148


In [233]:
columns = ['lymph_node_status', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")


n = target.shape[0]
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")



final bias = 0.013937334816510107 	 final weights = [0.06105735]
cost for train data = 5.1777714962668915
cost for test data = 4.154766773645429
BIC: 659.8799809551556


Here, while calculating BIC for each univariate model, we can see that the minimum BIC which is less than the base BIC is obtained while adding 'mean_radius' feature. So I am going to go with teh 'mean_radius' feature for the next step.

#### Step 3

In [46]:
k = 3
n = target.shape[0]

Now we are left with 4 features ('mean_smoothness', 'mean_symmetry', 'mean_fractal_dimension', 'lymph_node_status') which we need to calculate the BIC with Bivariate model.

In [48]:
columns = ['mean_radius', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)
# print(features)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.822	 Bias: 0.000	 Weight: [0.005 0.   ]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045 0.   ]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073 0.   ]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094 0.001]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109 0.001]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12  0.001]


final bias = 0.0067609394281968075 	 final weights = [0.11953144 0.00067867]
cost for train data = 1.6389675433288369
cost for test data = 3.5109416088152496
BIC: 498.0620178980737


In [50]:
columns = ['mean_radius', 'mean_symmetry', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.821	 Bias: 0.000	 Weight: [0.005 0.   ]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045 0.   ]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073 0.001]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094 0.001]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109 0.001]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12  0.001]


final bias = 0.006760613049397101 	 final weights = [0.11952585 0.00124699]
cost for train data = 1.6389380653684633
cost for test data = 3.5108316988249233
BIC: 498.05940994972826


In [52]:
columns = ['mean_radius', 'mean_fractal_dimension', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.822	 Bias: 0.000	 Weight: [0.005 0.   ]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045 0.   ]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073 0.   ]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094 0.   ]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109 0.   ]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12 0.  ]


final bias = 0.006761023110009425 	 final weights = [0.11953287 0.00041581]
cost for train data = 1.638976519949763
cost for test data = 3.510970270699966
BIC: 498.0628120605177


In [53]:
columns = ['mean_radius', 'lymph_node_status', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01 ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033]


final bias = 0.006614204030448652 	 final weights = [0.11702631 0.03309335]
cost for train data = 1.5064580587634933
cost for test data = 3.07321510850442
BIC: 485.8377555141008


So for the bivariate model, BIC value of only model with lymph_node_status is lower than the model obtained from univariate model. So the two of the selected features are 'mean_radius' and 'lymph_node_status'.

Now, continuing same process with three features

#### Step 4

In [246]:
k = 4
n = target.shape[0]

In [55]:
columns = ['mean_radius', 'lymph_node_status', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.   ]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.001]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.001]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.001]


final bias = 0.0066140734016327096 	 final weights = [0.11702409 0.03309304 0.00066327]
cost for train data = 1.5064462224832875
cost for test data = 3.073178356363084
BIC: 485.83661624084846


In [58]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.001]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.001]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.001]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.001]


final bias = 0.0066137549448889105 	 final weights = [0.11701864 0.03309229 0.00121891]
cost for train data = 1.5064232262894444
cost for test data = 3.0730900774639256
BIC: 485.8344027708232


In [61]:
columns = ['mean_radius', 'lymph_node_status', 'mean_fractal_dimension', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.   ]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.   ]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.   ]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.   ]


final bias = 0.006614155009020355 	 final weights = [0.11702549 0.03309323 0.00040645]
cost for train data = 1.5064533798577608
cost for test data = 3.073201238788571
BIC: 485.8373051581279


Here the minimum BIC value less than the previous model is obtained when we adding 'mean_symmetry' feature. 

#### Step 5

Backward Step

In [64]:
k=3

In [63]:
columns = ['mean_symmetry', 'lymph_node_status', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 5.031	 Bias: 0.000	 Weight: [0.    0.001]
Iteration: 10	 Cost: 4.920	 Bias: 0.003	 Weight: [0.001 0.011]
Iteration: 20	 Cost: 4.815	 Bias: 0.005	 Weight: [0.001 0.021]
Iteration: 30	 Cost: 4.717	 Bias: 0.008	 Weight: [0.002 0.031]
Iteration: 40	 Cost: 4.624	 Bias: 0.011	 Weight: [0.002 0.04 ]
Iteration: 49	 Cost: 4.544	 Bias: 0.013	 Weight: [0.002 0.048]


final bias = 0.012896078527013535 	 final weights = [0.00243006 0.04812692]
cost for train data = 4.544319897648481
cost for test data = 7.398693300108156
BIC: 645.9346973045923


#### Step 6

Forward

In [250]:
k = 5
n = target.shape[0]

In [66]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.    0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.    0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.001 0.   ]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.001 0.001]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.001 0.001]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.001 0.001]


final bias = 0.006613624329984617 	 final weights = [0.11701643 0.03309197 0.00121888 0.00066322]
cost for train data = 1.5064113955501581
cost for test data = 3.073053335118395
BIC: 485.83326400457037


In [67]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_fractal_dimension', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.    0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.    0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.001 0.   ]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.001 0.   ]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.001 0.   ]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.001 0.   ]


final bias = 0.006613705928679848 	 final weights = [0.11701782 0.03309217 0.0012189  0.00040642]
cost for train data = 1.506418549507003
cost for test data = 3.0730762114448127
BIC: 485.8339526088138


Since the BIC after adding 'mean_smoothness' is lower than the previous selected model, we choose this feature.

#### Step 7

In [68]:
k=4

In [70]:
# Remove mean_radius
columns = ['lymph_node_status', 'mean_symmetry', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 5.031	 Bias: 0.000	 Weight: [0.001 0.    0.   ]
Iteration: 10	 Cost: 4.920	 Bias: 0.003	 Weight: [0.011 0.001 0.   ]
Iteration: 20	 Cost: 4.815	 Bias: 0.005	 Weight: [0.021 0.001 0.001]
Iteration: 30	 Cost: 4.717	 Bias: 0.008	 Weight: [0.031 0.002 0.001]
Iteration: 40	 Cost: 4.623	 Bias: 0.011	 Weight: [0.04  0.002 0.001]
Iteration: 49	 Cost: 4.544	 Bias: 0.013	 Weight: [0.048 0.002 0.001]


final bias = 0.01289574849859015 	 final weights = [0.04812612 0.00243    0.00130909]
cost for train data = 4.543994208847065
cost for test data = 7.398279228794519
BIC: 650.9010386077392


In [71]:
# Remove lymph_node_status
columns = ['mean_radius', 'mean_symmetry', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.821	 Bias: 0.000	 Weight: [0.005 0.    0.   ]
Iteration: 10	 Cost: 3.245	 Bias: 0.002	 Weight: [0.045 0.    0.   ]
Iteration: 20	 Cost: 2.416	 Bias: 0.004	 Weight: [0.073 0.001 0.   ]
Iteration: 30	 Cost: 1.980	 Bias: 0.005	 Weight: [0.094 0.001 0.001]
Iteration: 40	 Cost: 1.751	 Bias: 0.006	 Weight: [0.109 0.001 0.001]
Iteration: 49	 Cost: 1.639	 Bias: 0.007	 Weight: [0.12  0.001 0.001]


final bias = 0.006760479131436941 	 final weights = [0.11952358 0.00124696 0.00067862]
cost for train data = 1.6389233300188395
cost for test data = 3.5107857106716214
BIC: 503.03484002159195


#### Step 8

In [72]:
k = 6

In [74]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_fractal_dimension', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 4.811	 Bias: 0.000	 Weight: [0.005 0.001 0.    0.    0.   ]
Iteration: 10	 Cost: 3.168	 Bias: 0.002	 Weight: [0.044 0.01  0.    0.    0.   ]
Iteration: 20	 Cost: 2.311	 Bias: 0.004	 Weight: [0.073 0.018 0.001 0.    0.   ]
Iteration: 30	 Cost: 1.861	 Bias: 0.005	 Weight: [0.093 0.024 0.001 0.    0.001]
Iteration: 40	 Cost: 1.624	 Bias: 0.006	 Weight: [0.108 0.029 0.001 0.    0.001]
Iteration: 49	 Cost: 1.506	 Bias: 0.007	 Weight: [0.117 0.033 0.001 0.    0.001]


final bias = 0.006613575315281914 	 final weights = [0.1170156  0.03309186 0.00121887 0.00040641 0.00066321]
cost for train data = 1.5064067194021196
cost for test data = 3.0730394701643076
BIC: 500.763015127352


Finally, BIC value is minimum when I add 'mean_smoothness' to the model. Therefore, from the observations, we can finalize 'mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_fractal_dimension', 'mean_smoothness' (all five)  features to be included in the model.

### b) Backward Stepwise Regression

#### Step 1

In [81]:
k = 11
n = target.shape[0]

In [82]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411901.299	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 358163004635974231157546370233979150079038115930125107200.000	 Bias: 7008123840025124143104.000	 Weight: [1.31132009e+23 1.34992972e+21 8.66436995e+23 4.34734760e+20
 2.22619822e+21 1.74267572e+22 7.85230370e+24 7.18762418e+20
 1.61275621e+23 1.18438096e+25]
Iteration: 20	 Cost: 311436906289649647846656280641565230667708186407833998678382604213086228403616667550779492492289552995581952.000	 Bias: 206655250340012240845604031040136366870562340864.000	 Weight: [3.86681498e+48 3.98066687e+46 2.55494563e+49 1.28194396e+46
 6.56460362e+46 5.13879456e+47 2.31548389e+50 2.11948349e+46
 4.75568847e+48 3.49249743e+50]
Iteration: 30	 Cost: 270806714662919584272290414001610179933222128199335362634541159018986516753910849746222893127534662767986905464477790372746129277007601962985745957650004180992.000	 Bias: 60938410

#### Step 2 

Remove every features one by one

In [88]:
k = 10
n = target.shape[0]

remove 'worst_area'

In [89]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 40268.383	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.001 0.271 0.    0.006]
Iteration: 10	 Cost: 2322088968365462571345059694703706979432398848.000	 Bias: 57961819914774176.000	 Weight: [1.08156641e+18 1.11663670e+16 7.14676437e+18 3.59527582e+15
 1.83405617e+16 1.44505155e+17 6.45135564e+19 5.94537784e+15
 1.31934928e+18]
Iteration: 20	 Cost: 133909468181229493491554658092511179627971348568617030199899975059344100461544671805440.000	 Bias: 13919006197636213086345304467912523776.000	 Weight: [2.59728379e+38 2.68150193e+36 1.71623075e+39 8.63372933e+35
 4.40431981e+36 3.47016044e+37 1.54923464e+40 1.42772865e+36
 3.16829782e+38]
Iteration: 30	 Cost: 7722247473231850388652225323674264344032160803883295065248029614376161894017243753305766445614293687399277732798764942553513984.000	 Bias: 3342523299211526557115237785041304185209301894169783959552.000	 Weight: [6.23714184e+58 6.43938407e+56 4.12137273e+59 2.0733

Remove worst_radius

In [90]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411790.033	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.001 0.271 0.    0.4  ]
Iteration: 10	 Cost: 357145988391351211194374948810679528630457631143977025536.000	 Bias: 6998996501151393775616.000	 Weight: [1.30961823e+23 1.34817133e+21 8.65312586e+23 4.34168060e+20
 2.22329643e+21 1.74040684e+22 7.84214618e+24 7.17826014e+20
 1.18285050e+25]
Iteration: 20	 Cost: 309754418984580381080509196766267391416921035924513968632934981543507597014675471860924652269321970264309760.000	 Bias: 206120716588349313994777937724675946943940132864.000	 Weight: [3.85683073e+48 3.97036976e+46 2.54834890e+49 1.27862661e+46
 6.54761656e+46 5.12550485e+47 2.30951507e+50 2.11400038e+46
 3.48349927e+50]
Iteration: 30	 Cost: 268651484824569006789521256633946083594538271743287059658048913290687572355580408197144634623003063763892974841040289921642409784516287881417140905988822401024.000	 Bias: 60702630441242836350334563655172757339228713

Remove mean_smoothness

In [92]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411901.296	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.001 0.271 0.006 0.4  ]
Iteration: 10	 Cost: 358162984317082886684449182088679903552362858222551302144.000	 Bias: 7008123655967792431104.000	 Weight: [1.31132006e+23 1.34992968e+21 8.66436972e+23 4.34734748e+20
 2.22619817e+21 1.74267567e+22 7.85230350e+24 1.61275616e+23
 1.18438093e+25]
Iteration: 20	 Cost: 311436872703654603036370220419455243453343086506299971444999709173206776869674475975863883623523230072963072.000	 Bias: 206655239631338110729780130580485796376394858496.000	 Weight: [3.86681478e+48 3.98066666e+46 2.55494550e+49 1.28194390e+46
 6.56460328e+46 5.13879429e+47 2.31548377e+50 4.75568823e+48
 3.49249725e+50]
Iteration: 30	 Cost: 270806671617310812150092738090379802366257536723896011234602131812131442895540603503561265812519258595348144650130445175295424423394573389986758378472232452096.000	 Bias: 60938405432842243552080247095993088052637922

Remove 'mean_area'

In [93]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 196850.978	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.001 0.    0.006 0.4  ]
Iteration: 10	 Cost: 123832179954218214948553153565079028441703519514263552.000	 Bias: 185176746800245964800.000	 Weight: [3.46852724e+21 3.56675974e+19 2.29171014e+22 1.14880877e+19
 5.89289548e+19 4.59934443e+20 1.89913330e+19 4.28132231e+21
 3.16138261e+23]
Iteration: 20	 Cost: 77899248368159981563328663967172293709922578006678570781810422155435552988600877391235678079536857088.000	 Bias: 146871092227036671463786294941873198111653888.000	 Weight: [2.75102783e+45 2.82893996e+43 1.81764707e+46 9.11166232e+42
 4.67389136e+43 3.64792422e+44 1.50627866e+43 3.39568814e+45
 2.50741913e+47]
Iteration: 30	 Cost: 49004167564260896570779309761548024453257807427309546067128756407517024932059646048042924073250519335083829358515782200411338191963856106464584138752.000	 Bias: 11648934385499195463217631512790703449326409425659769028711975118438

Remove 'lymph_node_status'

In [101]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
# print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411899.141	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.    0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 358150365749550062805245254784152339798878676962806595584.000	 Bias: 7008008868668336242688.000	 Weight: [1.31129874e+23 1.34990758e+21 8.66422889e+23 4.34727612e+20
 2.22616172e+21 7.85217668e+24 7.18750607e+20 1.61272999e+23
 1.18436186e+25]
Iteration: 20	 Cost: 311416559545977514027517481175164361293059981631245494765829141553717863448245881587390117236557562412793856.000	 Bias: 206648755638639794717951063309425792740340269056.000	 Weight: [3.86669391e+48 3.98054179e+46 2.55486566e+49 1.28190363e+46
 6.56439735e+46 2.31541165e+50 2.11941682e+46 4.75553973e+48
 3.49238862e+50]
Iteration: 30	 Cost: 270780886560005468786749009442271581238067079739377054573480308663434709719425561325101454206882033829182762086334132061555170382905612267434492451786772709376.000	 Bias: 60935579573706846995212691785928453284038843

Remove 'worst_symmetry'

In [102]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411901.276	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 358162809799939924579413614825492230876958266545657610240.000	 Bias: 7008122074140110749696.000	 Weight: [1.31131977e+23 1.34992938e+21 8.66436778e+23 4.34734650e+20
 1.74267528e+22 7.85230176e+24 7.18762237e+20 1.61275580e+23
 1.18438067e+25]
Iteration: 20	 Cost: 311436584170670806180494044974936626564104495090847562249687231001182529523134117344963859498861168682860544.000	 Bias: 206655147604848574325311713102374468052274118656.000	 Weight: [3.86681306e+48 3.98066489e+46 2.55494437e+49 1.28194333e+46
 5.13879200e+47 2.31548275e+50 2.11948243e+46 4.75568612e+48
 3.49249571e+50]
Iteration: 30	 Cost: 270806301787931102496284041432710725368502689017742395408838124106748127170430155340458809499038472655410029555399838980597399423689075091208087202479948693504.000	 Bias: 60938364914001350615560113600917894122035118

Remove 'mean_fractal_dimension'

In [103]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411901.298	 Bias: 0.000	 Weight: [0.005 0.    0.031 0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 358162997196018913395093762649465047151831561289601122304.000	 Bias: 7008123772545292828672.000	 Weight: [1.31132008e+23 1.34992970e+21 8.66436987e+23 2.22619820e+21
 1.74267570e+22 7.85230363e+24 7.18762411e+20 1.61275619e+23
 1.18438095e+25]
Iteration: 20	 Cost: 311436893997080564251060942338216960437885975625612053888494614815593113954111438175480108823593755550416896.000	 Bias: 206655246418154448776135042959568359027770392576.000	 Weight: [3.86681490e+48 3.98066679e+46 2.55494559e+49 6.56460350e+46
 5.13879446e+47 2.31548385e+50 2.11948345e+46 4.75568838e+48
 3.49249736e+50]
Iteration: 30	 Cost: 270806698910511866981980051048162466941670318782182487944866329333171331849010299930272159158041642256882153368899776386786534819044449846674813014586960117760.000	 Bias: 60938408421741591873555388591883771438822846

Remove 'mean_perimeter'

In [104]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 408673.793	 Bias: 0.000	 Weight: [0.005 0.    0.    0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 329879158795556001934997933845100270689958504070855524352.000	 Bias: 6748502225097834102784.000	 Weight: [1.26291531e+23 1.29990930e+21 4.18614370e+20 2.14368364e+21
 1.67810692e+22 7.56344647e+24 6.92124770e+20 1.55328939e+23
 1.14090835e+25]
Iteration: 20	 Cost: 266277690589884122528999624278489365404779286955332266236677793086278616636620815263101772666650909640491008.000	 Bias: 191733211656985101491752391644517294488546181120.000	 Weight: [3.58809704e+48 3.69320149e+46 1.18933468e+46 6.09046770e+46
 4.76770725e+47 2.14886775e+50 1.96641122e+46 4.41308535e+48
 3.24146031e+50]
Iteration: 30	 Cost: 214938733216016905336625556534894462373428217563958708146492934558891954369744252566484625713534552211935679027915951954325958328609790393368865952181832908800.000	 Bias: 54473753176793703745963258939639283646541535

Remove 'mean_symmetry'

In [105]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411901.290	 Bias: 0.000	 Weight: [0.005 0.031 0.    0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 358162933047260465299382135939441078965866608793206390784.000	 Bias: 7008123191993210765312.000	 Weight: [1.31131997e+23 8.66436915e+23 4.34734719e+20 2.22619802e+21
 1.74267556e+22 7.85230299e+24 7.18762351e+20 1.61275606e+23
 1.18438086e+25]
Iteration: 20	 Cost: 311436787891989805216246545623829057317675811937519706457354174966281622236220279563954805634324553560752128.000	 Bias: 206655212602107417612568388893201163570437423104.000	 Weight: [3.86681427e+48 2.55494517e+49 1.28194373e+46 6.56460242e+46
 5.13879362e+47 2.31548347e+50 2.11948310e+46 4.75568761e+48
 3.49249680e+50]
Iteration: 30	 Cost: 270806562888185056758882783456980035011503454177046088097958387403655337794090282030043462131190414467328045809081473542665100347832128365278767394290355142656.000	 Bias: 60938393526549609113498611741482284485104262

Remove 'mean_radius'

In [106]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 411827.022	 Bias: 0.000	 Weight: [0.    0.031 0.    0.    0.001 0.271 0.    0.006 0.4  ]
Iteration: 10	 Cost: 357489705582407049827304728823051934279679614662983286784.000	 Bias: 7002074291010557444096.000	 Weight: [1.34876433e+21 8.65691902e+23 4.34359174e+20 2.22427571e+21
 1.74117087e+22 7.84557472e+24 7.18141797e+20 1.61137076e+23
 1.18336829e+25]
Iteration: 20	 Cost: 310323045945574171133538059788209009598386364319077278548578062726641551755472450790074462428910564967186432.000	 Bias: 206301297234652582814187992999758544049648173056.000	 Weight: [3.97384860e+46 2.55057794e+49 1.27974736e+46 6.55335755e+46
 5.12999141e+47 2.31153252e+50 2.11585279e+46 4.74756286e+48
 3.48654417e+50]
Iteration: 30	 Cost: 269379485174406033015927598205712162861932385885162939742784419331036844745694062201684590478726021979094936523810435953810325705699605764482406791175529824256.000	 Bias: 60782310315302325702199991723014632547113065

The least BIC value is given when we remove the 'worst_area'

#### Step 3

In [109]:
k = 9
n = target.shape[0]

Remove 'worst_radius'

In [110]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7301047166535294e+96 	 final weights = [-3.22841507e+97 -3.33305170e+95 -2.13326972e+98 -1.07315135e+95
 -5.47445345e+95 -4.31335123e+96 -1.92571648e+99 -1.77463605e+95]
cost for train data = 2.0672964872214308e+204
cost for test data = 2.0016565971607182e+204
BIC: 68672.05487685636


Remove 'mean_smoothness'

In [111]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.765875458054072e+96 	 final weights = [-3.29512010e+97 -3.40196590e+95 -2.17734637e+98 -1.09534334e+95
 -5.58766923e+95 -4.40252061e+96 -1.96548187e+99 -4.01955376e+97]
cost for train data = 2.1553359075360679e+204
cost for test data = 2.0868822783916424e+204
BIC: 68678.10208389643


Remove 'mean_area'

In [112]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0003528969713991515 	 final weights = [3.69147980e-03 8.72839265e-06 2.08468986e-02 1.41939336e-05
 3.31564123e-05 1.86006687e-02 1.81469889e-05 4.75777209e-03]
cost for train data = 1.4428057992878125
cost for test data = 2.7182554659265104
BIC: 509.4382832896177


Remove 'lymph_node_status'

In [113]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7654362813038402e+96 	 final weights = [-3.29430181e+97 -3.40111991e+95 -2.17680570e+98 -1.09507080e+95
 -5.58627955e+95 -1.96499443e+99 -1.81087910e+95 -4.01855580e+97]
cost for train data = 2.1542456655714977e+204
cost for test data = 2.0857952863243386e+204
BIC: 68678.02871942447


Remove 'worst_symmetry'

In [114]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7658691927232507e+96 	 final weights = [-3.29510843e+97 -3.40195382e+95 -2.17733866e+98 -1.09533945e+95
 -4.40250499e+96 -1.96547492e+99 -1.81132331e+95 -4.01953952e+97]
cost for train data = 2.1553203539967356e+204
cost for test data = 2.0868672052720978e+204
BIC: 68678.10103752991


Remove 'mean_fractal_dimension'

In [115]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7658759244873175e+96 	 final weights = [-3.29512097e+97 -3.40196680e+95 -2.17734694e+98 -5.58767071e+95
 -4.40252177e+96 -1.96548239e+99 -1.81133022e+95 -4.01955482e+97]
cost for train data = 2.1553370658905468e+204
cost for test data = 2.0868834002850455e+204
BIC: 68678.10216182459


Remove 'mean_perimeter'

In [116]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.765876193576873e+96 	 final weights = [-3.29512147e+97 -3.40196732e+95 -2.17734727e+98 -1.09534380e+95
 -5.58767156e+95 -4.40252244e+96 -1.96548269e+99 -1.81133049e+95
 -4.01955543e+97]
cost for train data = 2.155337733892198e+204
cost for test data = 2.0868840471823516e+204
BIC: 68678.1022067643


Remove mean_symmetry

In [117]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7658736000143315e+96 	 final weights = [-3.29511664e+97 -2.17734408e+98 -1.09534219e+95 -5.58766334e+95
 -4.40251598e+96 -1.96547981e+99 -1.81132783e+95 -4.01954954e+97]
cost for train data = 2.155331293081649e+204
cost for test data = 2.0868778076417684e+204
BIC: 68678.10177345916


Remove 'mean_radius'

In [118]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_area', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.7417548454283567e+96 	 final weights = [-3.35549679e+95 -2.14762618e+98 -1.08037920e+95 -5.51133269e+95
 -4.34238438e+96 -1.93867015e+99 -1.78658695e+95 -3.96468999e+97]
cost for train data = 2.095769947273318e+204
cost for test data = 2.0291805986731424e+204
BIC: 68674.03837443094


Removing 'mean_area' makes more sense on this step

#### Step 4
Forward Step (Add two features removed and compare with the 8 predictor model)

Add 'worst_area' and compare

In [119]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -2.985732947075039e+113 	 final weights = [-5.59254671e+114 -5.75093377e+112 -3.69508299e+115 -1.85230395e+112
 -9.50152355e+112 -7.41584158e+113 -3.06210416e+112 -6.90307249e+114
 -5.09731615e+116]
cost for train data = 3.219308076869949e+239
cost for test data = 2.65445156580725e+239
BIC: 80421.89831177816


In [120]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'mean_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -1.765876193576876e+96 	 final weights = [-3.29512147e+97 -3.40196732e+95 -2.17734727e+98 -1.09534380e+95
 -5.58767156e+95 -4.40252244e+96 -1.81133049e+95 -4.01955543e+97
 -1.96548269e+99]
cost for train data = 2.1553377338922056e+204
cost for test data = 2.086884047182359e+204
BIC: 68678.1022067643


Since adding the removed features does not improve the BIC score. We go on with the 8 predictor model from step 3 and move on to 5

#### Step 5

Remove 'worst_radius'

In [121]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.00035970859687476617 	 final weights = [3.82290555e-03 9.97573005e-06 2.17010893e-02 1.46033321e-05
 3.57427021e-05 1.86108672e-02 1.88113894e-05]
cost for train data = 1.4436669627198118
cost for test data = 2.714916479186959
BIC: 509.52480321316466


Remove 'mean_smoothness'

In [122]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0003528974459961783 	 final weights = [3.69148201e-03 8.72850043e-06 2.08469135e-02 1.41939738e-05
 3.31566111e-05 1.86006701e-02 4.75777449e-03]
cost for train data = 1.4428057924701554
cost for test data = 2.718255659787015
BIC: 509.4382826044525


Remove 'lymph_node_status'

In [124]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.00036044760745601503 	 final weights = [3.75240851e-03 1.00411339e-05 2.12108869e-02 1.47490751e-05
 3.52497827e-05 1.90647578e-05 4.81085717e-03]
cost for train data = 1.5104632996841465
cost for test data = 2.91331494650073
BIC: 516.0831598998583


Remove 'worst_symmetry'

In [125]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0003529006421880398 	 final weights = [3.69149248e-03 8.72943185e-06 2.08469828e-02 1.41942589e-05
 1.86006752e-02 1.81474553e-05 4.75779045e-03]
cost for train data = 1.4428056151342041
cost for test data = 2.7182564907464397
BIC: 509.4382647824319


Remove 'mean_fractal_dimension'

In [126]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_perimeter', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0003528971985551293 	 final weights = [3.69148085e-03 8.72844361e-06 2.08469057e-02 3.31565099e-05
 1.86006693e-02 1.81470174e-05 4.75777326e-03]
cost for train data = 1.4428058052205013
cost for test data = 2.7182555690100636
BIC: 509.4382838858447


Remove 'mean_perimeter'

In [127]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0033757057638676546 	 final weights = [0.05832819 0.00059405 0.00020382 0.00099473 0.02553939 0.00033012
 0.07058408]
cost for train data = 1.408563974574644
cost for test data = 2.734188425083715
BIC: 505.955533822


Remove 'mean_symmetry'

In [128]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.00035289779366688386 	 final weights = [3.69148179e-03 2.08469127e-02 1.41940088e-05 3.31568835e-05
 1.86006705e-02 1.81471005e-05 4.75777392e-03]
cost for train data = 1.4428057479852026
cost for test data = 2.718255558331449
BIC: 509.4382781337758


Remove 'mean_radius'

In [129]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_symmetry', 'mean_perimeter', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.00035770726814696966 	 final weights = [9.58228430e-06 2.13861730e-02 1.44730622e-05 3.45878450e-05
 1.86128852e-02 1.86068878e-05 4.85758262e-03]
cost for train data = 1.443136640878513
cost for test data = 2.7179984747386183
BIC: 509.47152860331073


Since removing 'mean_perimeter' results in least BIC we are removing that feature from the model

#### Step 6
Forward step (add each of the 3 removed model and compare with 7 predictor model) 

Add worst_area

In [130]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'worst_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -2.3052274482595947e+113 	 final weights = [-4.31877956e+114 -4.44013004e+112 -1.43005418e+112 -7.33580801e+112
 -5.72554267e+113 -2.36414011e+112 -5.33124310e+114 -3.93775084e+116]
cost for train data = 1.9011902185278147e+239
cost for test data = 1.565668062030281e+239
BIC: 80345.52879354314


Add 'mean_area'

In [131]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'mean_area', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = -9.653695090998901e+95 	 final weights = [-1.80211639e+97 -1.85973986e+95 -5.98731551e+94 -3.05431279e+95
 -2.40680766e+96 -9.90171542e+94 -2.19838223e+97 -1.07533719e+99]
cost for train data = 6.296234428820404e+203
cost for test data = 6.092829896001675e+203
BIC: 68499.66799361396


In [132]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius','mean_perimeter', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0003528969713991515 	 final weights = [3.69147980e-03 8.72839265e-06 1.41939336e-05 3.31564123e-05
 1.86006687e-02 1.81469889e-05 4.75777209e-03 2.08468986e-02]
cost for train data = 1.4428057992878123
cost for test data = 2.7182554659265104
BIC: 509.43828328961763


Since adding the removed features does not improve the BIC score we move on to the next step

#### Step 7

Remove 'worst_radius'

In [133]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.006612322651295565 	 final weights = [0.11699446 0.00121862 0.00040633 0.00203407 0.03308894 0.00066308]
cost for train data = 1.5063029561682175
cost for test data = 3.07268172332279
BIC: 515.6832282239659


Remove 'mean_smoothness'

In [134]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.006612322651295565 	 final weights = [0.11699446 0.00121862 0.00040633 0.00203407 0.03308894 0.00066308]
cost for train data = 1.5063029561682175
cost for test data = 3.07268172332279
BIC: 515.6832282239659


Remove 'lymph_node_status'

In [135]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.003446864918259788 	 final weights = [0.05949498 0.00060758 0.00020841 0.00101691 0.00033768 0.07196272]
cost for train data = 1.500044495046233
cost for test data = 3.0182876707422017
BIC: 515.079520110834


Remove 'worst_symmetry'

In [136]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0033761207372361955 	 final weights = [0.05833476 0.00059414 0.00020385 0.02554033 0.00033017 0.07059208]
cost for train data = 1.4085550700023655
cost for test data = 2.7342372502039813
BIC: 505.9546171671054


Remove 'mean_fractal_dimension'

In [137]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.003375722395664869 	 final weights = [0.05832845 0.00059405 0.00099474 0.02553943 0.00033013 0.0705844 ]
cost for train data = 1.408563981556191
cost for test data = 2.7341905932219173
BIC: 505.95553454069244


Remove 'mean_symmetry'

In [138]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0033758532217966087 	 final weights = [0.05833058 0.00020383 0.00099478 0.02553973 0.00033014 0.07058696]
cost for train data = 1.4085600104240392
cost for test data = 2.7342047800950255
BIC: 505.9551257449393


Remove 'mean_radius'

In [139]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.005144626213294888 	 final weights = [0.00093487 0.00031431 0.00155638 0.02973212 0.0005119  0.10914453]
cost for train data = 1.4265219636087012
cost for test data = 2.905855786304988
BIC: 507.7924751297801


Since none of the removal result in better model. I am going with 7 predictor model which includes features: 'mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius',

### Question 3 c

**model comparision**

final model from backward stepwise

In [141]:
#the final column will be your target value when you use 'seperate_target(data) method
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.003537260257371273 	 final weights = [0.05869724 0.00056853 0.00020645 0.00096274 0.04096097 0.00032939
 0.07046306]
cost for train data = 1.3604815492065987
cost for test data = 2.5553141133176283
BIC: 500.91939233055797


Final model from forward stepwise

In [142]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_fractal_dimension', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.007906412852465121 	 final weights = [0.13812438 0.05019468 0.0014123  0.00047995 0.00077893]
cost for train data = 1.33702992151285
cost for test data = 2.4930936537716972
BIC: 498.39812654286203


From the comparision of BIC as well as cost the model obtained from forward stepwise regression is performing better

### Question 3 d

Model from Question 2

In [147]:
columns = ['mean_texture', 'lymph_node_status', 'tumor_size'] #the final column will be your target column when you use 'seperate_target(data) method'
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.012535865642508252 	 final weights = [0.09669292 0.15458462]
cost for train data = 1.3228053271253322
cost for test data = 1.831997478927214
BIC: 496.8472140561521


Model from Question 3c

In [146]:
columns = ['mean_radius', 'lymph_node_status', 'mean_symmetry', 'mean_fractal_dimension', 'mean_smoothness', 'tumor_size'] #the final column will be your target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.007906412852465121 	 final weights = [0.13812438 0.05019468 0.0014123  0.00047995 0.00077893]
cost for train data = 1.33702992151285
cost for test data = 2.4930936537716972
BIC: 498.39812654286203


## Question 4

#### a) Regularization

In [177]:
columns = ['mean_texture', 'lymph_node_status', 'tumor_size'] #the final column will be your target column when you use 'seperate_target(data) method'
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
lam = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.regularized_gradient_descent(features, target, bias, weights, lam)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37
Iteration: 0	 Cost: 2.353	 Bias: 0.003	 Weight: [0.06 0.01]
Iteration: 10	 Cost: 1.494	 Bias: 0.006	 Weight: [0.109 0.048]
Iteration: 20	 Cost: 1.430	 Bias: 0.006	 Weight: [0.106 0.073]
Iteration: 30	 Cost: 1.389	 Bias: 0.007	 Weight: [0.104 0.093]
Iteration: 40	 Cost: 1.364	 Bias: 0.008	 Weight: [0.102 0.108]
Iteration: 50	 Cost: 1.349	 Bias: 0.009	 Weight: [0.101 0.121]
Iteration: 60	 Cost: 1.339	 Bias: 0.010	 Weight: [0.099 0.13 ]
Iteration: 70	 Cost: 1.332	 Bias: 0.011	 Weight: [0.099 0.138]
Iteration: 80	 Cost: 1.328	 Bias: 0.011	 Weight: [0.098 0.144]
Iteration: 90	 Cost: 1.326	 Bias: 0.012	 Weight: [0.097 0.148]
Iteration: 99	 Cost: 1.324	 Bias: 0.013	 Weight: [0.097 0.151]


final bias = 0.012852630863148579 	 final weights = [0.09688136 0.15146141]
cost for train data = 1.323995841745698
cost for test data = 1.8416977312427831
BIC: 496.9776542521326


**not performing better than model of question 2**

#### b) Feature Scaling

In [188]:
#first seperate columns by passing column names and after required columns are obtained from the data we split 80% of data as training data
#index of the columns defined will be the same index on the array as well
columns = ['mean_texture', 'lymph_node_status', 'tumor_size']#the final column will be the target value when you use 'seperate_target(data) method
sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)

# now seperate features and target as two seperate arrays with each features index corresponding to target index
features, target = df.seperate_target(train_data)
scaled_features = df.scale_features(features)

test_features, test_target = df.seperate_target(test_data)
scaled_test_features = df.scale_features(test_features)

total training rows: 145
total testing rows: 37


In [190]:
bias = 0
weights = np.zeros(scaled_features.shape[1])
learning_rate = 0.0001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(scaled_features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(scaled_features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(scaled_test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(scaled_features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")



final bias = 0.026331508580144967 	 final weights = [4.15278206e-05 1.38796268e-03]
cost for train data = 4.972647652174251
cost for test data = 8.629522301546837
BIC: 688.8558799442048


Feature scale even worsened the performance of the model from 3 d

## Extra test

Lets remove the extra features one by one that we ended up with while comparing to forward stepwise regression

remove worst_symmetry

In [191]:
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'lymph_node_status', 'mean_smoothness', 'worst_radius', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 50
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.0033761207372361955 	 final weights = [0.05833476 0.00059414 0.00020385 0.02554033 0.00033017 0.07059208]
cost for train data = 1.4085550700023655
cost for test data = 2.7342372502039813
BIC: 505.9546171671054


Remove worst_radius

In [192]:
columns = ['mean_radius', 'mean_symmetry', 'mean_fractal_dimension', 'worst_symmetry', 'lymph_node_status', 'mean_smoothness', 'tumor_size'] 

sprtd_data = df.seperate_columns(data, columns)

split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.0001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
bic = calculate_bic(k, n, MSE_train)
print(f"BIC: {bic}")

total training rows: 145
total testing rows: 37


final bias = 0.007904191214125781 	 final weights = [0.13808835 0.00141185 0.0004798  0.00237189 0.05018994 0.0007787 ]
cost for train data = 1.3370522151813151
cost for test data = 2.492977379102992
BIC: 498.4005442561275


Still, the model from forward stepwise regression performs better than thhe model from backward stepwise regression