In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from decimal import Decimal

In [2]:
class ManipulateData:
    """Load a CSV file and prepare it for training a regression model.

    The helpers cover the whole preparation pipeline: reading and cleaning the
    file, selecting columns, splitting rows into train/test partitions, and
    separating the target column from the feature columns.
    """

    def __init__(self, file):
        """Remember the CSV source.

        Args:
            file: Path (or file-like object) handed to ``pandas.read_csv``.
        """
        self.file = file

    def filter_data(self, random_state=None):
        """Read the CSV, drop rows containing null values and shuffle the rows.

        Args:
            random_state: Optional seed forwarded to ``DataFrame.sample`` so the
                shuffle can be reproduced. ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            A shuffled ``DataFrame`` without null-containing rows.
        """
        data = pd.read_csv(self.file).dropna()
        # frac=1 samples every row exactly once, i.e. a row shuffle.
        return data.sample(frac=1, random_state=random_state)

    def seperate_columns(self, data, columns):
        """Return only ``columns`` of ``data``, in the order given."""
        return data[columns]

    def split_data(self, data, percent, random_state=None):
        """Shuffle the rows of ``data`` and split them into train/test arrays.

        Args:
            data: ``DataFrame`` to split.
            percent: Fraction of rows placed in the training partition; the
                training size is ``floor(len(data) * percent)``.
            random_state: Optional integer seed for a reproducible shuffle.
                ``None`` uses NumPy's global random state, as before.

        Returns:
            ``(train_data, test_data)`` as 2-D NumPy arrays.
        """
        # copy=True is essential: without it the array may be a view of the
        # DataFrame, so the in-place shuffle would either reorder the caller's
        # frame or fail on a read-only view (pandas Copy-on-Write).
        data_array = data.to_numpy(copy=True)
        if random_state is None:
            np.random.shuffle(data_array)
        else:
            np.random.RandomState(random_state).shuffle(data_array)

        train_size = math.floor(len(data_array) * percent)
        train_data, test_data = data_array[:train_size, :], data_array[train_size:, :]
        train_length = train_data.shape[0]
        test_length = test_data.shape[0]
        print(f"total training rows: {train_length}\ntotal testing rows: {test_length}")
        return train_data, test_data

    def seperate_target(self, data):
        """Split a 2-D array into features (all but last column) and target.

        Args:
            data: 2-D array whose final column holds the target values.

        Returns:
            ``(features, target)`` as new arrays of shape ``(rows, cols - 1)``
            and ``(rows,)``. Empty input keeps the ``(0, cols - 1)`` feature
            shape.
        """
        data = np.asarray(data)
        # np.array(...) copies the slices so the results never alias ``data``.
        return np.array(data[:, :-1]), np.array(data[:, -1])
        
    

In [3]:
class MSE:
    """Squared-error cost for a linear model with bias ``th0`` and weights ``thj``."""

    def calculate_cost(self, X_train, Y_train, th0, thj):
        """Return the summed squared prediction error divided by ``2 * m``.

        Args:
            X_train: Feature matrix with one row per sample.
            Y_train: Target value for each row of ``X_train``.
            th0: Bias term added to every prediction.
            thj: Weight vector multiplied with the feature columns.

        Returns:
            The cost, where ``m`` is the number of rows in ``X_train``.
        """
        n_rows = X_train.shape[0]
        # Vectorised prediction for all rows at once, then the per-row error.
        residuals = (th0 + np.dot(X_train, thj)) - Y_train
        return np.sum(residuals ** 2) / (2 * n_rows)
        

In [4]:
class LinearRegression(MSE):
    """Linear regression fitted with batch gradient descent.

    Inherits ``calculate_cost`` from ``MSE`` to report progress while fitting.
    """

    def __init__(self, alpha, iterations):
        """Store the hyper-parameters.

        Args:
            alpha: Learning rate applied to every gradient step.
            iterations: Number of gradient-descent steps to run.
        """
        super().__init__()
        self.iterations = iterations
        self.alpha = alpha

    def calculate_gradient(self, X_train, Y_train, th0, thj):
        """Return the cost gradients with respect to the bias and the weights.

        Args:
            X_train: Feature matrix with one row per sample.
            Y_train: Target value for each row.
            th0: Current bias.
            thj: Current weight vector.

        Returns:
            ``(gradient_th0, gradient_thj)``: the mean residual, and the
            residual-weighted column sums of ``X_train`` divided by the row
            count.
        """
        n_rows = X_train.shape[0]
        residuals = (th0 + np.dot(X_train, thj)) - Y_train
        return np.sum(residuals) / n_rows, np.dot(residuals, X_train) / n_rows

    def gradient_descent(self, X_train, Y_train, th0, thj):
        """Run ``self.iterations`` gradient steps starting from ``th0``/``thj``.

        Prints the cost and parameters on every 10th iteration and on the
        final one.

        Returns:
            The fitted ``(th0, thj)``.
        """
        last_step = self.iterations - 1

        for i in range(self.iterations):
            grad_th0, grad_thj = self.calculate_gradient(X_train, Y_train, th0, thj)
            # Rebind instead of updating in place so the caller's initial
            # weight array is never modified.
            th0 = th0 - (self.alpha * grad_th0)
            thj = thj - (self.alpha * grad_thj)

            should_log = (i == last_step) or (i % 10 == 0)
            if should_log:
                current_cost = self.calculate_cost(X_train, Y_train, th0, thj)
                print(f"Iteration: {i}\t Cost: {current_cost:.3f}\t Bias: {th0:.3f}\t Weight: {np.round(thj, 3)}")

        return th0, thj
            
        

In [5]:
file = 'Cancer_dataset.csv'

# Seed NumPy's global random state before any shuffling happens. Both the
# unseeded DataFrame.sample() in filter_data() and np.random.shuffle() in
# split_data() draw from it, so a fresh "Restart & Run All" now reproduces the
# same train/test splits and costs.
SEED = 42
np.random.seed(SEED)

# NOTE: `df` is the ManipulateData helper, not a DataFrame; the loaded frame is `data`.
df = ManipulateData(file)
data = df.filter_data() #cleans out rows containing null values and shuffles the data

## Question 1

In [6]:
# Question 1: single-feature model predicting 'tumor_size' from 'mean_texture'.
# The final entry of `columns` is the target, because seperate_target() splits
# off the last column.
columns = ['mean_texture', 'tumor_size']
sprtd_data = df.seperate_columns(data, columns)

# Fraction of rows that go to the training partition; the rest become test rows.
split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
# Separate each partition into its feature matrix and target vector.
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [7]:
# Start gradient descent from a zero bias and one zero weight per feature column.
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 200
fit = LinearRegression(learning_rate, iterations)
# gradient_descent() prints progress every 10th iteration and returns the fitted parameters.
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 2.327	 Bias: 0.003	 Weight: [0.061]
Iteration: 10	 Cost: 1.526	 Bias: 0.006	 Weight: [0.116]
Iteration: 20	 Cost: 1.526	 Bias: 0.006	 Weight: [0.116]
Iteration: 30	 Cost: 1.526	 Bias: 0.007	 Weight: [0.116]
Iteration: 40	 Cost: 1.526	 Bias: 0.008	 Weight: [0.116]
Iteration: 50	 Cost: 1.526	 Bias: 0.009	 Weight: [0.116]
Iteration: 60	 Cost: 1.526	 Bias: 0.009	 Weight: [0.116]
Iteration: 70	 Cost: 1.526	 Bias: 0.010	 Weight: [0.116]
Iteration: 80	 Cost: 1.526	 Bias: 0.011	 Weight: [0.116]
Iteration: 90	 Cost: 1.526	 Bias: 0.011	 Weight: [0.116]
Iteration: 100	 Cost: 1.526	 Bias: 0.012	 Weight: [0.116]
Iteration: 110	 Cost: 1.526	 Bias: 0.013	 Weight: [0.116]
Iteration: 120	 Cost: 1.526	 Bias: 0.013	 Weight: [0.116]
Iteration: 130	 Cost: 1.526	 Bias: 0.014	 Weight: [0.116]
Iteration: 140	 Cost: 1.526	 Bias: 0.015	 Weight: [0.116]
Iteration: 150	 Cost: 1.525	 Bias: 0.016	 Weight: [0.116]
Iteration: 160	 Cost: 1.525	 Bias: 0.016	 Weight: [0.116]
Iteration: 170	 Cost: 1.5

In [8]:
# Evaluate the fitted parameters on the training and the held-out test rows.
# calculate_cost() divides the summed squared error by 2*m, so the reported
# values are half of the conventional mean squared error.
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.5252175116261217
cost for test data = 3.3619999842979245


## Question 2

In [9]:
# Question 2: two-feature model ('mean_texture', 'lymph_node_status') for 'tumor_size'.
# The final entry of `columns` is the target column taken by seperate_target().
columns = ['mean_texture', 'lymph_node_status', 'tumor_size']
sprtd_data = df.seperate_columns(data, columns)

# split_data() reshuffles the rows on every call, so this split differs from
# the one used in the previous question.
split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [10]:
# Re-initialise the parameters: zero bias and one zero weight per feature column.
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
# Fit on the training rows; progress is printed every 10th iteration.
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 2.981	 Bias: 0.003	 Weight: [0.068 0.015]
Iteration: 10	 Cost: 1.812	 Bias: 0.006	 Weight: [0.12 0.07]
Iteration: 20	 Cost: 1.686	 Bias: 0.007	 Weight: [0.115 0.105]
Iteration: 30	 Cost: 1.613	 Bias: 0.007	 Weight: [0.111 0.132]
Iteration: 40	 Cost: 1.571	 Bias: 0.008	 Weight: [0.108 0.152]
Iteration: 50	 Cost: 1.547	 Bias: 0.009	 Weight: [0.106 0.168]
Iteration: 60	 Cost: 1.533	 Bias: 0.010	 Weight: [0.104 0.18 ]
Iteration: 70	 Cost: 1.525	 Bias: 0.010	 Weight: [0.103 0.188]
Iteration: 80	 Cost: 1.520	 Bias: 0.011	 Weight: [0.102 0.195]
Iteration: 90	 Cost: 1.517	 Bias: 0.012	 Weight: [0.101 0.2  ]
Iteration: 99	 Cost: 1.516	 Bias: 0.013	 Weight: [0.1   0.204]


final bias = 0.01263073395008496 	 final weights = [0.1002761  0.20394332]


In [11]:
# Cost of the two-feature model on the training and test partitions
# (squared error divided by 2*m, as computed by calculate_cost()).
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.5157785695611707
cost for test data = 1.1074969966172434


## Question 3

### Step 1

In [12]:
# Question 3, step 1: single-feature model predicting 'tumor_size' from 'mean_radius'.
# The final entry of `columns` is the target column taken by seperate_target().
columns = ['mean_radius', 'tumor_size']
sprtd_data = df.seperate_columns(data, columns)

# 80% of the (reshuffled) rows are used for training, the remainder for testing.
split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [13]:
# Zero-initialised parameters; `weights` has one entry per feature column.
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
# Run gradient descent on the training rows and keep the fitted parameters.
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 4.063	 Bias: 0.003	 Weight: [0.053]
Iteration: 10	 Cost: 1.999	 Bias: 0.010	 Weight: [0.164]
Iteration: 20	 Cost: 1.998	 Bias: 0.010	 Weight: [0.167]
Iteration: 30	 Cost: 1.998	 Bias: 0.010	 Weight: [0.167]
Iteration: 40	 Cost: 1.998	 Bias: 0.011	 Weight: [0.167]
Iteration: 50	 Cost: 1.998	 Bias: 0.011	 Weight: [0.167]
Iteration: 60	 Cost: 1.998	 Bias: 0.012	 Weight: [0.167]
Iteration: 70	 Cost: 1.998	 Bias: 0.012	 Weight: [0.167]
Iteration: 80	 Cost: 1.998	 Bias: 0.012	 Weight: [0.167]
Iteration: 90	 Cost: 1.998	 Bias: 0.013	 Weight: [0.167]
Iteration: 99	 Cost: 1.998	 Bias: 0.013	 Weight: [0.167]


final bias = 0.013037751461000552 	 final weights = [0.16667702]


In [14]:
# Cost of the 'mean_radius' model on the training and test partitions
# (squared error divided by 2*m, as computed by calculate_cost()).
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.99779128559342
cost for test data = 0.9268725548302201


### Step 2

In [15]:
# Question 3, step 2: add 'mean_smoothness' as a second feature next to 'mean_radius'.
# The final entry of `columns` is the target column taken by seperate_target().
columns = ['mean_radius', 'mean_smoothness', 'tumor_size']
sprtd_data = df.seperate_columns(data, columns)

# A new random split is drawn here, so costs are not measured on the same rows
# as in step 1.
split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [16]:
# Zero-initialised parameters for the two-feature model.
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
# Fit on the training rows; the loop reports the cost every 10th iteration.
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 3.597	 Bias: 0.003	 Weight: [0.05 0.  ]
Iteration: 10	 Cost: 1.753	 Bias: 0.009	 Weight: [0.155 0.001]
Iteration: 20	 Cost: 1.752	 Bias: 0.009	 Weight: [0.158 0.001]
Iteration: 30	 Cost: 1.752	 Bias: 0.010	 Weight: [0.158 0.001]
Iteration: 40	 Cost: 1.752	 Bias: 0.010	 Weight: [0.158 0.001]
Iteration: 50	 Cost: 1.752	 Bias: 0.010	 Weight: [0.158 0.001]
Iteration: 60	 Cost: 1.752	 Bias: 0.010	 Weight: [0.158 0.001]
Iteration: 70	 Cost: 1.752	 Bias: 0.011	 Weight: [0.158 0.001]
Iteration: 80	 Cost: 1.752	 Bias: 0.011	 Weight: [0.158 0.001]
Iteration: 90	 Cost: 1.752	 Bias: 0.011	 Weight: [0.158 0.001]
Iteration: 99	 Cost: 1.752	 Bias: 0.011	 Weight: [0.158 0.001]


final bias = 0.011428564434892615 	 final weights = [0.15760473 0.00095352]


In [17]:
# Cost of the 'mean_radius' + 'mean_smoothness' model on both partitions
# (squared error divided by 2*m, as computed by calculate_cost()).
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.7523892144314996
cost for test data = 1.8433812829175584


In [18]:
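# Okay, adding a second feature in Step 2 (the cell above uses 'mean_smoothness', not 'mean_perimeter') did not improve the model: the test cost got worse, so a larger set of features is tried next.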

In [19]:
# Question 3, extended model: six feature columns followed by the target.
# The final entry of `columns` ('tumor_size') is the target column taken by
# seperate_target().
columns = ['mean_radius', 'mean_smoothness', 'mean_symmetry', 'worst_radius', 'worst_symmetry', 'lymph_node_status', 'tumor_size']
sprtd_data = df.seperate_columns(data, columns)

# 80/20 train/test split on freshly shuffled rows.
split_percent = 0.8
train_data, test_data = df.split_data(sprtd_data, split_percent)
features, target = df.seperate_target(train_data)
test_features, test_target = df.seperate_target(test_data)

total training rows: 145
total testing rows: 37


In [20]:
# Zero-initialised parameters; `weights` gets one entry per feature column.
bias = 0
weights = np.zeros(features.shape[1])
learning_rate = 0.001
iterations = 100
fit = LinearRegression(learning_rate, iterations)
# Fit the multi-feature model on the training rows.
prdtd_bias, prdtd_weights = fit.gradient_descent(features, target, bias, weights)
print(f"\n\nfinal bias = {prdtd_bias} \t final weights = {prdtd_weights}")

Iteration: 0	 Cost: 1.941	 Bias: 0.003	 Weight: [0.05  0.    0.001 0.061 0.001 0.016]
Iteration: 10	 Cost: 1.569	 Bias: 0.004	 Weight: [0.06  0.    0.001 0.071 0.001 0.063]
Iteration: 20	 Cost: 1.466	 Bias: 0.004	 Weight: [0.058 0.    0.001 0.068 0.001 0.095]
Iteration: 30	 Cost: 1.414	 Bias: 0.004	 Weight: [0.057 0.    0.001 0.065 0.001 0.118]
Iteration: 40	 Cost: 1.387	 Bias: 0.005	 Weight: [0.056 0.    0.001 0.063 0.001 0.134]
Iteration: 50	 Cost: 1.373	 Bias: 0.005	 Weight: [0.055 0.    0.001 0.062 0.001 0.146]
Iteration: 60	 Cost: 1.366	 Bias: 0.005	 Weight: [0.054 0.    0.001 0.061 0.001 0.154]
Iteration: 70	 Cost: 1.362	 Bias: 0.006	 Weight: [0.054 0.    0.001 0.06  0.001 0.16 ]
Iteration: 80	 Cost: 1.361	 Bias: 0.006	 Weight: [0.054 0.    0.001 0.06  0.001 0.164]
Iteration: 90	 Cost: 1.360	 Bias: 0.006	 Weight: [0.054 0.    0.001 0.06  0.001 0.167]
Iteration: 99	 Cost: 1.359	 Bias: 0.007	 Weight: [0.053 0.    0.001 0.059 0.001 0.169]


final bias = 0.0067631143949097245 	 final

In [21]:
# Cost of the multi-feature model on the training and test partitions
# (squared error divided by 2*m, as computed by calculate_cost()).
mse = MSE()
MSE_train = mse.calculate_cost(features, target, prdtd_bias, prdtd_weights)
MSE_test = mse.calculate_cost(test_features, test_target, prdtd_bias, prdtd_weights)
print(f"cost for train data = {MSE_train}\ncost for test data = {MSE_test}")

cost for train data = 1.3592339450001873
cost for test data = 1.1680655736097192
