#### Breast Cancer Prediction 


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
data = pd.read_csv('../database/breast_cancer_wisconsin.csv')
print(data.head(10))


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   
5    843786         M        12.45         15.70           82.57      477.1   
6    844359         M        18.25         19.98          119.60     1040.0   
7  84458202         M        13.71         20.83           90.20      577.9   
8    844981         M        13.00         21.82           87.50      519.8   
9  84501001         M        12.46         24.04           83.97      475.9   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760         0.30010 

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [13]:
# drop the id column and the unnamed: 32
data.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)
# 'M' stands for malign thus convert the column to a numerical value
data['diagnosis'] = [1 if row == 'M' else 0 for row in data['diagnosis']]

y = data['diagnosis'].values
X = data.drop(['diagnosis'], axis=1)

# Normalizing the data
X_Norm = (X - np.min(X)) / (np.max(X) - np.min(X))

# splitting the data for training and test
from sklearn.model_selection import train_test_split
xtrain, ytrain, xtest, ytest = train_test_split(X_Norm, y, test_size=.2, random_state=42)
xtrain = xtrain.T
xtest = xtest.T
ytrain = ytrain.T
ytest = ytest.T

class Predictor():
    def __init__(self, dimension, learning_rate=.01, iterations=1000):
        self.theta = np.full((dimension, 1), 0.01)
        self.bias = 0.0
        self.iterations = iterations
        self.learning_rate = learning_rate


    
    def sigmoid(value):
        return 1 / (1 + np.exp(-value))
    

    # Forward - Backward propagation
    def forward_backward(self, X: np.ndarray, y:np.ndarray):
        linear_pred = np.dot(self.theta.T, X) + self.bias
        pred = Predictor.sigmoid(linear_pred)
        loss = - pred * np.log(pred) - (1 - y)*np.log(1- pred)
        cost = np.sum(loss) / X.shape[1]

        # backward propagation
        dtheta = np.dot(X, (pred - y).T) / X.shape[1]
        dbias = np.dot(X, (pred-y).T) / X.shape[1]
        gradient = {
            'derivative theta': dtheta,
            'detivative bias': dbias
        }
        return cost, gradient

    def fit_params(self, X:np.ndarray, y:np.ndarray):
        cost_list = []
        cost_list2 = []
        index = []

        # updating params
        for _ in range(self.iterations):
            cost, gradient = self.forward_backward(X, y)
            cost_list.append(cost)

            # update 
            self.theta = self.theta - self.learning_rate * gradient['derivative_theta']
            self.bias = self.bias - self.learning_rate * gradient['derivative_bias']

            if _ % 10 == 0:
                cost_list2.append(cost)
                index.append(_)
                print(f"Cost after iteration {_}: {cost}")
            
        plt.plot(index, cost_list2)
        plt.xticks(index, rotation='vertical')
        plt.xlabel('Number of Iterations')
        plt.ylabel('Cost')
        plt.show()
        return gradient, cost_list

    def predict(self, X:np.ndarray):
        linear_val = np.dot(self.theta.T, X) + self.bias
        continuous_val = Predictor.sigmoid(linear_val)
        pred = np.zeros((1, X.shape[1]))
        for _ in range(continuous_val.shape[1]):
            if continuous_val[0, _] <= 0.5:
                pred[0, _] = 0
            else:
                pred[0, _] = 1
        return pred

def main():
    model = Predictor(xtrain.shape[0])
    
    gradients, cost = model.fit_params(xtrain, ytrain)
    pred = model.predict(xtest)

    print(f"test accuracy {(100 - np.mean(np.abs(pred - ytest))) * 100}")

main()


ValueError: Unable to coerce to DataFrame, shape must be (30, 114): given (1, 455)