# Dataset information

Wisconsin Breast Cancer Database (January 8, 1991)

This breast cancer database was obtained from the University of Wisconsin Hospitals, Madison from Dr. William H.Wolberg.


## Attributes
Attributes 2 through 10 have been used to represent instances. Each instance has one of 2 possible classes: benign or malignant.


### Citations
1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 1990, pp 22-30.

4. K. P. Bennett & O. L. Mangasarian: "Robust linear programming 
      discrimination of two linearly inseparable sets", Optimization Methods
      and Software 1, 1992, 23-34 (Gordon & Breach Science Publishers).




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore log/zero issue 
np.seterr(divide = 'ignore') 

data = pd.read_csv("bcw.csv", sep=',', 
                  names=["Id_number", "Clump_thickness", "Uniformity_of_cell_size", "Uniformity_of_cell_shape",'Marginal_Adhesion','Single_Epithelial_cell_size','Bare_Nuclei','Bland_Chromatin','Normal_Nucleoli','Mitoses','Class'])

data.head()

Unnamed: 0,Id_number,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_Adhesion,Single_Epithelial_cell_size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [2]:
# Removing the ? values
data = data.replace(to_replace=['?'],value=0)
# 2 = Benign -> now 0
data = data.replace(to_replace=[2],value=0)
# 4 = Malignant -> now 1 
data = data.replace(to_replace=[4],value=1)

data.describe()

Unnamed: 0,Id_number,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_Adhesion,Single_Epithelial_cell_size,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,3.93133,2.834049,2.849785,2.499285,1.905579,2.79113,2.686695,1.437768,0.344778
std,617095.7,3.160624,3.160635,3.117385,2.957374,2.894954,2.858032,3.119948,1.715446,0.475636
min,61634.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,870688.5,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
50%,1171710.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
75%,1238298.0,6.0,5.0,5.0,3.0,3.0,5.0,3.0,1.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [3]:
# Array it
data = np.array(data)
# Make all values int
data =data.astype(int)

X = data[:,1:10]
y = data[:,10:11]

# Add a column of 1's (x_0) to X
m = y.size
x_0 = np.ones((m,1))
X = np.hstack((x_0, X))

In [4]:
# Split data, 20% for testing
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [5]:
# Checking accuracy w/sklearn.metrics

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

def accuracy_check(true_y, pred_y):
    results = confusion_matrix(true_y, pred_y) 
    print ('Confusion Matrix :')
    print(results) 
    print ('Accuracy Score :',accuracy_score(true_y, pred_y))
    print ('Report : ')
    print (classification_report(true_y, pred_y))

In [6]:
class logistic_regression:
    ''' Performs logistic regression, requires intercept columns to be added to X (A columns of 1's).
        Main -> .fit / .make_prediction -> Self explanatory :)
    '''
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def sigmoid(self,z):
        # Sigmoid function
        return 1/(1+np.exp(-z))
    
    def loss(self, y_pred, y):
        # Cross Entropy Loss
        return(-y * np.log(y_pred) - (1-y) * np.log(1-y_pred)).mean()
    
    def fit(self, X,y):
        # Initialize coefficients 
        self.theta = np.ones(X.shape[1])
        
        # Stores previous costs
        lst = []
        
        for i in range(0,self.iterations):
            
            # Generate hypothesis
            z = np.dot(X, self.theta)
            y_pred = self.sigmoid(z)
            
            # Loss 
            loss = y_pred - y.ravel()
            
            # Gradient - Gradient Descent
            gradient = (1/m) * np.dot(X.T, loss)
            
            # Updating Coefficients - notice the Gradient Ascent being inverted (taking a step back)
            self.theta = self.theta - self.learning_rate * gradient
            
            # Error / Cost
            error = (-y * np.log(y_pred)) - ((1-y)*np.log(1-y_pred))
            cost = 1/m * np.sum(error)
            
            # Prints cost every now and then
            if i%1000 ==0:
                print("Cost:", cost)
            
            # Index for list
            b = i - 1
            lst.append(cost)
            
            # If the last item in the list is less than the cost, convergence has been achieved.
            if lst[b]<cost:break
            

                
        return self.theta, np.where(y_pred <= 0.62, 0, 1)
    
    def make_prediction(self, X, coeff):
        z = np.dot(X, coeff)
        h =  1/(1+np.exp(-z))
        # 0.62 is arbitrary - usually the decision boundary is 0.5
        return np.where(h <= 0.62, 0, 1)
    


In [7]:
# Initialize Object
clf = logistic_regression(learning_rate=0.001, iterations = 9000)

# Getting the adjusted parameters and Y_pred to test accuracy
coeff, y_predicted_train = clf.fit(xTrain, yTrain)

# Getting 
y_predicted_test = clf.make_prediction(xTest, coeff)



Cost: nan
Cost: 2248.818076692533
Cost: 574.6715685471518
Cost: 505.68176647916187
Cost: 470.95572132249924
Cost: 456.73338491488397


In [8]:
print("Accuracy on train data:")
accuracy_check(yTrain, y_predicted_train)

Accuracy on train data:
Confusion Matrix :
[[348  25]
 [ 33 153]]
Accuracy Score : 0.8962432915921288
Report : 
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       373
           1       0.86      0.82      0.84       186

   micro avg       0.90      0.90      0.90       559
   macro avg       0.89      0.88      0.88       559
weighted avg       0.90      0.90      0.90       559



In [9]:
print("Accuracy on test data:")
accuracy_check(yTest, y_predicted_test)

Accuracy on test data:
Confusion Matrix :
[[79  6]
 [13 42]]
Accuracy Score : 0.8642857142857143
Report : 
              precision    recall  f1-score   support

           0       0.86      0.93      0.89        85
           1       0.88      0.76      0.82        55

   micro avg       0.86      0.86      0.86       140
   macro avg       0.87      0.85      0.85       140
weighted avg       0.87      0.86      0.86       140



# Sklearn Implementation



In [10]:
from sklearn.linear_model import LogisticRegression

# Import data
X2 = data[:,1:10]
y2 = data[:,10:11]

xTrain, xTest, yTrain, yTest = train_test_split(X2, y2, test_size = 0.2, random_state = 0)

# Initialize Object
logisticRegr = LogisticRegression()


logisticRegr.fit(xTrain, yTrain.ravel())

train_predictions = logisticRegr.predict(xTrain)
test_predictions = logisticRegr.predict(xTest)



In [11]:
# Accuracy on train data
accuracy_check(yTrain, train_predictions)

Confusion Matrix :
[[367   6]
 [ 12 174]]
Accuracy Score : 0.9677996422182469
Report : 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       373
           1       0.97      0.94      0.95       186

   micro avg       0.97      0.97      0.97       559
   macro avg       0.97      0.96      0.96       559
weighted avg       0.97      0.97      0.97       559



In [12]:
# Accuracy on train data
accuracy_check(yTest, test_predictions)

Confusion Matrix :
[[83  2]
 [ 1 54]]
Accuracy Score : 0.9785714285714285
Report : 
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        85
           1       0.96      0.98      0.97        55

   micro avg       0.98      0.98      0.98       140
   macro avg       0.98      0.98      0.98       140
weighted avg       0.98      0.98      0.98       140

