In [189]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets, model_selection, preprocessing

In [190]:
# Load the breast-cancer dataset that ships with scikit-learn and wrap its
# feature matrix in a DataFrame (one named column per feature) for inspection.
# The author also experimented with datasets.load_iris() at this point.
dataset = datasets.load_breast_cancer()
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)

# Preview the first rows of the feature table.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [191]:
# Attach a human-readable class label to every row.
# dataset.target holds one class index per sample; indexing dataset.target_names
# with it looks up the matching class name for each row in a single step.
data['type'] = list(dataset.target_names[dataset.target])

# Preview again: the new "type" column is appended as the last column.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [192]:
X = dataset.data
y = dataset.target

# Number of raw input features; the bias column appended below is NOT counted,
# so the weight vector needs feature_amount + 1 entries.
feature_amount = X.shape[1]

# Hold out 30% of the samples for testing. A fixed random_state makes the
# shuffle (and therefore every number reported further down) reproducible
# under "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise features. The scaler is fitted on the training rows only and the
# same fitted transform is then applied to the test rows.
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Append a constant column of ones so the last weight acts as the intercept.
train_ones = np.ones((X_train.shape[0], 1))
X_train = np.hstack((X_train, train_ones))

test_ones = np.ones((X_test.shape[0], 1))
X_test = np.hstack((X_test, test_ones))

print(X_train.shape)
print(X_test.shape)

# Turn the training labels into an (n_samples, 1) column so they line up with
# the (n_samples, 1) output of X.dot(theta). y_test is intentionally left 1-D.
y_train = y_train.reshape(-1, 1)

(398, 31)
(171, 31)


In [205]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    The direct formula calls exp(-z), which overflows to inf (and emits a
    RuntimeWarning) for large negative z. Here the denominator's logarithm,
    log(1 + exp(-z)), is obtained from np.logaddexp(0, -z), which never forms
    the huge intermediate value, and the result is exp of its negation.

    Parameters
    ----------
    z : scalar or numpy array
        Input score(s).

    Returns
    -------
    Same shape as z, with values in [0, 1].
    """
    return np.exp(-np.logaddexp(0, -z))

def predict(theta, X):
    """Return the model output for every row of X.

    The linear score X.dot(theta) is computed first and then passed through
    sigmoid, so the result has the shape of that product.
    """
    return sigmoid(X.dot(theta))

def get_cost_value(y, y_pred):
    """Mean binary cross-entropy (log loss) between labels and predictions.

    cost = -(1/m) * sum(y * log(p) + (1 - y) * log(1 - p))

    The leading minus sign makes this a true cost: it is non-negative and gets
    smaller as the predictions improve. Predictions are clipped away from
    exactly 0 and 1 before taking logs, so saturated sigmoid outputs give a
    large finite cost instead of inf/nan.

    Parameters
    ----------
    y : array-like
        0/1 labels, either 1-D or an (m, 1) column.
    y_pred : array-like
        Predicted probabilities, either 1-D or an (m, 1) column, with the same
        number of values as y.

    Returns
    -------
    numpy.float64
        The mean cost over all samples.

    Raises
    ------
    ValueError
        If there are no samples, or y and y_pred differ in size.
    """
    # Flatten both inputs so column vectors and 1-D arrays are treated alike.
    labels = np.asarray(y, dtype=float).reshape(-1)
    probs = np.asarray(y_pred, dtype=float).reshape(-1)

    if labels.size == 0:
        raise ValueError("get_cost_value needs at least one sample")
    if labels.shape != probs.shape:
        raise ValueError("y and y_pred must contain the same number of values")

    # Keep log() finite when a prediction has saturated to exactly 0 or 1.
    eps = 1e-15
    probs = np.clip(probs, eps, 1.0 - eps)

    log_likelihood = labels * np.log(probs) + (1.0 - labels) * np.log(1.0 - probs)
    return -np.mean(log_likelihood)

def update(theta, X, y, y_pred, alpha=0.05):
    """Take one gradient step and return the new weights.

    The step direction is X.T.dot(y - y_pred) averaged over the rows of X;
    it is scaled by the learning rate alpha and added to theta. A new array is
    returned; theta itself is not modified.
    """
    residual = y - y_pred
    n_rows = X.shape[0]
    step = X.T.dot(residual) / n_rows
    return theta + alpha * step

def get_accuracy(y, y_pred):
    """Fraction of samples whose thresholded prediction equals the label.

    Each y_pred[i][0] is mapped to class 1 when it is >= 0.5 and to class 0
    otherwise, then compared with y[i]. The number of matches is divided by
    the number of rows in y.
    """
    hits = sum(
        1
        for idx in range(y_pred.shape[0])
        if (1 if y_pred[idx][0] >= 0.5 else 0) == y[idx]
    )
    return hits / y.shape[0]
    
def train(theta, X, y, iterations=1000, alpha=0.05, verbose=True):
    """Run a fixed number of gradient steps and record metrics along the way.

    Parameters
    ----------
    theta : numpy array
        Starting weights; passed to predict() and update().
    X : numpy array
        Training inputs.
    y : numpy array
        Training labels.
    iterations : int
        Number of update steps to perform.
    alpha : float
        Learning rate forwarded to update(). The default matches update()'s
        own default, so existing calls behave as before.
    verbose : bool
        When True (the default, matching the previous behaviour) the iteration
        index and cost are printed on every step. Pass False to keep the
        notebook output short.

    Returns
    -------
    (theta, cost_value_record, accuracy_record)
        The weights after the last step, and two lists with one entry per
        iteration. Metrics are measured BEFORE each step, so the last recorded
        entry describes the weights one step before the returned theta.
    """
    cost_value_record = []
    accuracy_record = []

    for i in range(iterations):
        y_pred = predict(theta, X)
        cost_value = get_cost_value(y, y_pred)
        accuracy = get_accuracy(y, y_pred)

        cost_value_record.append(cost_value)
        accuracy_record.append(accuracy)

        if verbose:
            print(i, ': ', cost_value)

        theta = update(theta, X, y, y_pred, alpha)

    return theta, cost_value_record, accuracy_record

In [206]:
# Draw the starting weights from a seeded generator so that repeated runs of
# the notebook start from the same point. One weight per feature plus one for
# the bias column.
rng = np.random.default_rng(0)
init_theta = rng.standard_normal((feature_amount + 1, 1))

theta, cost_values, accs = train(init_theta, X_train, y_train, 1000)

# Predicted probabilities for the held-out samples, shape (n_test, 1).
y_pred = predict(theta, X_test)
print(y_pred)

# Threshold at 0.5 and count how many test labels are matched.
# y_test is 1-D, so compare against the first (only) column of y_pred.
pred_labels = (y_pred[:, 0] >= 0.5).astype(int)
count = int(np.sum(pred_labels == y_test))
print(count, y_pred.shape[0])

0 :  0.38944723618090454
1 :  0.40703517587939697
2 :  0.43467336683417085
3 :  0.4547738693467337
4 :  0.46733668341708545
5 :  0.48743718592964824
6 :  0.5
7 :  0.5150753768844221
8 :  0.5276381909547738
9 :  0.550251256281407
10 :  0.5527638190954773
11 :  0.5778894472361809
12 :  0.5879396984924623
13 :  0.5904522613065326
14 :  0.6055276381909548
15 :  0.6180904522613065
16 :  0.628140703517588
17 :  0.6381909547738693
18 :  0.6482412060301508
19 :  0.6557788944723618
20 :  0.6608040201005025
21 :  0.6633165829145728
22 :  0.6708542713567839
23 :  0.678391959798995
24 :  0.6884422110552764
25 :  0.6984924623115578
26 :  0.7035175879396985
27 :  0.7035175879396985
28 :  0.7135678391959799
29 :  0.7211055276381909
30 :  0.7286432160804021
31 :  0.7336683417085427
32 :  0.7412060301507538
33 :  0.7512562814070352
34 :  0.7512562814070352
35 :  0.7537688442211056
36 :  0.7562814070351759
37 :  0.7587939698492462
38 :  0.7587939698492462
39 :  0.7613065326633166
40 :  0.766331658291457

335 :  0.957286432160804
336 :  0.957286432160804
337 :  0.957286432160804
338 :  0.957286432160804
339 :  0.957286432160804
340 :  0.957286432160804
341 :  0.957286432160804
342 :  0.957286432160804
343 :  0.957286432160804
344 :  0.957286432160804
345 :  0.957286432160804
346 :  0.957286432160804
347 :  0.957286432160804
348 :  0.957286432160804
349 :  0.957286432160804
350 :  0.957286432160804
351 :  0.957286432160804
352 :  0.957286432160804
353 :  0.957286432160804
354 :  0.957286432160804
355 :  0.957286432160804
356 :  0.957286432160804
357 :  0.957286432160804
358 :  0.9597989949748744
359 :  0.9597989949748744
360 :  0.9597989949748744
361 :  0.9597989949748744
362 :  0.9597989949748744
363 :  0.9597989949748744
364 :  0.9597989949748744
365 :  0.9597989949748744
366 :  0.9597989949748744
367 :  0.9597989949748744
368 :  0.9597989949748744
369 :  0.9597989949748744
370 :  0.9597989949748744
371 :  0.9597989949748744
372 :  0.9597989949748744
373 :  0.9597989949748744
374 :  0.

674 :  0.9698492462311558
675 :  0.9698492462311558
676 :  0.9698492462311558
677 :  0.9698492462311558
678 :  0.9698492462311558
679 :  0.9698492462311558
680 :  0.9698492462311558
681 :  0.9698492462311558
682 :  0.9698492462311558
683 :  0.9698492462311558
684 :  0.9698492462311558
685 :  0.9698492462311558
686 :  0.9698492462311558
687 :  0.9698492462311558
688 :  0.9698492462311558
689 :  0.9698492462311558
690 :  0.9698492462311558
691 :  0.9698492462311558
692 :  0.9698492462311558
693 :  0.9698492462311558
694 :  0.9698492462311558
695 :  0.9698492462311558
696 :  0.9698492462311558
697 :  0.9698492462311558
698 :  0.9698492462311558
699 :  0.9698492462311558
700 :  0.9698492462311558
701 :  0.9698492462311558
702 :  0.9698492462311558
703 :  0.9698492462311558
704 :  0.9698492462311558
705 :  0.9698492462311558
706 :  0.9698492462311558
707 :  0.9698492462311558
708 :  0.9698492462311558
709 :  0.9698492462311558
710 :  0.9698492462311558
711 :  0.9698492462311558
712 :  0.969

989 :  0.9748743718592965
990 :  0.9748743718592965
991 :  0.9748743718592965
992 :  0.9748743718592965
993 :  0.9748743718592965
994 :  0.9748743718592965
995 :  0.9748743718592965
996 :  0.9748743718592965
997 :  0.9748743718592965
998 :  0.9748743718592965
999 :  0.9748743718592965
[[6.26366663e-01]
 [2.27524542e-01]
 [2.82970600e-02]
 [3.41540315e-05]
 [9.94294803e-01]
 [9.48409466e-01]
 [1.27153254e-03]
 [9.97801193e-01]
 [9.98613473e-01]
 [9.99916159e-01]
 [9.96186309e-01]
 [9.37879790e-01]
 [1.38304829e-01]
 [9.99823955e-01]
 [5.26387741e-06]
 [9.85069301e-01]
 [4.29778464e-06]
 [9.99999879e-01]
 [9.93587131e-01]
 [9.30352753e-01]
 [9.99885851e-01]
 [9.97367478e-01]
 [9.98242033e-01]
 [8.35972761e-06]
 [5.36578892e-03]
 [2.15278616e-02]
 [9.98710183e-01]
 [3.82254219e-05]
 [9.95885018e-01]
 [9.99935450e-01]
 [1.19856298e-06]
 [9.99989617e-01]
 [9.99911497e-01]
 [9.96853707e-01]
 [7.38065689e-06]
 [3.50074029e-05]
 [7.40860056e-05]
 [9.78216920e-01]
 [9.99937959e-01]
 [7.74734296