In [189]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets, model_selection, preprocessing

In [190]:
# Load scikit-learn's bundled breast-cancer dataset.
# (datasets.load_iris() is an alternative source with the same Bunch layout.)
dataset = datasets.load_breast_cancer()

# Wrap the raw feature matrix in a DataFrame, one named column per feature.
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)

# Preview the first few rows.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [191]:
# Attach a human-readable class name to every row: each integer target is
# looked up in the dataset's array of class names in one indexing operation.
data['type'] = list(dataset.target_names[dataset.target])
data.head()  # "type" now appears as the last column

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [192]:
# Feature matrix and integer class labels taken straight from the loaded dataset.
X = dataset.data
y = dataset.target

# Alternative input: a whitespace-delimited file whose last column is the label.
# data = np.loadtxt('data.csv')
# X = data[:, 0:-1]
# y = data[:, -1]

feature_amount = X.shape[1]

# Fixed seed so the 70/30 split (and every metric derived from it) is the
# same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

# Standardise using statistics fitted on the training split only; the test
# split is transformed with those same statistics.
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Append a constant-1 column so the last weight of theta acts as the bias.
train_ones = np.ones((X_train.shape[0], 1))
X_train = np.hstack((X_train, train_ones))

test_ones = np.ones((X_test.shape[0], 1))
X_test = np.hstack((X_test, test_ones))

print(X_train.shape)
print(X_test.shape)

# Turn the training labels into an (n, 1) column so they line up with the
# (n, 1) predictions produced from a column-vector theta.
y_train = y_train.reshape(-1, 1)

(398, 31)
(171, 31)


In [209]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Element-wise sigmoid of ``z``, with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow the way
    # exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + e^-z).  For z < 0 the algebraically equal form
    # e^z / (1 + e^z) is used, where e^z == exp(-|z|).
    s = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Scalar input yields a NumPy scalar rather than a 0-d array.
    return s[()] if s.ndim == 0 else s

def predict(theta, X):
    """Return the sigmoid of the linear scores ``X.dot(theta)``.

    ``theta`` holds the weights and ``X`` the sample matrix; the result has
    whatever shape the dot product produces.
    """
    return sigmoid(X.dot(theta))

def get_cost_value(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like, shape (n,) or (n, 1)
        True labels (0 or 1).
    y_pred : array-like, shape (n,) or (n, 1)
        Predicted probabilities for the positive class.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so a
        saturated prediction of exactly 0 or 1 cannot yield ``log(0)``.

    Returns
    -------
    numpy.float64
        ``-mean(y * log(p) + (1 - y) * log(1 - p))``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of values.
    """
    labels = np.asarray(y, dtype=float).ravel()
    probs = np.asarray(y_pred, dtype=float).ravel()
    if labels.size == 0:
        raise ValueError("get_cost_value() needs at least one sample")
    if labels.shape != probs.shape:
        raise ValueError(
            "y and y_pred must hold the same number of values, got "
            f"{labels.size} and {probs.size}"
        )

    # Without clipping, 0 * log(0) evaluates to NaN and poisons the cost.
    probs = np.clip(probs, eps, 1 - eps)
    log_likelihood = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    return -np.mean(log_likelihood)

def update(theta, X, y, y_pred, alpha=0.05):
    """Take one gradient step on the logistic-regression weights.

    Computes ``dtheta = X.T.dot(y - y_pred) / n`` and returns
    ``theta + alpha * dtheta``, i.e. one step that decreases the mean
    cross-entropy (equivalently, increases the mean log-likelihood).

    Parameters
    ----------
    theta : ndarray
        Current weights.
    X : ndarray, shape (n, d)
        Sample matrix used to produce ``y_pred``.
    y : array-like
        True labels; any shape holding the same number of values as
        ``y_pred`` (e.g. ``(n,)`` or ``(n, 1)``).
    y_pred : array-like
        Current predictions for ``X``.
    alpha : float, optional
        Learning rate.

    Returns
    -------
    ndarray
        Updated weights (a new array; ``theta`` is not modified in place).

    Raises
    ------
    ValueError
        If ``y`` cannot be reshaped to the shape of ``y_pred``.
    """
    sample_amount = X.shape[0]
    y_pred = np.asarray(y_pred)
    # Align the labels with the predictions first: subtracting an (n, 1)
    # array from an (n,) array would broadcast to (n, n) and silently
    # produce a wrongly shaped gradient.
    residual = np.asarray(y).reshape(y_pred.shape) - y_pred
    dtheta = X.T.dot(residual) / sample_amount
    return theta + alpha * dtheta

def get_accuracy(y, y_pred):
    """Fraction of samples whose thresholded prediction equals the label.

    A prediction counts as class 1 when its probability is ``>= 0.5`` and as
    class 0 otherwise.

    Parameters
    ----------
    y : array-like, shape (n,) or (n, 1)
        True labels.
    y_pred : array-like, shape (n,) or (n, 1)
        Predicted probabilities. For 2-D input only the first column is used.

    Returns
    -------
    float
        Number of correct predictions divided by the number of labels.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` hold a different number of samples.
    ZeroDivisionError
        If there are no samples.
    """
    labels = np.asarray(y)
    scores = np.asarray(y_pred)
    # Accept both flat vectors and column vectors by reducing 2-D input to
    # its first column.
    if labels.ndim > 1:
        labels = labels[:, 0]
    if scores.ndim > 1:
        scores = scores[:, 0]
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            "y and y_pred must hold the same number of samples, got "
            f"{labels.shape[0]} and {scores.shape[0]}"
        )

    predicted = (scores >= 0.5).astype(int)
    correct = int(np.count_nonzero(predicted == labels))
    return correct / labels.shape[0]
    
def train(theta, X, y, iterations=1000, alpha=0.05, log_every=1):
    """Fit logistic-regression weights with batch gradient steps.

    Each iteration predicts with the current weights, records the cost and
    accuracy of those predictions, and then applies one ``update`` step.
    The recorded metrics therefore describe the weights *before* that
    iteration's update.

    Parameters
    ----------
    theta : ndarray
        Initial weights.
    X : ndarray
        Training samples.
    y : ndarray
        Training labels.
    iterations : int, optional
        Number of gradient steps to run.
    alpha : float, optional
        Learning rate forwarded to ``update``.
    log_every : int, optional
        Print the cost every ``log_every`` iterations. The default of 1
        prints every iteration; 0 or ``None`` disables printing.

    Returns
    -------
    tuple
        ``(theta, cost_value_record, accuracy_record)``: the final weights
        and the per-iteration cost and accuracy lists.
    """
    cost_value_record = []
    accuracy_record = []

    for i in range(iterations):
        y_pred = predict(theta, X)
        cost_value = get_cost_value(y, y_pred)
        accuracy = get_accuracy(y, y_pred)

        cost_value_record.append(cost_value)
        accuracy_record.append(accuracy)

        if log_every and i % log_every == 0:
            print(i, ': ', cost_value)

        theta = update(theta, X, y, y_pred, alpha)

    return theta, cost_value_record, accuracy_record

In [None]:
# Draw the starting weights (one per feature plus one for the bias column)
# from a seeded generator so every rerun starts from the same point.
INIT_SEED = 0
rng = np.random.default_rng(INIT_SEED)
init_theta = rng.standard_normal((feature_amount + 1, 1))

theta, cost_values, accs = train(init_theta, X_train, y_train, 1000)

# Predicted probabilities for the held-out samples.
y_pred = predict(theta, X_test)
print(y_pred)

# Threshold at 0.5 and count how many test samples are classified correctly.
pred_labels = (y_pred[:, 0] >= 0.5).astype(int)
count = int(np.sum(pred_labels == y_test))
print(count, y_pred.shape[0])

0 :  0.7019979365092845
1 :  0.6765599433271566
2 :  0.6527858932677358
3 :  0.6305200173094683
4 :  0.6096278239390189
5 :  0.5899928785196408
6 :  0.5715144706841485
7 :  0.5541060053600659
8 :  0.5376939957780652
9 :  0.522217467704971
10 :  0.5076272544753361
11 :  0.49388454547243565
12 :  0.4809578346752581
13 :  0.46881794369269597
14 :  0.4574319742103871
15 :  0.446758456194257
16 :  0.4367460514980566
17 :  0.42733633017526423
18 :  0.4184688461765689
19 :  0.4100860076178247
20 :  0.4021362127649561
21 :  0.3945750631113217
22 :  0.38736521109660793
23 :  0.3804755046894296
24 :  0.37387990714956126
25 :  0.3675564536291844
26 :  0.3614863544331818
27 :  0.3556532709601696
28 :  0.3500427528587166
29 :  0.34464181166369
30 :  0.33943860476138626
31 :  0.3344222059629688
32 :  0.32958244269025444
33 :  0.3249097831400089
34 :  0.3203952599194609
35 :  0.3160304192043068
36 :  0.3118072868344426
37 :  0.30771834478700594
38 :  0.30375651328407866
39 :  0.2999151352801863
40 : 