In [58]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets, model_selection, preprocessing

In [59]:
# Registry of the datasets loaded below: name -> [feature matrix, target vector].
X_y_dict = {}

In [60]:
# Breast-cancer dataset that ships with scikit-learn.
cancer_dataset = datasets.load_breast_cancer()

# To inspect it as a table, wrap it in
# pd.DataFrame(data=cancer_dataset.data, columns=cancer_dataset.feature_names)
# and map cancer_dataset.target through cancer_dataset.target_names for a label column.

# Feature matrix and target vector exactly as provided by the loader.
cancer_X, cancer_y = cancer_dataset.data, cancer_dataset.target

# Register the pair so later cells can pick the dataset by name.
X_y_dict['cancer'] = [cancer_X, cancer_y]

In [61]:
# Iris dataset that ships with scikit-learn.
iris_dataset = datasets.load_iris()

# To inspect it as a table, wrap it in
# pd.DataFrame(data=iris_dataset.data, columns=iris_dataset.feature_names)
# and map iris_dataset.target through iris_dataset.target_names for a label column.

# Feature matrix and target vector exactly as provided by the loader.
iris_X, iris_y = iris_dataset.data, iris_dataset.target

# Register the pair so later cells can pick the dataset by name.
X_y_dict['iris'] = [iris_X, iris_y]

In [62]:
# Numeric matrix read from a local file.
# NOTE(review): no delimiter is passed, so np.loadtxt splits on whitespace --
# confirm 'testdata.csv' is not actually comma-separated.
testdata = np.loadtxt('testdata.csv')

# Every column except the last is a feature; the last column is the target.
testdata_X, testdata_y = testdata[:, :-1], testdata[:, -1]

# Register the pair so later cells can pick the dataset by name.
X_y_dict['testdata'] = [testdata_X, testdata_y]

In [63]:
# Heart dataset read from a local CSV file.
heart_df = pd.read_csv('heart.csv')

# Slice the frame that was just loaded. Referring to a bare `df` only works if
# such a variable happens to be left over in the kernel, and raises NameError
# after "Restart & Run All". The conversion to NumPy is done once and reused.
heart_values = heart_df.to_numpy()

# Every column except the last is a feature; the last column is the target.
heart_X = heart_values[:, :-1]
heart_y = heart_values[:, -1]

# Register the pair so later cells can pick the dataset by name.
X_y_dict['heart'] = [heart_X, heart_y]

In [64]:
# Pick the dataset to train on from the registry.
[X, y] = X_y_dict['heart']

feature_amount = X.shape[1]

# Fixed seed: without it every run draws a different train/test partition, so
# the accuracies printed further down cannot be reproduced.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Append a constant column of ones so the last entry of theta acts as the bias.
train_ones = np.ones((X_train.shape[0], 1))
X_train = np.hstack((X_train, train_ones))

test_ones = np.ones((X_test.shape[0], 1))
X_test = np.hstack((X_test, test_ones))

# Targets become column vectors so they line up with predict()'s (n, 1) output.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [65]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    The textbook form computes ``exp(-z)``, which overflows (RuntimeWarning)
    for large negative ``z``. Here the value is obtained as
    ``exp(-log(1 + exp(-z)))`` with ``np.logaddexp`` doing the inner log-sum,
    so no intermediate grows beyond the float range.

    Args:
        z: scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z))
    return np.exp(-np.logaddexp(0.0, -z))

def predict(theta, X):
    """Return the model's predicted probabilities, ``sigmoid(X . theta)``.

    Args:
        theta: parameter vector.
        X: design matrix whose rows are samples.
    """
    linear_score = np.dot(X, theta)
    return sigmoid(linear_score)

def get_cost_value(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y: 0/1 labels, either 1-D or a column vector.
        y_pred: predicted probabilities with the same number of entries as ``y``.
        eps: probabilities are clipped to ``[eps, 1 - eps]`` before taking the
            logarithm, so a saturated prediction (exactly 0 or 1) gives a large
            finite cost instead of ``inf`` or ``nan``.

    Returns:
        The average cost as a Python float.

    Raises:
        ValueError: if the inputs are empty or have different lengths.
    """
    labels = np.asarray(y, dtype=float).reshape(-1)
    probs = np.asarray(y_pred, dtype=float).reshape(-1)
    if labels.size == 0 or labels.size != probs.size:
        raise ValueError("y and y_pred must be non-empty and of equal length")

    # Keep log() away from 0: 0 * log(0) would otherwise turn the whole sum into nan.
    probs = np.clip(probs, eps, 1.0 - eps)
    log_likelihood = labels * np.log(probs) + (1.0 - labels) * np.log(1.0 - probs)
    return float(-np.mean(log_likelihood))

def update(theta, X, y, y_pred, alpha=0.05):
    """Take one step of size ``alpha`` along ``X.T . (y - y_pred) / n_samples``.

    Args:
        theta: current parameter vector.
        X: design matrix whose rows are samples.
        y: targets.
        y_pred: predictions made with ``theta`` for the rows of ``X``.
        alpha: step size.

    Returns:
        The updated parameter vector (``theta`` itself is not modified).
    """
    residual = y - y_pred
    step = np.dot(X.T, residual) / X.shape[0]
    return theta + alpha * step

def get_accuracy(y, y_pred):
    """Fraction of samples whose thresholded prediction equals the label.

    A prediction counts as class 1 when its probability is >= 0.5, else class 0.
    The comparison is vectorised, and the two inputs must have the same number
    of samples, so a length mismatch is reported instead of producing a
    silently wrong ratio.

    Args:
        y: 0/1 labels, either 1-D or a column vector.
        y_pred: predicted probabilities, either 1-D or 2-D; for 2-D input the
            first column is used.

    Returns:
        Accuracy in [0, 1] as a Python float.

    Raises:
        ValueError: if the inputs are empty or have different lengths.
    """
    scores = np.asarray(y_pred)
    if scores.ndim > 1:
        scores = scores[:, 0]
    truth = np.asarray(y).reshape(-1)
    if truth.size == 0 or truth.size != scores.size:
        raise ValueError("y and y_pred must be non-empty and of equal length")

    predicted = (scores >= 0.5).astype(int)
    return float(np.mean(predicted == truth))
    
def train(theta, X, y, iterations=1000, alpha=0.05):
    """Fit the parameters by repeating predict -> record metrics -> update.

    Args:
        theta: initial parameter vector.
        X: training design matrix.
        y: training targets.
        iterations: number of update steps to perform.
        alpha: step size forwarded to ``update``; the default matches
            ``update``'s own default, so existing calls behave as before.

    Returns:
        A tuple ``(theta, cost_value_record, accuracy_record)``. The two lists
        hold one entry per iteration, measured with the parameters as they were
        *before* that iteration's update.
    """
    cost_value_record = []
    accuracy_record = []

    for _ in range(iterations):
        y_pred = predict(theta, X)

        cost_value_record.append(get_cost_value(y, y_pred))
        accuracy_record.append(get_accuracy(y, y_pred))

        theta = update(theta, X, y, y_pred, alpha=alpha)

    return theta, cost_value_record, accuracy_record

In [66]:
# Draw the initial weights from a dedicated, seeded generator so the starting
# point is the same on every run and NumPy's global random state is untouched.
INIT_SEED = 42
init_theta = np.random.RandomState(INIT_SEED).randn(feature_amount + 1, 1)

theta, cost_values, accs = train(init_theta, X_train, y_train, 1000)

# Evaluate on the held-out split: threshold the probabilities at 0.5 and count
# how many predicted labels equal the true ones.
y_pred = predict(theta, X_test)
pred_labels = (y_pred[:, 0] >= 0.5).astype(int)
count = int(np.sum(pred_labels == y_test[:, 0]))
print(count, y_pred.shape[0], count/y_pred.shape[0])

70 91 0.7692307692307693


In [67]:
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn's logistic regression fitted on the same
# training split, for comparison with the hand-written gradient loop.
lr = LogisticRegression(
    penalty='l2',
    solver='saga',
    multi_class='auto',
)
# fit() expects a 1-D target, hence ravel() on the column vector.
lr.fit(X_train, y_train.ravel())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [68]:
from sklearn import metrics

# Accuracy of the scikit-learn model on the training split, then the test split.
train_score = lr.score(X_train, y_train.ravel())
print("Logistic Regression模型训练集的准确率：%.3f" % train_score)
test_score = lr.score(X_test, y_test.ravel())
print("Logistic Regression模型测试集的准确率：%.3f" % test_score)

# Same test-set figure computed from explicit predictions: the fraction of
# correct labels (accuracy, not error rate), i.e. np.average(y_test == y_hat).
y_hat = lr.predict(X_test)
accuracy = metrics.accuracy_score(y_test.ravel(), y_hat)
print("Logistic Regression模型正确率：%.3f" % accuracy)

Logistic Regression模型训练集的准确率：0.877
Logistic Regression模型测试集的准确率：0.780
Logistic Regression模型正确率：0.780
