In [722]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets, model_selection, preprocessing, decomposition
import math

In [723]:
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

In [932]:
heart_df = pd.read_csv('heart.csv')

features_to_norm = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

cp = pd.get_dummies(heart_df['cp'], prefix="cp")
slope = pd.get_dummies(heart_df['slope'], prefix="slope")
thal = pd.get_dummies(heart_df['thal'], prefix="thal")
heart_df = heart_df.drop(columns=['cp', 'slope', 'thal'])

heart_df_p1 = heart_df[features_to_norm]
heart_df_p2 = heart_df.drop(columns=features_to_norm)

heart_X_p1 = heart_df_p1.to_numpy()
heart_X_p1_recip = heart_X_p1.copy()

# ss_ = preprocessing.StandardScaler()
# heart_X_p1_recip = ss_.fit_transform(heart_X_p1_recip)

m, n = heart_X_p1_recip.shape
for i in range(m):
    for j in range(n):
        value = heart_X_p1_recip[i][j]
        heart_X_p1_recip[i][j] = (1.0 / value) if value != 0 else 0.0
#         heart_X_p1_recip[i][j] = sigmoid(value)

# print(heart_X_p1[0])
# print(heart_X_p1_recip[0])
# heart_X_p1 = np.hstack((heart_X_p1, np.power(heart_X_p1, 2)))
# heart_X_p1 = np.hstack((heart_X_p1, np.power(heart_X_p1, 2), heart_X_p1_recip))
heart_X_p1 = np.hstack((heart_X_p1, heart_X_p1_recip))
# heart_X_p1 = np.hstack((heart_X_p1, np.power(heart_X_p1, 2), heart_X_p1_recip, np.power(heart_X_p1_recip, 2)))
# heart_X_p1 = np.hstack((heart_X_p1, np.power(heart_X_p1, 2), np.power(heart_X_p1, 3), heart_X_p1_recip))
# print(heart_X_p1_recip[0])

ss = preprocessing.StandardScaler()
heart_X_p1 = ss.fit_transform(heart_X_p1)

heart_df_p2 = pd.concat([cp, slope, thal, heart_df_p2], axis=1)
heart_X_p2 = heart_df_p2.to_numpy()[:, 0:-1]
heart_y = heart_df_p2.to_numpy()[:, -1]

heart_X = np.hstack((heart_X_p1, heart_X_p2))

PCA = decomposition.PCA(10)
PCA_heart_X = PCA.fit_transform(heart_X)

# print(PCA_heart_X[0])
# print(heart_X[0])


X = PCA_heart_X
y = heart_y

PCA_X = PCA_heart_X
normal_X = heart_X 

In [963]:
def perform_PCA(X_before, num):
    PCA = decomposition.PCA(num)
    return PCA.fit_transform(X_before)

In [933]:
feature_amount = X.shape[1]

def split(X, y, test_size=0.3):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size)

    train_ones = np.ones((X_train.shape[0], 1))
    X_train = np.hstack((X_train, train_ones))

    test_ones = np.ones((X_test.shape[0], 1))
    X_test = np.hstack((X_test, test_ones))

    y_train = y_train.reshape((y_train.shape[0], 1))
    y_test = y_test.reshape((y_test.shape[0], 1))
    
    return X_train, X_test, y_train, y_test

In [934]:
from sklearn.linear_model import LogisticRegression
def evaluate(times, X, y):    
    acc_train = []
    acc_test = []
    for i in range(times):
#         print('epoch', i)
        X_train, X_test, y_train, y_test = split(X, y)

        lr = LogisticRegression(C=1, penalty='l2', solver='newton-cg', multi_class='auto', max_iter=2000)
        lr.fit(X_train,y_train.ravel())
        
        acc_train.append(lr.score(X_train, y_train.ravel()))
        acc_test.append(lr.score(X_test, y_test.ravel()))
    
    return acc_train, acc_test

In [935]:
acc_train, acc_test = evaluate(100, X, y)
print('accuracy on training set:', np.mean(acc_train))
print('accuracy on test set:', np.mean(acc_test))
# print(acc_test)

accuracy on training set: 0.8705188679245282
accuracy on test set: 0.8426373626373627


In [936]:
def perform_data_augmentation(X, y):
    _X = X.copy()
    _y = y.copy() 
    
    for i in range(_X.shape[0]):
        for j in range(_X.shape[1]):
            value = _X[i][j]
            rdm = random.uniform(-0.05, 0.05)
            _X[i][j] = value * (1 + rdm)
            
    return _X, _y

In [947]:
def predict_prob(theta, X):
    z = X.dot(theta)
    return sigmoid(z)

def predict(y_pred_prob):
    y_pred = [([1.] if item >= 0.5 else [0.]) for item in y_pred_prob]
    return np.array(y_pred)

def get_cost_value(y, y_pred_prob):
    sample_amount = y.shape[0]
    value_sum = 0
    for i in range(sample_amount):
        value_sum += y[i] * np.log(y_pred_prob[i]) \
                     + (1 - y[i]) * np.log(1 - y_pred_prob[i])
    return (-1.0 / sample_amount) * value_sum[0] 

def get_correct_pred_num(y, y_pred):
    count = 0
    for i in range(y_pred.shape[0]):
        if y_pred[i][0] == y[i][0]:
            count += 1 
    return count

def get_accuracy(y, y_pred_prob):
    count = get_correct_pred_num(y, y_pred_prob)
    return count / y.shape[0]

def update_BGD(theta, X, y, y_pred_prob, alpha=0.002):
    sample_amount = X.shape[0]
    dtheta = X.T.dot(y - y_pred_prob) / sample_amount
    theta = theta + alpha * dtheta
    return theta 

def update_SGD(theta, X, y, y_pred_prob, alpha=0.002):
    for i in range(X.shape[0]):
        dtheta = X[i] * (y[i] - y_pred_prob[i])
        for j in range(feature_amount+1):
            theta[j][0] += alpha * dtheta[j]
    return theta

def update_Random(theta, X, y, y_pred_prob, alpha=0.002):
    all_samples = np.hstack((X, y, y_pred_prob))
    batch_size = math.ceil(X.shape[0] / 2)
    
    choice = np.random.choice(X.shape[0], size=batch_size, replace=False)
    selected_samples = all_samples[choice]
    
    _X = selected_samples[:, 0:-2]
    _y = selected_samples[:, -2].reshape((batch_size, 1))
    _y_pred_prob = selected_samples[:, -1].reshape((batch_size, 1)) 
    
    dtheta = _X.T.dot(_y - _y_pred_prob) / batch_size
    theta = theta + alpha * dtheta
    return theta 
    
    
def train(theta, X, y, iterations=1000, updatefunc='BGD'):
    cost_value_record = []
    accuracy_record = []
    theta_record = []
    update_func = update_BGD
    
    if updatefunc == 'SGD':
        update_func = update_SGD
    elif updatefunc == "Random":
        update_func = update_Random
    
    for i in range(iterations):
        y_pred_prob = predict_prob(theta, X)
        cost_value = get_cost_value(y, y_pred_prob)
        accuracy = get_accuracy(y, predict(y_pred_prob))
        # print(i, accuracy, cost_value)
        
        temp_alpha = 30 / (i + 500)
        if i >= 500:
            temp_alpha = 8 / i
        if i >= 1500: 
            temp_alpha = 0.005
        if i >= 2500:
            temp_alpha = 0.0001
            
            
#         last_acc = 0.0 
#         if i > 0 and i % 100 == 0:
#             if accuracy < last_acc and i > 2000:
#                 break
#             else:
#                 last_acc = accuracy_record


        theta = update_func(theta, X, y, y_pred_prob, temp_alpha)

        cost_value_record.append(cost_value)
        accuracy_record.append(accuracy)
        theta_record.append(theta.copy())
        
        if i % 1000 == 0:
            y_test_pred_prob = predict_prob(theta, X_test)
            y_test_pred = predict(y_test_pred_prob)
#             print('iteration', i, 'finished. Acc: ', accuracy, get_accuracy(y_test, y_test_pred))
            print('iteration', i, accuracy, get_accuracy(y_test, y_test_pred), cost_value)
    
    return theta, cost_value_record, accuracy_record, theta_record

In [948]:
# def try_train(times, X_train, X_test, y_train, y_test):
#     init_theta = np.random.randn(feature_amount+1, 1)
#     final_theta, cost_values, accs, thetas = train(init_theta.copy(), X_train, y_train, 5000)

#     y_test_pred_prob = predict_prob(final_theta, X_test)
#     y_test_pred = predict(y_test_pred_prob)
    
#     return final_theta, cost_values, accs, thetas, get_accuracy(y_test, y_test_pred)

In [965]:
def try_train(PCA_num=0, use_aug=False, updatefunc="BGD", threshold=0.87):
    X_ = normal_X
    if PCA_num != 0:
        X_ = perform_PCA(X_, PCA_num)

    X_train, X_test, y_train, y_test = split(PCA_X, y)
    lrscore = 0
    while(True):
        lr = LogisticRegression(penalty='l2', solver='newton-cg', multi_class='auto', max_iter=2000)
        lr.fit(X_train,y_train.ravel())
        if lr.score(X_test, y_test.ravel()) < threshold:
            X_train, X_test, y_train, y_test = split(X, y)
        else:
#             print(lr.score(X_test, y_test.ravel()))
            lrscore = lr.score(X_test, y_test.ravel())
            break
    if use_aug:    
        _X_train, _y_train = perform_data_augmentation(X_train, y_train)
        X_train = np.vstack((X_train, _X_train))
        y_train = np.vstack((y_train, _y_train))

    init_theta = np.random.randn(feature_amount+1, 1)
    final_theta, cost_values, accs, _ = train(init_theta.copy(), X_train, y_train, iterations=5000, updatefunc=updatefunc)
    y_test_pred_prob = predict_prob(final_theta, X_test)
    y_test_pred = predict(y_test_pred_prob)
#     print(get_accuracy(y_test, y_test_pred))
    
    return cost_values, accs, lrscore, get_accuracy(y_test, y_test_pred)

In [978]:
# PCA AUG UPDATEFUNC 
# no  no  BGD 1 
# no  no  SGD 2
# no  no  Random 3
# 5   no  BGD 4
# 8   no  BGD 5 
# 10  no  BGD 6 
# 12  no  BGD 7 
# 15  no  BGD 8  
# no  yes BGD 9     
# no  yes SGD 10 
# no  yes Random 11 
# 10  yes BGD 12
# 10  yes SGD 13  
# 10  yes Random 14

times_each_config = 5
configs = [
    [0, False, 'BGD'], # 1
    [0, False, 'SGD'],
    [0, False, 'Random'], # 3
    [5, False, 'BGD'], 
    [8, False, 'BGD'], # 5
    [10, False, 'BGD'], 
    [12, False, 'BGD'], # 7
    [15, False, 'BGD'], 
    [0, True, 'BGD'], # 9
    [0, True, 'SGD'], 
    [0, True, 'Random'], # 11
    [10, True, 'BGD'], 
    [10, True, 'SGD'], # 13
    [10, True, 'Random']
]

record = []
for config in configs:
    temp = []
    for i in range(times_each_config):
        print(config, i)
        cost_values, accs, acc_sk, acc_mine = try_train(
            PCA_num=config[0], 
            use_aug=config[1], 
            updatefunc=config[2],
            threshold=0.87 if i <= 2 else 0.9
        )
        temp.append([cost_values, accs, acc_sk, acc_mine])
        print('finished')
    record.append(temp)




[0, False, 'BGD'] 0
iteration 0 0.42452830188679247 0.46153846153846156 1.3472252840136767
iteration 1000 0.839622641509434 0.8571428571428571 0.35251441138401535
iteration 2000 0.8443396226415094 0.8571428571428571 0.3475011047252015
iteration 3000 0.8443396226415094 0.8571428571428571 0.3460000068460769
iteration 4000 0.8443396226415094 0.8571428571428571 0.34594776567586505
finished
[0, False, 'BGD'] 1
iteration 0 0.3018867924528302 0.31868131868131866 2.3723365422946796
iteration 1000 0.8584905660377359 0.8461538461538461 0.33772296167291765
iteration 2000 0.8584905660377359 0.8461538461538461 0.33405618591322733
iteration 3000 0.8584905660377359 0.8461538461538461 0.33303084137620337
iteration 4000 0.8584905660377359 0.8461538461538461 0.3329957989380093
finished
[0, False, 'BGD'] 2
iteration 0 0.49056603773584906 0.5164835164835165 1.831947136158607
iteration 1000 0.8443396226415094 0.8461538461538461 0.33329737556623823
iteration 2000 0.8443396226415094 0.8461538461538461 0.3311

  
  


iteration 1000 0.8537735849056604 0.8461538461538461 0.3237678384808283
iteration 2000 0.8537735849056604 0.8461538461538461 0.32376783848082835
iteration 3000 0.8537735849056604 0.8461538461538461 0.32376783848082813
iteration 4000 0.8537735849056604 0.8461538461538461 0.32376783848082824
finished
[0, False, 'SGD'] 4
iteration 0 0.39622641509433965 0.7362637362637363 2.0440510309888866


  
  


iteration 1000 0.8537735849056604 0.8571428571428571 0.3215923903573721
iteration 2000 0.8537735849056604 0.8571428571428571 0.32159239035737197
iteration 3000 0.8537735849056604 0.8571428571428571 0.321592390357372
iteration 4000 0.8537735849056604 0.8571428571428571 0.3215923903573721
finished
[0, False, 'Random'] 0
iteration 0 0.4528301886792453 0.3516483516483517 1.7191963484799555
iteration 1000 0.8537735849056604 0.8571428571428571 0.33144697495904013
iteration 2000 0.8537735849056604 0.8461538461538461 0.32821521121054287
iteration 3000 0.8537735849056604 0.8461538461538461 0.32729195660579236
iteration 4000 0.8537735849056604 0.8461538461538461 0.3272602636903211
finished
[0, False, 'Random'] 1
iteration 0 0.44339622641509435 0.4175824175824176 2.007428452141645
iteration 1000 0.8537735849056604 0.8351648351648352 0.3454976015688188
iteration 2000 0.8537735849056604 0.8351648351648352 0.3339203849951957
iteration 3000 0.8537735849056604 0.8461538461538461 0.3302582321003601
ite

iteration 2000 0.8490566037735849 0.8571428571428571 0.3393179383024309
iteration 3000 0.8443396226415094 0.8571428571428571 0.3390093728131529
iteration 4000 0.8443396226415094 0.8571428571428571 0.33899855547595614
finished
[12, False, 'BGD'] 1
iteration 0 0.38207547169811323 0.3956043956043956 1.8771338320136515
iteration 1000 0.8632075471698113 0.8681318681318682 0.3296799841684371
iteration 2000 0.8679245283018868 0.8791208791208791 0.3221839795032902
iteration 3000 0.8679245283018868 0.8681318681318682 0.31997895886059396
iteration 4000 0.8679245283018868 0.8681318681318682 0.31990250953211213
finished
[12, False, 'BGD'] 2
iteration 0 0.49056603773584906 0.5384615384615384 1.7789507350639162
iteration 1000 0.8443396226415094 0.8461538461538461 0.3466335452502537
iteration 2000 0.8443396226415094 0.8461538461538461 0.3401458798715321
iteration 3000 0.8443396226415094 0.8461538461538461 0.33808863712196735
iteration 4000 0.8443396226415094 0.8461538461538461 0.33801590135035986
fin

  
  


iteration 1000 0.8372641509433962 0.8571428571428571 0.36572425077283033
iteration 2000 0.8372641509433962 0.8571428571428571 0.36572425077283005
iteration 3000 0.8372641509433962 0.8571428571428571 0.36572425077283005
iteration 4000 0.8372641509433962 0.8571428571428571 0.36572425077283033
finished
[0, True, 'SGD'] 1
iteration 0 0.47877358490566035 0.7692307692307693 1.6485730197108295


  
  


iteration 1000 0.8584905660377359 0.8681318681318682 0.34541607456755613
iteration 2000 0.8584905660377359 0.8681318681318682 0.34541607456755624
iteration 3000 0.8584905660377359 0.8681318681318682 0.3454160745675562
iteration 4000 0.8584905660377359 0.8681318681318682 0.3454160745675563
finished
[0, True, 'SGD'] 2
iteration 0 0.7264150943396226 0.4835164835164835 0.7351360265070106


  
  


iteration 1000 0.8514150943396226 0.8571428571428571 0.33467679622548413
iteration 2000 0.8514150943396226 0.8571428571428571 0.33467679622548435
iteration 3000 0.8514150943396226 0.8571428571428571 0.3346767962254847
iteration 4000 0.8514150943396226 0.8571428571428571 0.33467679622548435
finished
[0, True, 'SGD'] 3
iteration 0 0.6650943396226415 0.7472527472527473 1.20924929140611


  
  


iteration 1000 0.8466981132075472 0.8461538461538461 0.315704736581769
iteration 2000 0.8466981132075472 0.8461538461538461 0.31570473658176895
iteration 3000 0.8466981132075472 0.8461538461538461 0.31570473658176895
iteration 4000 0.8466981132075472 0.8461538461538461 0.31570473658176884
finished
[0, True, 'SGD'] 4
iteration 0 0.5731132075471698 0.7472527472527473 1.8492050623869205


  
  


iteration 1000 0.8466981132075472 0.8351648351648352 0.3469044108800236
iteration 2000 0.8466981132075472 0.8351648351648352 0.3469044108800239
iteration 3000 0.8466981132075472 0.8351648351648352 0.3469044108800238
iteration 4000 0.8466981132075472 0.8351648351648352 0.34690441088002394
finished
[0, True, 'Random'] 0
iteration 0 0.5306603773584906 0.6043956043956044 1.5136776811356438
iteration 1000 0.8466981132075472 0.8571428571428571 0.3399831918042859
iteration 2000 0.8537735849056604 0.8681318681318682 0.33447182132384556
iteration 3000 0.8561320754716981 0.8681318681318682 0.33288809429325544
iteration 4000 0.8561320754716981 0.8681318681318682 0.33283321323317927
finished
[0, True, 'Random'] 1
iteration 0 0.6556603773584906 0.6043956043956044 0.8693940085705906
iteration 1000 0.8514150943396226 0.8461538461538461 0.3513214259587356
iteration 2000 0.8608490566037735 0.8571428571428571 0.3469586490688126
iteration 3000 0.8679245283018868 0.8681318681318682 0.3456003230856932
iter

  
  


iteration 1000 0.8679245283018868 0.8791208791208791 0.3234750609104343
iteration 2000 0.8679245283018868 0.8791208791208791 0.3234750609104344
iteration 3000 0.8679245283018868 0.8791208791208791 0.32347506091043415
iteration 4000 0.8679245283018868 0.8791208791208791 0.3234750609104342
finished
[10, True, 'SGD'] 1
iteration 0 0.3231132075471698 0.7472527472527473 2.497066870942503


  
  


iteration 1000 0.8655660377358491 0.8681318681318682 0.33002211309029805
iteration 2000 0.8655660377358491 0.8681318681318682 0.3300221130902979
iteration 3000 0.8655660377358491 0.8681318681318682 0.33002211309029744
iteration 4000 0.8655660377358491 0.8681318681318682 0.3300221130902981
finished
[10, True, 'SGD'] 2
iteration 0 0.47641509433962265 0.8241758241758241 2.145250542746498


  
  


iteration 1000 0.8419811320754716 0.8571428571428571 0.33253124934705103
iteration 2000 0.8419811320754716 0.8571428571428571 0.33253124934705075
iteration 3000 0.8419811320754716 0.8571428571428571 0.3325312493470512
iteration 4000 0.8419811320754716 0.8571428571428571 0.33253124934705114
finished
[10, True, 'SGD'] 3
iteration 0 0.31839622641509435 0.7692307692307693 3.2498163062793615


  
  


iteration 1000 0.8419811320754716 0.8571428571428571 0.35186114118964823
iteration 2000 0.8419811320754716 0.8571428571428571 0.3518611411896483
iteration 3000 0.8419811320754716 0.8571428571428571 0.3518611411896487
iteration 4000 0.8419811320754716 0.8571428571428571 0.3518611411896483
finished
[10, True, 'SGD'] 4
iteration 0 0.4363207547169811 0.7142857142857143 1.9762954799432104


  
  


iteration 1000 0.8514150943396226 0.8571428571428571 0.32316024143089
iteration 2000 0.8514150943396226 0.8571428571428571 0.32316024143089
iteration 3000 0.8514150943396226 0.8571428571428571 0.32316024143089006
iteration 4000 0.8514150943396226 0.8571428571428571 0.32316024143088995
finished
[10, True, 'Random'] 0
iteration 0 0.5070754716981132 0.5054945054945055 1.323522792911949
iteration 1000 0.8490566037735849 0.8571428571428571 0.36170912092634616
iteration 2000 0.8514150943396226 0.8681318681318682 0.34899044728132916
iteration 3000 0.8419811320754716 0.8571428571428571 0.3455083554053961
iteration 4000 0.8419811320754716 0.8571428571428571 0.34538817341129596
finished
[10, True, 'Random'] 1
iteration 0 0.6415094339622641 0.6153846153846154 1.1210854801391346
iteration 1000 0.8419811320754716 0.8241758241758241 0.36359048411035966
iteration 2000 0.8490566037735849 0.8241758241758241 0.3489470978963044
iteration 3000 0.8443396226415094 0.8241758241758241 0.3448377147322859
itera

In [987]:
anorec = record.copy()
anorec_np = np.array(anorec)
print(anorec_np.shape)
np.save('recordLR.npy', anorec_np)

(14, 5, 4)


In [991]:

for i in range(14):
    print(configs[i])
    for j in range(5):
        attemp = anorec_np[i][j]
        print(attemp[2], attemp[3], '!' if attemp[3] > attemp[2] else '')

[0, False, 'BGD']
0.8901098901098901 0.9010989010989011 !
0.8901098901098901 0.8791208791208791 
0.8901098901098901 0.8901098901098901 
0.9120879120879121 0.9010989010989011 
0.945054945054945 0.9340659340659341 
[0, False, 'SGD']
0.9010989010989011 0.8901098901098901 
0.8901098901098901 0.8791208791208791 
0.8901098901098901 0.8901098901098901 
0.9010989010989011 0.8901098901098901 
0.9010989010989011 0.9010989010989011 
[0, False, 'Random']
0.9010989010989011 0.8901098901098901 
0.8791208791208791 0.8901098901098901 !
0.8791208791208791 0.8571428571428571 
0.9010989010989011 0.8791208791208791 
0.9010989010989011 0.9010989010989011 
[5, False, 'BGD']
0.8901098901098901 0.8791208791208791 
0.9010989010989011 0.9010989010989011 
0.9010989010989011 0.8901098901098901 
0.9010989010989011 0.8681318681318682 
0.9340659340659341 0.9340659340659341 
[8, False, 'BGD']
0.8791208791208791 0.8791208791208791 
0.9120879120879121 0.9010989010989011 
0.9230769230769231 0.9010989010989011 
0.9230769