In [336]:
#Predict whether a patient has heart disease or not!

#Load data from excel spreadsheet
import csv
spreadsheet = open('heart_disease_dataset.csv', encoding = 'utf-8')
csv_data = csv.reader(spreadsheet)
spreadsheet_list = list(csv_data)
cv_set = spreadsheet_list[200:250]
test_set = spreadsheet_list[250:]
spreadsheet_list = spreadsheet_list[:200]
spreadsheet.close()

In [337]:
#Create the sigmoid function
import math
import numpy as np
def sigmoid(z):
    return (1/(1+np.exp(-z)))

In [338]:
#Add x0 = 1 to the dataset
import numpy as np

#remove the header titles
spreadsheet_list = spreadsheet_list[1:]

#Add x0 to array
row_num = len(spreadsheet_list)
x0 = np.ones((row_num, 1))
spreadsheet_list = list(np.append(x0, np.array(spreadsheet_list), axis=1))

In [339]:
#Convert str to float for each spreadsheet number
new_spr_list = []
for row in spreadsheet_list:
    add_list = []
    for col in row:
        add_list.append(float(col))
    new_spr_list.append(add_list)

In [340]:
#Remove y from the spreadsheet data
final_spr_list = []
for row in new_spr_list:
    final_spr_list.append(row[:-1])

In [341]:
#Mean normalisation and feature scaling
from statistics import *

#Useful variables
rows_num = len(final_spr_list)
col_num = len(final_spr_list[0])

#Remove x0 first
final_spr_list2 = []
for row in new_spr_list:
    final_spr_list2.append(row[1:])
    
#calculating mean
def cal_mean(spreadsheet_data, col, rows):
    'Returns row vector of mean for each column, size 1xm, takes in spreadsheet data, col, row'
    initial_sum = np.array([0.0]*col)
    for row in spreadsheet_data:
        initial_sum += np.array(row)
    col_mean = initial_sum/rows
    return col_mean

col_mean = cal_mean(final_spr_list2, col_num, rows_num)

#calculating stdev
def stdevlist(spreadsheet_data, col, rows):
    'Returns 1xm row vector of the stdev for each column, takes in spreadsheet data, col, row'
    spreadsheet_data = np.array(spreadsheet_data)
    stdev_list = np.array([0.0]*col)
    for i in range(col):
        stdev_list[i] = stdev(spreadsheet_data[:,i])
    return stdev_list
        
stdev_list = stdevlist(final_spr_list2, col_num, rows_num)
 
#feature scaling
def feature_scaling(stdeviation, mean, data):
    'Takes in mean and stdeviation row vector 1xm and data, returns feature scaled numpy matrix'
    feature_scaled = np.zeros((rows_num, col_num))
    for row in range(rows_num):
        for col in range(col_num):
            feature_scaled[row, col] = (data[row, col] - mean[col]) / stdev_list[col]
    return np.array(feature_scaled)
            
#fs_list is feature scaled and mean normalised numpy array of data
fs_list = feature_scaling(stdev_list, col_mean, np.array(final_spr_list2))

#add back x0
x0 = np.ones((row_num, 1))
fs_list = np.append(x0, fs_list, axis=1)
print(fs_list)

[[ 1.          0.93704365  0.6397503  ... -0.71110168  0.5921623
  -0.90638782]
 [ 1.          1.39047024  0.6397503  ...  2.36518603 -0.93823118
   1.09773636]
 [ 1.          1.39047024  0.6397503  ...  1.33975679  1.10229347
   1.09773636]
 ...
 [ 1.          0.48361706  0.6397503  ... -0.71110168 -0.93823118
   1.09773636]
 [ 1.         -0.53659278 -1.55525504 ... -0.71110168 -0.93823118
  -0.90638782]
 [ 1.          1.0504003  -1.55525504 ... -0.71110168 -0.93823118
  -0.90638782]]


In [342]:
# initialise weight vectors
rows_num = len(fs_list)
col_num = len(fs_list[0])

theta = np.zeros((col_num,1))

In [343]:
# extracting y from the data (as numpy array)
new_spr_list = np.array(new_spr_list)
y_vector = new_spr_list[:,-1]
y_vector = np.reshape(y_vector, (rows_num, 1))

In [344]:
# Visualising data

import matplotlib.pyplot as plt
from ipywidgets import interact

#Create separate numpy arrays for y=0 data and y=1 data
def separate_data(numpy_data, y, col_num):
    'returns 2 vectors for y = 0 data and y = 1 data, accepts numpy array'
    y0_data = np.empty((0, col_num))
    y1_data = np.empty((0, col_num))
    for i in range(rows_num):
        if int(y[i]) == 1:
            y1_data = np.append(y1_data, np.array([numpy_data[i,:]]), axis = 0)
        elif int(y[i]) == 0:
            y0_data = np.append(y0_data, np.array([numpy_data[i,:]]), axis = 0)
        else:
            pass
    return y0_data, y1_data
        
y0_data, y1_data = separate_data(fs_list, y_vector, col_num)

def visualise_data(feature1 = 1, feature2 = 5):
    'plots feature1 against feature2 as scatterplot, takes in numpy arrays'
    feature1_plots0 = y0_data[:,int(feature1)]
    feature2_plots0 = y0_data[:,int(feature2)]
    
    feature1_plots1 = y1_data[:,int(feature1)]
    feature2_plots1 = y1_data[:,int(feature2)]
    
    fig = plt.figure()
    ax = fig.add_subplot()
    ax.scatter(feature1_plots0, feature2_plots0, color='r')
    ax.scatter(feature1_plots1, feature2_plots1, color='b')
    ax.set_xlabel(f'Feature {feature1}')
    ax.set_ylabel(f'Feature {feature2}')
    ax.set_title(f'Feature {feature1} against Feature {feature2}')
    plt.xlim(-5, 5)
    plt.ylim(-5, 5)
    plt.show()

#visualise_data(y0_data, y1_data, 1, 5)
interact(visualise_data, feature1 =(1, 13, 1), feature2=(1, 13, 1));

interactive(children=(IntSlider(value=1, description='feature1', max=13, min=1), IntSlider(value=5, descriptio…

In [345]:
# cost function

def init_weights():
    rows_num = len(fs_list)
    col_num = len(fs_list[0])
    theta = np.zeros((col_num,1))
    return theta

def cost_function(data, theta, y, lam = 0.03):
    'calculates the cost function for a given theta weight matrix'
    hypothesis = sigmoid(data @ theta)
    m = rows_num
    sumofweights = np.transpose(theta) @ theta
    sumofweights = sumofweights[0]
    cost = (1/m) * (-np.transpose(y) @ np.log(hypothesis) - np.transpose((1 - y))@ np.log(1-hypothesis)) + (lam/m) * sumofweights
    return cost[0][0]

def gradient_descent(data, theta, y, alpha, lam = 0.03):
    hypothesis = sigmoid(data @ theta)
    m = rows_num
    theta_temp = theta
    theta_temp[0] = 0
    theta -= (alpha/m) * np.transpose(data) @ (hypothesis - y) + (lam/m) * theta_temp
    return theta

def iteration(number_it = 300, alpha = 0.01, lam = 0.03):
    theta = init_weights()
    cost_list = []
    for i in range(number_it):
        cost = cost_function(fs_list, theta, y_vector, lam)
        theta = gradient_descent(fs_list, theta, y_vector, alpha, lam)
        cost_list.append(cost)
    
    x = np.arange(1, number_it + 1, 1)
    plt.plot(x, np.array(cost_list))
    plt.ylabel('Cost')
    plt.xlabel('Number of iterations')
    plt.title('Cost against iteration')
    plt.show()

#interactive plot of iteration number, learning rate, and cost over iteration
interact(iteration, number_it =(1, 1000, 50), alpha=(0, 1, 0.01), lam = (0, 1, 0.01));
    

interactive(children=(IntSlider(value=300, description='number_it', max=1000, min=1, step=50), FloatSlider(val…

In [346]:
# cost over time
def iteration(number_it = 900, alpha = 0.10, lam = 0):
    theta = init_weights()
    for i in range(number_it):
        cost = cost_function(fs_list, theta, y_vector)
        theta = gradient_descent(fs_list, theta, y_vector, alpha)
    return theta

theta_values = iteration()
print(theta_values)

[[-5.61325367e-04]
 [ 5.59754460e-02]
 [ 3.19850423e-01]
 [ 4.62105636e-01]
 [ 2.17014826e-01]
 [ 1.58051158e-01]
 [-1.43509889e-01]
 [ 2.35989918e-01]
 [-2.79025816e-01]
 [ 2.35654903e-01]
 [ 2.49958920e-01]
 [ 1.50500510e-01]
 [ 4.76819535e-01]
 [ 3.55587609e-01]
 [ 3.65383881e+00]]


In [347]:
# Checking against cross validation set

#Add x0 to array
row_num = len(cv_set)
x0 = np.ones((row_num, 1))
cv_set = list(np.append(x0, np.array(cv_set), axis=1))

#Convert str to float for each spreadsheet number
cv_new_list = []
for row in cv_set:
    add_list = []
    for col in row:
        add_list.append(float(col))
    cv_new_list.append(add_list)

#Remove y from the spreadsheet data
final_cv_list = []
y_cv_values = []
for row in cv_new_list:
    final_cv_list.append(row[:-1])
    y_cv_values.append(row[-1])

#Useful variables
rows_num = len(final_cv_list)
col_num = len(final_cv_list[0])

#feature scaling and mean normalisation on cv set
#Remove x0 first
final_cv_list2 = []
for row in cv_new_list:
    final_cv_list2.append(row[1:])

#perform feature scaling and mean normalisation
cv_fs_list = feature_scaling(stdev_list, col_mean, np.array(final_cv_list2))

#add back x0
x0 = np.ones((row_num, 1))
cv_fs_list = np.append(x0, cv_fs_list, axis=1)

def cv_check(data, theta, y):
    'calculates the cost function for a given theta weight matrix'
    hypothesis = sigmoid(data @ theta)
    m = rows_num
    lam = 0
    sumofweights = np.transpose(theta) @ theta
    sumofweights = sumofweights[0]
    cost = (1/m) * (-np.transpose(y) @ np.log(hypothesis) - np.transpose((1 - y))@ np.log(1-hypothesis))+ (lam/m) * sumofweights
    return cost[0]

cv_cost = cv_check(np.array(cv_fs_list), np.array(theta_values), np.array(y_cv_values))
print(cv_cost)

0.01265467078971175


In [348]:
# Checking against test data set

#Add x0 to array
row_num = len(test_set)
x0 = np.ones((row_num, 1))
test_set = list(np.append(x0, np.array(test_set), axis=1))

#Convert str to float for each spreadsheet number
test_new_list = []
for row in test_set:
    add_list = []
    for col in row:
        add_list.append(float(col))
    test_new_list.append(add_list)

#Remove y from the spreadsheet data
final_set_list = []
y_set_values = []
for row in test_new_list:
    final_set_list.append(row[:-1])
    y_set_values.append(row[-1])

#Useful variables
rows_num = len(final_set_list)
col_num = len(final_set_list[0])

#feature scaling and mean normalisation on cv set
#Remove x0 first
final_set_list2 = []
for row in test_new_list:
    final_set_list2.append(row[1:])

#perform feature scaling and mean normalisation
test_fs_list = feature_scaling(stdev_list, col_mean, np.array(final_set_list2))

#add back x0
x0 = np.ones((row_num, 1))
test_fs_list = np.append(x0, test_fs_list, axis=1)

def prediction(theta, data, value):
    global right
    right = 0
    global wrong
    wrong = 0
    hypothesis = sigmoid(data @ theta)
    if hypothesis >= 0.5:
        print(f'Prediction: 1, probability: {hypothesis}')
        print(f'Actual value: {y_set_values[value]}')
        right +=1
    else:
        print(f'Prediction: 0, probability: {hypothesis}')
        print(f'Actual value: {y_set_values[value]}')
        wrong +=1

for value in range(len(test_fs_list)):
    prediction(np.array(theta_values), test_fs_list[value], value)
print(f'Accuracy is: {100*right/(right + wrong)}')

Prediction: 0, probability: [0.17861591]
Actual value: 0.0
Prediction: 0, probability: [0.00787536]
Actual value: 0.0
Prediction: 0, probability: [0.01549401]
Actual value: 0.0
Prediction: 0, probability: [0.0032894]
Actual value: 0.0
Prediction: 0, probability: [0.01788543]
Actual value: 0.0
Prediction: 0, probability: [0.01620768]
Actual value: 0.0
Prediction: 0, probability: [0.01534767]
Actual value: 0.0
Prediction: 1, probability: [0.95609703]
Actual value: 1.0
Prediction: 0, probability: [0.00837491]
Actual value: 0.0
Prediction: 1, probability: [0.94481881]
Actual value: 1.0
Prediction: 0, probability: [0.00217296]
Actual value: 0.0
Prediction: 0, probability: [0.00585564]
Actual value: 0.0
Prediction: 1, probability: [0.99708682]
Actual value: 1.0
Prediction: 1, probability: [0.99516274]
Actual value: 1.0
Prediction: 1, probability: [0.98407712]
Actual value: 1.0
Prediction: 1, probability: [0.97330188]
Actual value: 1.0
Prediction: 0, probability: [0.00721848]
Actual value: 0.