In [391]:
import numpy as np
import pandas as pd
import time

In [392]:
# Load the training set and recode the label column from {-1, 1} to {0, 1}.
train_data = pd.read_csv("1676558209_8416622_cleveland-train.csv")
# Assign the recoded column back instead of calling replace(..., inplace=True)
# on the selected column: the in-place form operates on an intermediate object
# and is not guaranteed to update train_data (it does not under Copy-on-Write).
train_data['heartdisease::category|-1|1'] = (
    train_data['heartdisease::category|-1|1'].replace({1: 1, -1: 0})
)

# Feature matrix (every column except the label) and label vector
X_train = train_data.drop('heartdisease::category|-1|1', axis = 1).to_numpy()
Y_train = train_data['heartdisease::category|-1|1'].values

# Add an intercept column to X
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

# Per-column mean and standard deviation of the training matrix. These are
# computed with the intercept column included; entry 0 belongs to that column.
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)

# Number of columns including the intercept
X_train.shape[1]

14

In [381]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray with the shape of z.
    """
    z = np.asarray(z)
    # exp is only ever evaluated at a non-positive argument, so it cannot
    # overflow (the naive form overflows in exp(-z) for large negative z).
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same function.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a NumPy scalar.
    return result if result.ndim else result[()]

def calculate_cost(X, y, weight):
    """Mean binary cross-entropy of a logistic model.

    Parameters
    ----------
    X : array of shape (n_examples, n_features)
    y : array of n_examples labels; the formula assumes values in {0, 1}
    weight : array of n_features coefficients

    Returns
    -------
    The average of -[y*log(h) + (1-y)*log(1-h)] with h = sigmoid(X @ weight).
    """
    n_train_examples = len(y)
    z = np.dot(X, weight)
    # With h = sigmoid(z), -[y*log(h) + (1-y)*log(1-h)] equals
    # log(1 + exp(z)) - y*z. Evaluating it this way via logaddexp avoids
    # log(0) (nan/inf) when the sigmoid saturates at exactly 0 or 1.
    per_example = np.logaddexp(0, z) - y * z
    cost = (1/n_train_examples) * np.sum(per_example)
    return cost

def calculate_gradient(X, y, weight):
    """Gradient of the mean cross-entropy cost with respect to `weight`.

    Computes (1/n) * X^T (sigmoid(X @ weight) - y), where n = len(y).
    """
    # Difference between predicted probabilities and the labels
    residual = sigmoid(np.dot(X, weight)) - y
    return (1/len(y)) * np.dot(X.T, residual)

def logistic_regression(X, y, X_mean, X_std, eta=1e-5, tol=1e-3, max_iter=10000):
    """Fit logistic-regression weights with batch gradient descent.

    Columns 1.. of X are standardized with the supplied statistics; column 0
    is left as is (callers in this notebook place the intercept there).

    Parameters
    ----------
    X : array of shape (n_examples, n_columns)
    y : array of n_examples labels
    X_mean, X_std : per-column mean and standard deviation, length n_columns
    eta : learning rate
    tol : stop once the gradient norm falls below this value
    max_iter : maximum number of weight updates

    Returns
    -------
    (w, cost, class_error): the fitted weights, the cost at those weights, and
    the fraction of training examples whose rounded prediction differs from y.
    """
    # Work on a float copy: standardizing X in place would rescale the caller's
    # array again every time this function (or the notebook cell) is re-run.
    X = np.array(X, dtype=float)
    X_mean = np.asarray(X_mean, dtype=float)
    X_std = np.asarray(X_std, dtype=float)

    # A constant column has zero std; divide by 1 there so it maps to 0, not nan.
    scale = np.where(X_std[1:] == 0, 1.0, X_std[1:])
    X[:, 1:] = (X[:, 1:] - X_mean[1:]) / scale

    # Initialize the weight vector
    w = np.zeros(X.shape[1])

    iteration = 0
    while iteration < max_iter:
        gradient = calculate_gradient(X, y, w)
        # Test convergence on the gradient at the current weights, before
        # stepping, so no update is made with an already-converged gradient.
        if np.linalg.norm(gradient) < tol:
            break
        w -= eta * gradient
        iteration += 1

    # Evaluate the cost once, at the weights that are actually returned
    # (it is not needed inside the loop).
    cost = calculate_cost(X, y, w)

    # Compute the final classification error on the training set
    h = sigmoid(np.dot(X, w))
    y_pred = np.round(h)
    class_error = np.mean(y_pred != y)

    return w, cost, class_error

In [386]:
# Time the from-scratch training run. perf_counter is a monotonic clock meant
# for measuring elapsed intervals; time.time() follows the system clock and
# can jump if that clock is adjusted.
start_time = time.perf_counter()
weight, cost, class_error = logistic_regression(X_train, Y_train, X_mean, X_std, eta=1e-5, tol=1e-3, max_iter=1000000)
end_time = time.perf_counter()
time_taken = end_time - start_time
print("Time taken to train the model: {:.2f} seconds".format(time_taken))

Time taken to train the model: 53.95 seconds


In [385]:
# Training-set misclassification rate returned by logistic_regression
class_error

0.25

In [374]:
# Read the test CSV and convert every column to a NumPy array.
# NOTE(review): presumably this file has no label column -- confirm.
test_data = pd.read_csv("1676558209_8421676_cleveland-test.csv")
X_test = test_data.to_numpy()

# (rows, columns) of the test matrix
X_test.shape

In [320]:
def predict(X_test, w, mean=None, std=None):
    """Predict 0/1 labels for X_test with fitted logistic-regression weights.

    Parameters
    ----------
    X_test : array of shape (n_examples, n_features), without intercept column
    w : weight vector of length n_features + 1 (intercept first)
    mean, std : per-column statistics, indexed like the intercept-augmented
        matrix (entry 0 is ignored). When omitted, the notebook-level X_mean
        and X_std are used, which is what the two-argument call always did.

    Returns
    -------
    Float array of rounded probabilities (0.0 or 1.0), one per row of X_test.
    """
    # Fall back to the notebook globals only when no statistics are passed in
    if mean is None:
        mean = X_mean
    if std is None:
        std = X_std
    mean = np.asarray(mean, dtype=float)
    std = np.asarray(std, dtype=float)

    # Add an intercept column to X_test (hstack returns a new array, so the
    # caller's X_test is not modified by the scaling below)
    X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

    # A constant column has zero std; divide by 1 there so it maps to 0, not nan.
    scale = np.where(std[1:] == 0, 1.0, std[1:])
    X_test[:, 1:] = (X_test[:, 1:] - mean[1:]) / scale

    # Compute the predicted probabilities
    h = sigmoid(np.dot(X_test, w))

    # Round the probabilities to obtain binary predictions
    y_pred = np.round(h)

    return y_pred

In [321]:
# predict() returns rounded probabilities as floats; cast them to integer labels
y_pred = predict(X_test, weight).astype(int)
print(y_pred)

[1 1 0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0
 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 1 0 0 1
 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0
 1 0 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1 1 1 0]


In [175]:
# Wrap the predictions in a single-column DataFrame for export
output = pd.DataFrame(y_pred)
print(output)

     0
0    0
1    1
2    0
3    1
4    1
..  ..
140  0
141  1
142  1
143  1
144  0

[145 rows x 1 columns]


In [176]:
# Map class 0 back to -1 (1 is unchanged), reversing the recoding applied to
# the training labels when they were loaded
output = output.replace({1:1, 0:-1})
print(output)

In [178]:
# Write the predictions as a bare column: no index and no header row
output.to_csv('submission2.csv', index=False, header=False, escapechar=None)

In [394]:
# Logistic Regression using Sklearn Library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation (fixed seed for repeatability)
train, valid=train_test_split(train_data, test_size=0.10, random_state=0)

y_train = train['heartdisease::category|-1|1']
y_val = valid['heartdisease::category|-1|1']

# Feature scaling: fit the scaler on the training split only, then reuse it.
# The scaled matrices get their own names so this cell does not overwrite
# X_train from the from-scratch model (re-running that model's cells after
# this one would otherwise train on the wrong array).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train.drop('heartdisease::category|-1|1', axis = 1))
X_val_scaled = scaler.transform(valid.drop('heartdisease::category|-1|1', axis = 1))

# perf_counter is a monotonic clock intended for measuring elapsed intervals
start_time = time.perf_counter()
# Train logistic regression model
clf = LogisticRegression(dual=False, max_iter=1000000, C=1, penalty='l2')
clf.fit(X_train_scaled, y_train)
end_time = time.perf_counter()
time_taken = end_time - start_time
print("Time taken to train the model: {:.2f} seconds".format(time_taken))

# Predict classes for the validation split (kept separate from y_pred, which
# holds the from-scratch model's test-set predictions)
y_val_pred = clf.predict(X_val_scaled)

# Compute validation accuracy
acc = accuracy_score(y_val, y_val_pred)

print("Accuracy:", acc)

Time taken to train the model: 0.01 seconds
Accuracy: 0.875


In [248]:
# clf was trained on features standardized by `scaler`, so the test features
# must go through the same fitted scaler before prediction; raw test_data
# would be on a different scale than the model expects.
pred = clf.predict(scaler.transform(test_data))
output_library = pd.DataFrame(pred)
# Map class 0 back to -1 (1 is unchanged) to match the original label encoding
output_library = output_library.replace({1:1, 0:-1})

In [249]:
# Display the sklearn model's test-set predictions
output_library

In [250]:
# Write the sklearn predictions as a bare column: no index and no header row
output_library.to_csv('submission3.csv', index=False, header=False, escapechar=None)