In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# model_selection and fall back to the legacy module only on old installs.
try:
    from sklearn.model_selection import train_test_split, StratifiedKFold
except ImportError:
    from sklearn.cross_validation import train_test_split, StratifiedKFold



## Defining Required Functions 

In [2]:
# Defining the sigmoid function to get likelihood of belonging to a class
def sigmoid_classifier(prob):
    """Map raw scores to values in [0, 1] with the logistic function.

    Parameters
    ----------
    prob : scalar or array-like
        Despite its name, this is the raw linear score (log-odds), not a
        probability; it is what ``np.dot(predictors, betas)`` produces.

    Returns
    -------
    Same shape as ``prob``: ``1 / (1 + exp(-prob))`` evaluated element-wise.
    """
    # 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))) == exp(-logaddexp(0, -x)).
    # logaddexp never forms exp(-x) directly, so very negative scores no
    # longer overflow to inf (and no RuntimeWarning is emitted).
    return np.exp(-np.logaddexp(0, -prob))

In [3]:
# Defining Logistic Regression without using Scikit Learn
def logistic_regression_classifier(predictors, response, iterations, learning_rate):
    """Fit logistic-regression coefficients by full-batch gradient ascent.

    No intercept column is added: include a column of ones in ``predictors``
    if a bias term is wanted.

    Parameters
    ----------
    predictors : 2-D array-like, one row per sample, one column per feature
    response : array-like with one numeric label per sample
    iterations : int
        Number of gradient steps to take.
    learning_rate : float
        Step size applied to the (un-averaged) gradient.

    Returns
    -------
    numpy.ndarray
        One coefficient per feature column.

    Raises
    ------
    ValueError
        If ``predictors`` and ``response`` disagree on the number of samples.
    """
    # Convert once up front: the inputs may be pandas objects, and converting
    # and transposing them again on every iteration is loop-invariant work.
    features = np.asarray(predictors, dtype=float)
    # Flatten so an (n, 1) column cannot broadcast against the (n,) predictions.
    targets = np.asarray(response, dtype=float).ravel()

    if features.shape[0] != targets.shape[0]:
        raise ValueError(
            'predictors has {0} rows but response has {1} values'.format(
                features.shape[0], targets.shape[0]))

    betas = np.zeros(features.shape[1])
    features_t = features.T  # a view; no data is copied

    for _ in range(iterations):
        scores = np.dot(features, betas)
        predictions = sigmoid_classifier(scores)

        # Update coefficients along the log-likelihood gradient X^T (y - p)
        epsilon = targets - predictions
        gradient = np.dot(features_t, epsilon)
        betas += learning_rate * gradient

    return betas

## Data Loading

In [4]:
# Read the openly available breast-cancer-wisconsin (wdbc) data straight from
# the UCI repository; header=None makes pandas label the columns 0, 1, 2, ...
cancer = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

## Transformation and Model Fitting

In [5]:
# Mapping the classification variable to numeric
conv = {'B':0,'M':1}
cancer[32] = cancer[1].map(conv)

# Separating predictors and response variable
X = cancer.loc[:,2:31] ## Columns labelled 2 through 31 (inclusive) act as predictors
y = cancer.loc[:,32] ## Target variable: Benign coded as 0, Malignant coded as 1

# Splitting the data set into test and train using Scikit Learn's test-train split function.
# random_state pins the shuffle so Restart & Run All reproduces the same split;
# accuracies recorded from an earlier unseeded run may differ slightly.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.20, random_state = 42)

# Normalising the data (min-max), using statistics from the training split only
# so that nothing about the test split leaks into the scaling
X_train_min = X_train.min()
X_train_max = X_train.max()
# A feature that is constant in the training split has zero range; divide by 1
# instead so it scales to 0 rather than NaN
X_train_range = (X_train_max - X_train_min).replace(0, 1)
X_train = (X_train - X_train_min)/X_train_range
X_test = (X_test - X_train_min)/X_train_range

# Calling logistic regression function
result = logistic_regression_classifier(X_train, y_train, iterations=300000, learning_rate=5e-5)

In [6]:
# Score the training rows with the fitted coefficients, then round the
# sigmoid output to get a 0/1 class label per row
final_scores = np.dot(X_train, result)
preds = np.round(sigmoid_classifier(final_scores))

# Accuracy = correctly labelled rows / total rows
train_hits = (preds == y_train).sum().astype(float)
print('Train Set Accuracy: {0}'.format(train_hits / len(preds)))

Train Set Accuracy: 0.9648351648351648


In [7]:
# Score the held-out rows with the fitted coefficients, then round the
# sigmoid output to get a 0/1 class label per row
final_scores_test = np.dot(X_test, result)
preds_test = np.round(sigmoid_classifier(final_scores_test))

# Accuracy = correctly labelled rows / total rows
test_hits = (preds_test == y_test).sum().astype(float)
print('Test Set Accuracy: {0}'.format(test_hits / len(preds_test)))

Test Set Accuracy: 0.956140350877193
