In [1]:
!pip install scikit-learn




In [2]:
pip install ucimlrepo


Collecting ucimlrepo
  Obtaining dependency information for ucimlrepo from https://files.pythonhosted.org/packages/3b/07/1252560194df2b4fad1cb3c46081b948331c63eb1bb0b97620d508d12a53/ucimlrepo-0.0.7-py3-none-any.whl.metadata
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [145]:
# Import
import numpy as np
import pandas as pd

In [159]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 

# metadata 
#print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
#print(breast_cancer_wisconsin_diagnostic.variables) 


In [161]:
from sklearn.model_selection import train_test_split


# Split data into features (X) and target (y) (quality)

# Make data into matrices
X = X.to_numpy()
y = y.to_numpy()
#change so 0 is negative, 1 is positive
y = np.where(y == 'M', 0, 1)
# Normalize
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X = (X-X_mean) / X_std

# add w0 element
n,m = X.shape
X0 = np.ones((n,1))
X = np.hstack((X0,X))

# 80% for training+validation and 20% for testing
X_train_val, X_testM, y_train_val, y_testM = train_test_split(X, y, test_size=0.15, random_state=42)

# The 80% (training+validation) into 60% for training and 20% for validation
X_trainM, X_valM, y_trainM, y_valM = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=42)



In [257]:
rows, cols = y_trainM.shape
totalpositive =0
for i in range(rows):
    totalpositive += y_trainM[i]

print(f"Total class 1 in train: {totalpositive} ({100*totalpositive/rows}%)")

rows, cols = y_valM.shape
totalpositive =0
for i in range(rows):
    totalpositive += y_valM[i]

print(f"Total class 1 in validation: {totalpositive} ({100*totalpositive/rows}%)")


# Shape
print(f"Training set size: {X_trainM.shape}")
print(f"Validation set size: {X_valM.shape}")
print(f"Test set size: {X_testM.shape}")

Total class 1 in train: [238] ([61.65803109]%)
Total class 1 in validation: [65] ([67.01030928]%)
Training set size: (386, 31)
Validation set size: (97, 31)
Test set size: (86, 31)


In [229]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def calculate_gradient(X_batch, y_batch, w):
    m = len(y_batch)
    predictions = sigmoid(X_batch @ w)
    predictions = predictions.reshape(-1, 1)
    error = predictions - y_batch
    gradient = np.dot(X_batch.T, error) / m   #DeltaE(w)
    return gradient

# Mini-batch SGD for logistic regression
def mini_batch_SGD(X, y, batchsize, stepsize, iterations):
    m, n = X.shape  # Rows (m) and columns (n)
    w = np.random.randn(n)

    
    for iteration in range(iterations):
        indices = np.random.permutation(m)  # random seed
        X = X[indices]  # Shuffle according to seed
        y = y[indices]

        
        for start in range(0, m, batchsize):
            end = start + batchsize
            X_batch = X[start:end]
            y_batch = y[start:end]
            
            
            gradient = calculate_gradient(X_batch, y_batch, w)
            gradient = gradient.flatten()
            # Update weights
            w -= stepsize * gradient

    
    return w

# Hyperparameters
stepsize = 0.001 
iterations = 2000
batchsize = 25


# Train logistic regression model with mini-batch SGD
weights = mini_batch_SGD(X_trainM, y_trainM, batchsize, stepsize, iterations)
print("Final weights:", weights)


Final weights: [ 0.53734166 -0.46521743 -0.86030484 -1.84474287 -0.41636978 -0.71157863
 -0.14536457 -0.51539254 -0.21343997 -0.52844822 -0.29252604 -0.33516121
  1.12273123 -0.7750341  -1.80471966 -0.17867108  0.19880618 -0.50511489
  1.18236944  0.21567183 -0.15535129  0.46090783 -1.36081598  0.18595913
 -0.96282072 -0.68198178 -0.00912284 -0.48069311 -1.48084612 -0.48731509
  1.54039236]


In [253]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# predict values using weightvector w.
def predict(w,X):
    probabilities = sigmoid(X @ w)
    return (probabilities >= 0.5).astype(int)

ypredictionsTrain = predict(weights, X_trainM)
ypredictionsTrain = ypredictionsTrain.reshape(-1, 1)

#get evaluation metrics
accuracy = accuracy_score(y_trainM, ypredictionsTrain)
precision = precision_score(y_trainM, ypredictionsTrain)
recall = recall_score(y_trainM, ypredictionsTrain)
f1 = f1_score(y_trainM, ypredictionsTrain)


print(f"Model Performance on Train Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Model Performance on Train Set:
Accuracy: 0.9793
Precision: 0.9792
Recall: 0.9874
F1-score: 0.9833


In [231]:
ypredictionsVal = predict(weights, X_valM)
ypredictionsVal = ypredictionsVal.reshape(-1, 1)

#get evaluation metrics
accuracy = accuracy_score(y_valM, ypredictionsVal)
precision = precision_score(y_valM, ypredictionsVal)
recall = recall_score(y_valM, ypredictionsVal)
f1 = f1_score(y_valM, ypredictionsVal)


print(f"Model Performance on Validation Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Model Performance on Validation Set:
Accuracy: 0.9691
Precision: 0.9559
Recall: 1.0000
F1-score: 0.9774


In [251]:
ypredictionsTest = predict(weights, X_testM)
ypredictionsTest = ypredictionsTest.reshape(-1, 1)

#get evaluation metrics
accuracy = accuracy_score(y_testM, ypredictionsTest)
precision = precision_score(y_testM, ypredictionsTest)
recall = recall_score(y_testM, ypredictionsTest)
f1 = f1_score(y_testM, ypredictionsTest)


print(f"Model Performance on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Model Performance on Test Set:
Accuracy: 0.9884
Precision: 0.9818
Recall: 1.0000
F1-score: 0.9908


In [None]:
##  Here the recall metric is very accurate, which is super important when it comes to diagnosis of breast cancer. 
##  This means that almost no patients with breast cancer get a negative result when conducting the test = almost no false negatives
##  Precition is not as important but the metric is still good, the test is very accurate with 98% precition on the test data. 
##  It is fortunate that we got even better results on test data than on the train data.