In [None]:
# Basic imports
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd

In [None]:
# IPython "magic" command (notebook-only syntax): render matplotlib figures inline, below the cell
%matplotlib inline

In [None]:
# Data loading
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# expose `sklearn.datasets` as an attribute on every scikit-learn version.
from sklearn.datasets import load_breast_cancer

# as_frame=True returns pandas objects; `.frame` is a single DataFrame holding
# the feature columns together with the `target` column.
data = load_breast_cancer(as_frame=True)
data_as_DataFrame = data.frame
# Alias to something easier to work with
df = data_as_DataFrame

In [None]:
# Have a look at the data: report its size, then show the first rows using the
# notebook's rich table rendering (a bare last expression) instead of printing
# the whole frame as plain text
print(f"{df.shape[0]} rows x {df.shape[1]} columns")
df.head()

In [None]:
# Split the two plotted features by class label
# (the code below treats target == 0 as malignant and target == 1 as benign)
is_malignant = df['target'] == 0
is_benign = df['target'] == 1

malignant_x_points = df.loc[is_malignant, 'mean perimeter']
malignant_y_points = df.loc[is_malignant, 'mean area']
benign_x_points = df.loc[is_benign, 'mean perimeter']
benign_y_points = df.loc[is_benign, 'mean area']

# Full label column and the class names as plain strings
labels = df['target']
legend_text = list(map(str, data.target_names))

In [None]:
# Fit a Linear Regression per class: mean area as a function of mean perimeter
# Import the estimator explicitly: a bare `import sklearn` is not guaranteed to
# expose `sklearn.linear_model` as an attribute on every scikit-learn version.
from sklearn.linear_model import LinearRegression

# scikit-learn expects 2-D feature input, hence Series.to_frame()
LR_benign = LinearRegression()
# fit() returns the estimator itself, so benign_model is the same object as LR_benign
benign_model = LR_benign.fit(benign_x_points.to_frame(), benign_y_points.to_frame())
# Fitted values at the observed perimeters; used to draw the regression line
benign_line = benign_model.predict(benign_x_points.to_frame())

LR_malignant = LinearRegression()
malignant_model = LR_malignant.fit(malignant_x_points.to_frame(), malignant_y_points.to_frame())
malignant_line = malignant_model.predict(malignant_x_points.to_frame())

In [None]:
# Create a plot colored as a function of benign/malignant label, with the
# per-class regression lines drawn on top
fig, ax = plt.subplots()
ax.scatter(malignant_x_points, malignant_y_points, c='purple', label="Malignant", alpha=0.2)
ax.scatter(benign_x_points, benign_y_points, c='yellow', label="Benign", alpha=0.2)
ax.plot(malignant_x_points, malignant_line, color='black', linewidth=1, label='Malignant model', alpha=0.7)
ax.plot(benign_x_points, benign_line, color='blue', linewidth=1, label='Benign model', alpha=0.7)
ax.set(
    title='Mean area vs. mean perimeter by class',
    xlabel='mean perimeter (mm)',
    # An area is a squared length, so its unit is mm^2 rather than mm
    ylabel='mean area (mm$^2$)',
)
ax.legend()
plt.show()

In [None]:
# Begin PCA by importing dependency
from sklearn.decomposition import PCA

In [None]:
# Construct the PCA object, then fit it and project the data in one step.
# Using the same object for both keeps `pca` (introspected below) and
# `data_reduced` (plotted later) consistent and avoids fitting PCA twice.
pca = PCA(n_components=3)
data_reduced = pca.fit_transform(data.data)

# Introspect into the results of the calculations
print(f"Explained variance: {pca.explained_variance_}")
print(f"Principal Components:\n{pca.components_}")

In [None]:
## Show the PCA-projected samples in a 3-D scatter plot
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)

# One point per sample, positioned by its three components and colored by class
scatter = ax.scatter(data_reduced[:, 0], data_reduced[:, 1], data_reduced[:, 2], c=data.target, alpha=0.6)

ax.set_title("First three principal components")
ax.set_xlabel("1st Principal Component")
ax.set_ylabel("2nd Principal Component")
ax.set_zlabel("3rd Principal Component")

# Blank out the tick labels on all three axes
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])

# Legend: one handle per distinct color, labelled with the class names
handles = scatter.legend_elements()[0]
legend1 = ax.legend(handles, data.target_names.tolist(), loc="upper right", title="Cancer class")
ax.add_artist(legend1)

plt.show()

In [None]:
# Begin the Decision Tree Classifier example
# Import requirements
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Start again from the full feature table and target column, holding out 20%
# of the samples for testing (80% train / 20% test); the fixed random_state
# makes the split repeatable
x, y = data.data, data.target
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build a depth-limited decision tree and train it on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
tree_classifier = DecisionTreeClassifier(random_state=42, max_depth=5).fit(x_train, y_train)
# Predict labels for the held-out test samples
label_prediction = tree_classifier.predict(x_test)
# Check overall accuracy with sklearn's built-in tool
accuracy = accuracy_score(y_test, label_prediction)
# The ":.2f" format spec rounds the floating-point value to 2 decimal places
print(f"Tree accuracy: {accuracy * 100:.2f}%")

In [None]:
# Plot what form the tree takes
# A larger canvas plus feature/class names keeps the node text readable;
# plt.show() also stops the cell from echoing the list of annotation objects
tree_fig, tree_ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree_classifier,
    feature_names=list(x_train.columns),
    # Class names are matched to the classes in ascending order (0, 1)
    class_names=data.target_names.tolist(),
    filled=True,
    ax=tree_ax,
)
plt.show()

In [None]:
# Begin the (Gaussian) Naive Bayes model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [None]:
# Initialize the Gaussian Naive Bayes classifier with its default settings
gnb = GaussianNB()

In [None]:
# Train on the training split, then predict labels for the test split
# (fit() returns the estimator itself, so the two calls can be chained)
y_pred = gnb.fit(x_train, y_train).predict(x_test)
# Report the share of correct test predictions as a percentage
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.2f}%")

In [None]:
# Artificial Neural Network example
# Import requirements from PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

In [None]:
# Mini-batch size: how many samples are fed to the network per optimizer step
batch_size = 10

# Features -> float32 tensor, with each row (sample) rescaled to unit L2 norm
x_tr_tensor = F.normalize(
    torch.from_numpy(x_train.to_numpy()).float(),
    p=2.0,
    dim=1,
)
# Labels -> float32 tensor with an extra trailing dimension, i.e. one column per sample
y_tr_tensor = torch.from_numpy(y_train.to_numpy()).float().unsqueeze(1)

# Pair features with labels and serve them as shuffled mini-batches
train_dataset = TensorDataset(x_tr_tensor, y_tr_tensor)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Create a small fully connected network: input features -> one hidden layer
# -> a single output. The hidden activation defaults to ReLU ("rectified
# linear unit"); other popular functions, such as sigmoid, can be passed in.
class SimpleNN(nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    Args:
        in_features: Number of input features per sample (default 30).
        hidden_units: Number of nodes in the hidden layer (default 48).
        activation: Module applied to the hidden layer's output. Defaults to
            ``nn.ReLU()``. The choice of activation is critical to the
            success of a model this small -- try
            ``SimpleNN(activation=nn.Sigmoid())`` to see how accuracy changes.
    """

    def __init__(self, in_features=30, hidden_units=48, activation=None):
        super().__init__()
        # Input layer -> hidden layer
        self.fc1 = nn.Linear(in_features, hidden_units)
        # Build the default inside the constructor so every model gets its own module
        self.activation = activation if activation is not None else nn.ReLU()
        # Hidden layer -> output layer (1 output)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Run the forward pass and return the raw, un-squashed output of fc2."""
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)


# Seed PyTorch's global RNG so that the weight initialisation (and the
# subsequent mini-batch shuffling) is reproducible under Restart & Run All
torch.manual_seed(42)
model = SimpleNN()

In [None]:
# Set up the loss function (penalty for incorrect labels)
# The "WithLogits" variant is fed the model's raw outputs directly, so the
# network itself does not end in a sigmoid.
criterion = nn.BCEWithLogitsLoss() # For binary classification tasks
# Adam optimizer over all of the model's parameters, with learning rate 0.01
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
num_epochs = 100

# Put the model in training mode explicitly, so this cell also behaves
# correctly when re-run after model.eval() has been called
model.train()

for epoch in range(num_epochs):
    # Sum of per-sample losses over the epoch, for a stable epoch-level average
    running_loss = 0.0

    # Named `targets` so the notebook-level `labels` Series is not overwritten
    for inputs, targets in train_loader:
        # 1. Clear residual gradients from the previous iteration
        optimizer.zero_grad()

        # 2. Make a Forward Pass and get the output (predictions)
        outputs = model(inputs)

        # 3. Calculate the loss
        loss = criterion(outputs, targets)

        # 4. Perform a Backward pass to calculate gradients
        loss.backward()

        # 5. Run the optimizer to update the weights
        optimizer.step()

        # The criterion returns the batch mean; weight it by the batch size
        # so a smaller final batch does not skew the epoch average
        running_loss += loss.item() * inputs.size(0)

    # Print the mean loss over the whole epoch every 10 epochs
    # (the last mini-batch alone is a noisy indicator)
    if (epoch + 1) % 10 == 0:
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

print('Finished Training')

In [None]:
# Convert test data to torch-friendly format
x_test_tensor = torch.from_numpy(x_test.to_numpy()).to(torch.float32)
# 'Normalize' data the same way as the training features (unit L2 norm per row)
x_test_tensor = F.normalize(x_test_tensor, p=2.0, dim=1)
y_test_tensor = torch.from_numpy(y_test.to_numpy()).to(torch.float32).unsqueeze(1)

# Pair the test features with the *test* labels (TensorDataset requires equal
# lengths), under their own names so the training dataset/loader are not
# overwritten. No shuffling is needed for evaluation.
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Move the model to "eval" mode
# This switches layers such as dropout/batch-norm to their inference
# behavior; it does not by itself turn off gradient tracking
model.eval()
# Get predictions from the test set inside no_grad(), so no autograd graph is
# built for what is a pure inference pass
with torch.no_grad():
    # Convert raw model outputs (range -inf..inf) to probabilities in (0, 1)
    label_predicted_probabilities = torch.sigmoid(model(x_test_tensor))
# Convert probabilities to either 0 or 1, here cutting off at 0.5
predicted_labels = (label_predicted_probabilities >= 0.5).int()

In [None]:
# Mark each test sample 1.0 when the predicted label equals the ground truth,
# 0.0 otherwise; the mean of those marks is the accuracy
correct_predictions = predicted_labels.eq(y_test_tensor).to(torch.float32)
accuracy = torch.mean(correct_predictions).item()
print(f"Accuracy: {accuracy * 100:.2f}%")