https://colab.research.google.com/github/Clinical-Informatics-Interest-Group/CLiC.notebooks/blob/main/notebooks/notebook1.ipynb

In [None]:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Clinical-Informatics-Interest-Group/CLiC.notebooks/blob/main/notebooks/notebook2.ipynb)

# Extremely Short intro to Python
### (and Jupyter Notebooks)

Python is one of the most popular Machine Learning languages in academia and industry.  
It's also thankfully easier to read than many other programming languages.

In [None]:
# Place your cursor in this cell and press 'shift' + 'enter'
print("Hello, medical students!")

In [None]:
# print("Hello, medical students")

In [None]:
# Nothing happened there ^ because the line started with '#'.
# Coders call this "commenting out" code. In python, any line that starts with '#' is
# not evaluated.

In [None]:
# Python does math well, too.
1 + 1

In [None]:
# If you want more information about a function from Jupyter, simply place a '?' in front
# of it and evaulate it with 'shift+enter'.
?print

# [Breast Cancer Wisconsin Dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic))
"Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image."  
https://scikit-learn.org/stable/datasets/toy_dataset.html

# First let's train our own neurons
![img](./images/histology.webp)
Rakha, Emad & Reis-Filho, Jorge & Baehner, Frederick & Dabbs, David & Decker, Thomas & Eusebi, Vincenzo & Fox, Stephen & Ichihara, Shu & Jacquemier, Jocelyne & Lakhani, Sr & Palacios, José & Richardson, Andrea & Schnitt, Stuart & Schmitt, Fernando & Tan, Puay-Hoon & Tse, Gary & Badve, Sunil & Ellis, Ian. (2010). Breast cancer prognostic classification in the molecular era: the role of histological grade. Breast cancer research : BCR. 12. 207. 10.1186/bcr2607. 

In [1]:
# These import statements bring new functions for us to use.
# This saves us from having to write them ourselves.
from sklearn import datasets
import pandas as pd

# Let's use a function from 'datasets' to load the breast cancer data
# and assign it to 'tumor'
tumor = datasets.load_breast_cancer()

In [2]:
# In order to "Hold Out" half the data for validation
# we need to write some code that allows us to separate
# it at the halfway point.

# Find the length of the data set
length = len(tumor.data)
# Halfway point
midpoint = (length // 2)
# New start point
secondhalf = midpoint + 1

In [14]:
# Assign the data features (independant variables) to a matrix 'X'
# which the Perceptron will make predictions on. 'y' is the ground
# truth the Perceptron will compare it's predictions to.
X = pd.DataFrame(tumor.data[:midpoint, :])
y = tumor.target[:midpoint]
# We'll hold some data from model testing in order to test its "real world" performance
X_validate = pd.DataFrame(tumor.data[secondhalf:, :])
y_validate = tumor.target[secondhalf:]

In [15]:
from sklearn.model_selection import train_test_split

# Here, we tell 'train_test_split' to take 30% of our training
# data set and assign it to X_test. X_train is the data the perceptron
# will learn from by predicting malignant or not. X_test is the data
# internally validates the trained algorithm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

In [16]:
# Evaluate 'X' to see the data in a nice table
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,...,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,14.548789,19.429648,95.006761,694.523239,0.098614,0.11279,0.102681,0.055858,0.18573,0.063019,...,17.017299,26.12912,112.371937,960.666901,0.136632,0.281742,0.309525,0.127427,0.302461,0.086318
std,3.607309,4.321089,24.868854,360.68946,0.013757,0.056942,0.083489,0.039458,0.029041,0.007552,...,4.97016,6.389541,34.543913,581.489238,0.023286,0.174084,0.21854,0.06703,0.06989,0.020447
min,6.981,9.71,43.79,143.5,0.06251,0.01938,0.0,0.0,0.1167,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.8975,16.3775,77.1525,437.1,0.089807,0.070155,0.037097,0.024362,0.1667,0.057513,...,13.3,21.3975,86.0375,543.775,0.122125,0.154075,0.1393,0.07274,0.261175,0.07295
50%,13.76,19.315,89.39,584.55,0.097765,0.10375,0.085465,0.050035,0.1842,0.061855,...,15.87,25.97,105.7,765.45,0.1365,0.2371,0.27455,0.12195,0.2905,0.081545
75%,17.0275,22.025,111.15,904.375,0.107025,0.144475,0.148175,0.083645,0.19935,0.066697,...,20.13,30.3075,133.05,1260.25,0.1504,0.384975,0.426725,0.1779,0.330925,0.095692
max,28.11,39.28,188.5,2499.0,0.1447,0.3454,0.4268,0.2012,0.304,0.09744,...,33.12,49.54,220.8,3432.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [17]:
# Calling the 'describe' method on X returns some useful information
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,...,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,14.548789,19.429648,95.006761,694.523239,0.098614,0.11279,0.102681,0.055858,0.18573,0.063019,...,17.017299,26.12912,112.371937,960.666901,0.136632,0.281742,0.309525,0.127427,0.302461,0.086318
std,3.607309,4.321089,24.868854,360.68946,0.013757,0.056942,0.083489,0.039458,0.029041,0.007552,...,4.97016,6.389541,34.543913,581.489238,0.023286,0.174084,0.21854,0.06703,0.06989,0.020447
min,6.981,9.71,43.79,143.5,0.06251,0.01938,0.0,0.0,0.1167,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.8975,16.3775,77.1525,437.1,0.089807,0.070155,0.037097,0.024362,0.1667,0.057513,...,13.3,21.3975,86.0375,543.775,0.122125,0.154075,0.1393,0.07274,0.261175,0.07295
50%,13.76,19.315,89.39,584.55,0.097765,0.10375,0.085465,0.050035,0.1842,0.061855,...,15.87,25.97,105.7,765.45,0.1365,0.2371,0.27455,0.12195,0.2905,0.081545
75%,17.0275,22.025,111.15,904.375,0.107025,0.144475,0.148175,0.083645,0.19935,0.066697,...,20.13,30.3075,133.05,1260.25,0.1504,0.384975,0.426725,0.1779,0.330925,0.095692
max,28.11,39.28,188.5,2499.0,0.1447,0.3454,0.4268,0.2012,0.304,0.09744,...,33.12,49.54,220.8,3432.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [18]:
# It's important to standardize the numerical values in every feature, so that no feature is disproportionally weighted.
from sklearn.preprocessing import StandardScaler

# We should fit the 'Standard Scaler' to the data we're going to train the Perceptron on.
sc = StandardScaler()
sc.fit(X_train)
# And then standardize the data by transforming it with the fit 'Standard Scaler'
X_train_std = sc.transform(X_train)
# Now apply the same standardizing to the test and validation sets
X_test_std = sc.transform(X_test)
X_valid_std = sc.transform(X_validate)

In [19]:
from sklearn.linear_model import Perceptron

# Let's call the Perceptron we're about to train 'neuron', and give it
# two initial instructions (in the form of parameters). 'eta0' is the "learning
# rate", which tell's the Perceptron how much it should change its predictions
# each time it gets them wrong. 'random_state=1' tells the Perceptron to randomly
# weight each feature of the data from the start.
neuron = Perceptron(eta0=0.1, random_state=1)

# Next we'll tell neuron to learn from the data features 'X_train_std' by attempting to predict
# the outcome 'y_train'
neuron.fit(X_train_std, y_train)

Perceptron(eta0=0.1, random_state=1)

In [21]:
# Check the accuracy by scoring 'neuron' against the test data
neuron.score(X_test_std, y_test)

0.9534883720930233

In [38]:
# Let's see what happens if we only allow the Perceptron to learn
# from a single prediction.
weak_neuron = Perceptron(max_iter=1, eta0=0.1, random_state=1)
weak_neuron.fit(X_train_std, y_train)
weak_neuron.score(X_test_std, y_test)



0.9418604651162791

In [24]:
# Lastly, we validate the neuron with the validation
# data held out from training.
neuron.score(X_valid_std, y_validate)

0.954225352112676

# Breast Cancer Diagnosis by Fine Needle Aspirate
- Sensitivity: 74 percent (95% CI 72 to 77 percent)
- Specificity: 96 percent (95% CI 94 to 98 percent)

Wang M, He X, Chang Y, Sun G, Thabane L. "A sensitivity and specificity comparison of fine needle aspiration cytology and core needle biopsy in evaluation of suspicious breast lesions: A systematic review and meta-analysis." Breast. 2017;31:157. Epub 2016 Nov 17. 

In [48]:
from sklearn.metrics import confusion_matrix

# Explore this code to see how we can describe the
# sensitivity and specificity of our 'neuron'
y_validate_pred = neuron.predict(X_valid_std)
confusion_matrix(y_validate, y_validate_pred)

array([[ 66,   1],
       [ 12, 205]])

In [50]:
tn, fp, fn, tp = confusion_matrix(y_validate, y_validate_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity)

Sensitivity :  0.9447004608294931
Specificity :  0.9850746268656716
