In [79]:
from sklearn.datasets import load_breast_cancer

# Dict-like container; the cells below read its "DESCR", "target" and "data" entries.
the_data = load_breast_cancer()

In [80]:
# Show the description text that ships with the dataset.
print(the_data['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [81]:
from sklearn.model_selection import train_test_split

# Flip the 0/1 encoding of the original target so that, after relabeling,
# 0 means "benign" and 1 (the positive class) means "malignant".
relabeled_target = 1 - the_data["target"]

X, y = the_data["data"], relabeled_target
# Fixed random_state keeps the train/test partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

About 37% of the samples in both the training and the test split belong to the `malignant` class:

In [82]:
# Labels are 0/1, so the mean is the share of class 1 (malignant) in each split.
y_train.mean(), y_test.mean()

(0.3732394366197183, 0.3706293706293706)

### Dummy classifier

In [83]:
from sklearn.dummy import DummyClassifier

# Baseline model: with strategy="most_frequent" it always predicts the
# majority class of y_train, regardless of the features.
dc_mf = DummyClassifier(strategy="most_frequent")
dc_mf.fit(X_train, y_train)

### Confusion matrix

In [84]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
import pandas as pd

def confusion_matrix2df(y_true, y_pred):
    """Return the binary confusion matrix as a labelled DataFrame.

    Class 1 is treated as positive and class 0 as negative. Rows are the
    true class and columns the predicted class, positives first:

                Pred +  Pred -
        True +    TP      FN
        True -    FP      TN
    """
    # With labels=[0, 1] the flattened matrix is ordered tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    cells = [[tp, fn], [fp, tn]]
    return pd.DataFrame(
        cells,
        index=["True +", "True -"],
        columns=['Pred +', 'Pred -'],
    )

In [85]:
# Evaluate the majority-class baseline on the held-out split.
y_pred = dc_mf.predict(X_test)
confusion_matrix2df(y_test, y_pred)

Unnamed: 0,Pred +,Pred -
True +,0,53
True -,0,90


In [86]:
def print_metrics(y_true, y_pred, y_score=None):
    """Print accuracy, precision, recall, F1 and ROC AUC for binary labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth 0/1 labels (1 is the positive class).
    y_pred : array-like of shape (n_samples,)
        Hard 0/1 predictions; used for accuracy, precision, recall and F1.
    y_score : array-like of shape (n_samples,), optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
        When given, ROC AUC is computed from these scores. When omitted,
        ROC AUC is computed from ``y_pred`` as before; with hard labels the
        ROC curve has a single operating point, so the value equals the
        balanced accuracy rather than a threshold-independent AUC.
    """
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("f1_score:", f1_score(y_true, y_pred))
    # Prefer continuous scores for the ranking metric when the caller has them.
    auc_input = y_pred if y_score is None else y_score
    print("roc_auc_score:", roc_auc_score(y_true, auc_input))

In [87]:
# Use y_test directly: y_true is not assigned until the Random Forest cell,
# so referring to it here fails with NameError on a fresh kernel.
print_metrics(y_test, y_pred)

Accuracy: 0.6293706293706294
Precision: 0.0
Recall: 0.0
f1_score: 0.0
roc_auc_score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Seed the model so the confusion matrix and metrics are reproducible on
# Restart & Run All, like the seeded train/test split and LinearSVC.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_true = y_test
y_pred = rfc.predict(X_test)
confusion_matrix2df(y_true, y_pred)

Unnamed: 0,Pred +,Pred -
True +,52,1
True -,4,86


In [89]:
print_metrics(y_true, y_pred)

Accuracy: 0.965034965034965
Precision: 0.9285714285714286
Recall: 0.9811320754716981
f1_score: 0.9541284403669724
roc_auc_score: 0.9683438155136268


### SVM

In [90]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

scaled_linsvc = LinearSVC(C=0.01, random_state=42)
scaled_linsvc.fit(X_train_scaled, y_train)

y_true = y_test
y_pred = scaled_linsvc.predict(X_test_scaled)
confusion_matrix2df(y_true, y_pred)

Unnamed: 0,Pred +,Pred -
True +,50,3
True -,1,89


In [91]:
print_metrics(y_true, y_pred)

Accuracy: 0.972027972027972
Precision: 0.9803921568627451
Recall: 0.9433962264150944
f1_score: 0.9615384615384616
roc_auc_score: 0.9661425576519916


### Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression

# The features are passed unscaled here (unlike the SVM cell); the large
# max_iter is presumably there to let the solver converge on them.
lrc = LogisticRegression(max_iter=10000)
lrc.fit(X_train, y_train)

y_true, y_pred = y_test, lrc.predict(X_test)
confusion_matrix2df(y_true, y_pred)

Unnamed: 0,Pred +,Pred -
True +,52,1
True -,6,84


In [93]:
print_metrics(y_true, y_pred)

Accuracy: 0.951048951048951
Precision: 0.896551724137931
Recall: 0.9811320754716981
f1_score: 0.9369369369369369
roc_auc_score: 0.9572327044025157


In [94]:
# One row per test sample; column 1 is the probability of class 1
# (malignant), which the thresholding cell below uses.
lrc.predict_proba(X_test)

array([[4.77319937e-03, 9.95226801e-01],
       [9.67991011e-01, 3.20089888e-02],
       [9.98480703e-01, 1.51929660e-03],
       [8.40969821e-01, 1.59030179e-01],
       [9.99966717e-01, 3.32825821e-05],
       [9.99301768e-01, 6.98231789e-04],
       [9.96966046e-01, 3.03395422e-03],
       [9.99441118e-01, 5.58881570e-04],
       [9.99898476e-01, 1.01523690e-04],
       [9.99984471e-01, 1.55291036e-05],
       [2.60605021e-01, 7.39394979e-01],
       [7.94048937e-01, 2.05951063e-01],
       [9.99913207e-01, 8.67934428e-05],
       [2.47335496e-01, 7.52664504e-01],
       [6.40254026e-01, 3.59745974e-01],
       [8.07639707e-03, 9.91923603e-01],
       [9.99544412e-01, 4.55587615e-04],
       [1.56533192e-07, 9.99999843e-01],
       [3.75852099e-03, 9.96241479e-01],
       [3.38148398e-10, 1.00000000e+00],
       [6.07931873e-05, 9.99939207e-01],
       [3.27231757e-02, 9.67276824e-01],
       [9.97895308e-01, 2.10469163e-03],
       [9.95815512e-01, 4.18448756e-03],
       [1.379434

### Using threshold for predictions

In [95]:
import numpy as np

# The predicted probabilities do not depend on the threshold, so compute
# them once instead of re-running predict_proba on every iteration.
probs = lrc.predict_proba(X_test)

for threshold in np.arange(0.05, 1, step=0.05):
    # Predict "malignant" (1.0) when P(class 1) exceeds the threshold.
    y_pred = (probs[:, 1] > threshold).astype(float)
    print(threshold)
    print_metrics(y_true, y_pred)

0.05
Accuracy: 0.8601398601398601
Precision: 0.726027397260274
Recall: 1.0
f1_score: 0.8412698412698413
roc_auc_score: 0.8888888888888888
0.1
Accuracy: 0.8811188811188811
Precision: 0.7571428571428571
Recall: 1.0
f1_score: 0.8617886178861789
roc_auc_score: 0.9055555555555556
0.15000000000000002
Accuracy: 0.8951048951048951
Precision: 0.7794117647058824
Recall: 1.0
f1_score: 0.8760330578512397
roc_auc_score: 0.9166666666666667
0.2
Accuracy: 0.9090909090909091
Precision: 0.803030303030303
Recall: 1.0
f1_score: 0.8907563025210083
roc_auc_score: 0.9277777777777778
0.25
Accuracy: 0.9230769230769231
Precision: 0.8387096774193549
Recall: 0.9811320754716981
f1_score: 0.9043478260869565
roc_auc_score: 0.9350104821802934
0.3
Accuracy: 0.9300699300699301
Precision: 0.8524590163934426
Recall: 0.9811320754716981
f1_score: 0.912280701754386
roc_auc_score: 0.940566037735849
0.35000000000000003
Accuracy: 0.9300699300699301
Precision: 0.8524590163934426
Recall: 0.9811320754716981
f1_score: 0.9122807017

In [102]:
# Small symmetric 3x3 integer matrix, written one row per line.
A = np.array(
    [
        [1, 1, 1],
        [1, 2, 2],
        [1, 2, 3],
    ]
)
A

array([[1, 1, 1],
       [1, 2, 2],
       [1, 2, 3]])

In [103]:
# Matrix inverse of A (returned as a float array).
np.linalg.inv(A)

array([[ 2., -1.,  0.],
       [-1.,  2., -1.],
       [ 0., -1.,  1.]])