In [149]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

In [150]:
#load the data and save it to df
breast_ca = load_breast_cancer()

In [151]:
#dataset summary stats & characteristics 
print(breast_ca.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [152]:
#feature names
print('feature_names: ' + str(breast_ca.feature_names))

feature_names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [153]:
#target names
print('target_names: ' + str(breast_ca.target_names))

target_names: ['malignant' 'benign']


In [154]:
#load data in a df to load coef & and intercept
df = pd.DataFrame(breast_ca.data, columns=breast_ca.feature_names)
df['target'] = breast_ca.target

#separate X and y 
X = df[['worst concavity', 'worst area']]
print(X.head(5))
y = df[['target']]
print(y.head(5))

   worst concavity  worst area
0           0.7119      2019.0
1           0.2416      1956.0
2           0.4504      1709.0
3           0.6869       567.7
4           0.4000      1575.0
   target
0       0
1       0
2       0
3       0
4       0


In [156]:
#split data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, test_size=0.30, random_state=30)

In [155]:
#transform X using scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#fit the scaler to the data
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [157]:
#get the cross validation score to see how well the model is performing with 4 folds
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

# Reshape y and y_train to (n_samples --change the shape of the array to 1D array for model)
y = np.ravel(y)
y_train = np.ravel(y_train)


#calculate the cross-validated accuracy score of 4 folds
cv_scores = cross_val_score(log_reg, X, y, cv=4)

#calculate the mean accuracy across all folds
mean_cv_score = np.mean(cv_scores)

#print the scores
print('Cross-validated accuracy scores: ' + str(cv_scores))
print('Mean cross-validated accuracy score: ' + str(mean_cv_score))

Cross-validated accuracy scores: [0.93006993 0.93661972 0.95070423 0.93661972]
Mean cross-validated accuracy score: 0.9385033980104402


In [158]:
#fit the log_reg model to training data
log_reg.fit(X_train, y_train)

LogisticRegression()

In [159]:
#test the intercept and coefficients
print('Intercept: ' + str(log_reg.intercept_))
print('Coefficients: ' + str(log_reg.coef_))

Intercept: [0.33808955]
Coefficients: [[-1.48290324 -4.28202054]]


## Both coefficients are negative, intercept is positive: 
### In this instance, the negative coefficient means that as the absolute value of the feature increases, the likelihood of the patient having malignant breast cancer (target value= 1) increases. The 'worst area' has the higher absolute value between the coefficients; meaning feature further impacts the increased likelihood of breast cancer.

In [160]:
#get prediction
y_pred = log_reg.predict(X_test)
print('Predicted results: ')
print(y_pred)

Predicted results: 
[0 0 1 0 1 0 1 1 1 1 1 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 1 1
 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 1 0 1
 1 1 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1
 0 0 1 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0]


In [161]:
#get real results
print('Real results: ')
print(y_test)

Real results: 
     target
197       0
351       0
338       1
47        0
474       1
..      ...
433       0
217       1
401       1
334       1
212       0

[171 rows x 1 columns]


In [162]:
#calculate confusion matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))
print('57 TN, 5 FP, 3 FN, 106 TP')

Confusion Matrix: 
[[ 57   5]
 [  3 106]]
57 TN, 5 FP, 3 FN, 106 TP


In [163]:
#get the accuracy score and F1 score stats on confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print('Accuracy Score: ' + str(accuracy_score(y_test, y_pred)))
print('F1 Score: ' + str(f1_score(y_test, y_pred)))

Accuracy Score: 0.9532163742690059
F1 Score: 0.9636363636363636
