In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [2]:
cancer_data = load_breast_cancer()

In [3]:
print(cancer_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
features = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
target = pd.DataFrame(cancer_data.target, columns=['Target'])

In [5]:
features.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
target.head()

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=10)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(426, 30)
(143, 30)
(426, 1)
(143, 1)


In [10]:
my_logreg_model = LogisticRegression().fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
my_logreg_model.coef_, my_logreg_model.intercept_

(array([[ 3.64917675e-01,  1.88200047e-01,  7.58081479e-01,
         -3.45964697e-02, -6.35291977e-03, -4.11896994e-02,
         -6.20498599e-02, -2.45640785e-02, -6.53357062e-03,
         -1.36584789e-03,  9.82327597e-03,  6.38988523e-02,
         -7.34134737e-02, -1.05837220e-01, -4.92912360e-04,
         -8.59445188e-03, -1.32647149e-02, -3.34881197e-03,
         -7.12009121e-04, -7.12502266e-04,  3.71547029e-01,
         -3.59831806e-01, -3.77372978e-01, -1.22044877e-02,
         -1.30563749e-02, -1.36492405e-01, -1.79079510e-01,
         -4.99528990e-02, -2.59233254e-02, -1.24603712e-02]]),
 array([0.05867029]))

In [11]:
my_logreg_preds_train = my_logreg_model.predict(X_train)
my_logreg_preds_test = my_logreg_model.predict(X_test)

print('Accuracy on Train set : ',accuracy_score(y_train, my_logreg_preds_train))
print('Accuracy on Test set : ',accuracy_score(y_test, my_logreg_preds_test))
      

Accuracy on Train set :  0.9413145539906104
Accuracy on Test set :  0.9300699300699301


In [16]:
my_logreg_model.predict_proba(X_train)

array([[5.83270126e-04, 9.99416730e-01],
       [4.01634037e-04, 9.99598366e-01],
       [1.00000000e+00, 5.88798221e-12],
       [6.20818898e-05, 9.99937918e-01],
       [6.11410463e-03, 9.93885895e-01],
       [5.29365242e-02, 9.47063476e-01],
       [3.03782419e-05, 9.99969622e-01],
       [9.99926015e-01, 7.39852772e-05],
       [1.05139396e-04, 9.99894861e-01],
       [1.00000000e+00, 7.87393783e-11],
       [1.60039899e-01, 8.39960101e-01],
       [1.10106902e-04, 9.99889893e-01],
       [9.09570468e-03, 9.90904295e-01],
       [9.99999937e-01, 6.25551603e-08],
       [1.53342566e-03, 9.98466574e-01],
       [2.69011253e-03, 9.97309887e-01],
       [9.99941220e-01, 5.87803950e-05],
       [6.59181722e-04, 9.99340818e-01],
       [7.36289539e-05, 9.99926371e-01],
       [7.06319739e-04, 9.99293680e-01],
       [1.59511216e-03, 9.98404888e-01],
       [6.41670443e-03, 9.93583296e-01],
       [9.99909184e-01, 9.08155164e-05],
       [1.00000000e+00, 2.31624329e-22],
       [2.508385

In [12]:
print('Precision on Train set : ',precision_score(y_train, my_logreg_preds_train))
print('Precision on Test set : ',precision_score(y_test, my_logreg_preds_test))

print('\n')

print('Recall on Train set : ',recall_score(y_train, my_logreg_preds_train))
print('Recall on Test set : ',recall_score(y_test, my_logreg_preds_test))

print('\n')

print('F1-Score on Train set : ',f1_score(y_train, my_logreg_preds_train))
print('F1-Score on Test set : ',f1_score(y_test, my_logreg_preds_test))

Precision on Train set :  0.9446494464944649
Precision on Test set :  0.9550561797752809


Recall on Train set :  0.9624060150375939
Recall on Test set :  0.9340659340659341


F1-Score on Train set :  0.9534450651769086
F1-Score on Test set :  0.9444444444444444


In [13]:
print('Classification Report on Train set : ')
print(classification_report(y_train, my_logreg_preds_train))

Classification Report on Train set : 
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       160
           1       0.94      0.96      0.95       266

    accuracy                           0.94       426
   macro avg       0.94      0.93      0.94       426
weighted avg       0.94      0.94      0.94       426



In [14]:
print('Classification Report on Test set : ')
print(classification_report(y_test, my_logreg_preds_test))

Classification Report on Test set : 
              precision    recall  f1-score   support

           0       0.89      0.92      0.91        52
           1       0.96      0.93      0.94        91

    accuracy                           0.93       143
   macro avg       0.92      0.93      0.93       143
weighted avg       0.93      0.93      0.93       143

