In [23]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier
from utils import standardize

Why logistic regression?
    - Logistic regression is used for binary classification. This suits our project as we are classifying whether a tumour is malignant (M) or benign (B).

How will we evaluate the model?
   - Bias (how accurate the model is)
   - Variance (how spread out the predictions are from the mean)

(This shows us if our model is overfitting / underfitting)

Since it's a classification algorithm, we evaluate it with:
- Accuracy
- Precision
- Recall

Confusion matrix to see the distribution of TP, FP, TN, FN

IMPORTANT:
- Cost of FN is much higher than TP, FP, TN
- Develop dummy model

Chi-squared tests to see which variables are useless?

In [24]:
df = pd.read_csv('./Data Exploration/wdbc.csv')
labels = df['B/M']
features = df[['Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness', 'Compactness', 'Concavity', 'ConcavePoints', 'Symmetry', 'FractalDimension']]

In [25]:
# Standardize features
features = standardize(features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns if columns else df.select_dtypes(['float64', 'int64']).columns.values] = ColumnTransformer([


Distribution of B / M, we will use this result when comparing our model to a dummy model later

In [26]:
labels.value_counts()

B    357
M    212
Name: B/M, dtype: int64

In [27]:
# 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [28]:
# Fit model to training data
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)
# Accuracy
accuracy = model.score(x_test, y_test)

In [29]:
y_pred

array(['B', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'M',
       'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'M', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'M',
       'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'M', 'B', 'B', 'M',
       'M', 'M', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B',
       'M', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'M', 'M', 'M', 'B', 'M',
       'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M', 'B',
       'B', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B'], dtype=object)

We're getting an accuracy of approx. 65%. Let's investigate the types of errors that we're getting (TP, FP, TN, FN) with a confusion matrix:

In [30]:
c = confusion_matrix(y_test, y_pred)

print(f'True negatives: {c[0][0]}')
print(f'False negatives: {c[1][0]}')
print(f'True positives: {c[1][1]}')
print(f'False positives: {c[0][1]}')

True negatives: 65
False negatives: 4
True positives: 39
False positives: 6


In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           B       0.94      0.92      0.93        71
           M       0.87      0.91      0.89        43

    accuracy                           0.91       114
   macro avg       0.90      0.91      0.91       114
weighted avg       0.91      0.91      0.91       114



In [32]:
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall_score(y_test, y_pred, pos_label="M")}')
print(f'Precision: {precision_score(y_test, y_pred, pos_label="M", zero_division=0)}')

Accuracy: 0.9122807017543859
Recall: 0.9069767441860465
Precision: 0.8666666666666667


Dummy classifier:
Our dummy model will classify data with the label that occurs most often. Does our model beat the dummy model?

In [33]:
dm = DummyClassifier()
dm.fit(x_train, y_train)
dummy_score = dm.score(x_test, y_test)
print(f'{"Our model beats the dummy model" if accuracy > dummy_score else "Our model does not beat the dummy model."}')

Our model beats the dummy model
