In [None]:
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [None]:
# The CSV is read without a header row, so column names are supplied here.
# This is the assumed attribute order, with the label in the last column.
columns = ['menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'irradiat', 'target']
df = pd.read_csv('breast cancer.csv', header=None, names=columns)
# Shift the label down by one (presumably stored as 1/2 in the file) so the
# classes become 0 = no-recurrence-events and 1 = recurrence-events.
df['target'] = df['target'] - 1

In [None]:
# Summary statistics for every numeric column, used to check the value ranges.
df.describe()

As the summary shows, the values are normalized to the range from 0 to 1.

## Feature visualization
Let's plot the distributions of the features, conditioned on the target variable.

In [None]:
# Per-feature histograms (with KDE), split by class, on a shared 3x3 grid.
# The label column itself is skipped: it is only used as the hue.
feature_cols = [col for col in df.columns if col != 'target']
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(20, 20))
axs = axs.flatten()
for ax, col in zip(axs, feature_cols):
    sns.histplot(data=df, x=col, hue='target', kde=True, palette='Set1', ax=ax)
    ax.set_title(col)
# Hide grid cells that have no feature to show instead of leaving empty axes.
for ax in axs[len(feature_cols):]:
    ax.set_visible(False)
plt.show()

I was unable to determine the names of the features with certainty, but the following assignment seems plausible. These are the descriptions of the features I found:
- 0 - menopause - 12 months after a woman’s final period.
- 1 - tumor-size - the size of the cancer tumor at the time of diagnosis.
- 2 - inv-nodes - the number of lymph nodes in the armpit that contain visible spread of breast cancer.
- 3 - node-caps - although the tumor seems to be contained from the outside, the cancer may pose a risk of metastasis to the lymph nodes.
- 4 - deg-malig - degree of malignancy: the grade of the cancer as seen under a microscope.
- 5 - breast - which side (breast) the cancer occurs in.
- 6 - irradiat - irradiation: a treatment that destroys cancer cells.


In [None]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

## Distribution of a target

In [None]:
# The label is binary, so draw one bar per class instead of letting
# histplot pick continuous bins.
ax = sns.histplot(y, discrete=True)
ax.set_xticks([0, 1])
ax.set_title('Target distribution')
plt.show()

In [None]:
# Share of each class among all instances.
y.value_counts().div(len(y))

There are more instances of the class *no-recurrence-events* (0) than of *recurrence-events* (1).\
In fact, roughly 70% of the instances are 0 and 30% are 1, so a baseline that always predicts the majority class reaches about 70% accuracy.

In [None]:
# Hold out 30% of the data for testing; the fixed seed keeps the split
# (and the positional test indices used later) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Probability of the positive class (column 1) drives the ROC curve and AUC;
# accuracy and F1 use the hard predictions.
y_pred_prob = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob)

for label, score in (("Accuracy", accuracy), ("F1 Score", f1), ("AUC", auc)):
    print(f"{label}: {score:.4f}")

Unfortunately, the results aren't very satisfying, as we improved only slightly on the baseline accuracy.

In [None]:
# ROC curve of the classifier, with the chance diagonal as a reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.4f})')
ax.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Receiver Operating Characteristic (ROC) Curve', fontsize=14)
ax.legend(loc='lower right')
plt.show()

In [None]:
# Fitted bias term of the logistic regression (added to the weighted feature sum).
clf.intercept_

As you can see the intercept is below zero, so the model is biased towards predicting *no-recurrence-events* (0), which makes sense as it is the **majority class**.

## Weight visualization

Let's plot the model's coefficients to see which features are the most influential in the results.

In [None]:
# Learned weights of the model, in the column order of X_train.
clf.coef_

In [None]:
# Bar chart of the learned weights: positive bars push the prediction towards
# class 1 (recurrence-events), negative bars towards class 0.
fig, ax = plt.subplots()
ax.bar(x=X_train.columns, height=clf.coef_[0])
# Zero line makes the sign of each weight easy to read.
ax.axhline(0, color='k', linewidth=0.8)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
ax.set_title('Logistic regression weights')
plt.show()

In [None]:
# LIME explainer fitted on the training features. The fixed random_state makes
# the sampled perturbations (and therefore the explanations) reproducible.
# `lime` is imported in the notebook's top import cell.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    # Plain list of names rather than a pandas Index.
    feature_names=X_train.columns.tolist(),
    class_names=['no-recurrence-events', 'recurrence-events'],
    discretize_continuous=True,
    random_state=42
)

## Explanations of the decisions
### First instance

In [None]:
# Explain the prediction for the first test patient with the five features
# that LIME ranks as most important.
instance = X_test.iloc[0]
explanation = explainer.explain_instance(
    data_row=instance.values,
    predict_fn=clf.predict_proba,
    num_features=5,
)
explanation.show_in_notebook(show_table=True, show_all=False)

As you can see, this patient has the smallest possible value of node-caps, which is an influential feature in the model. The degree of malignancy is quite high, which pushes the model towards predicting *recurrence-events* (1). However, the rest of the features are also quite low, which might be the reason why the model leans towards predicting *no-recurrence-events* (0).

In [None]:
# Raw linear score (logit) of the current instance: weighted feature sum plus
# the intercept. A negative score corresponds to predicting class 0.
weighted_terms = [
    weight * instance[col]
    for weight, col in zip(clf.coef_[0], X_train.columns)
]
val = sum(weighted_terms) + clf.intercept_
val

In [None]:
# Position of the node-caps coefficient. It is looked up by name because the
# previous hard-coded index 4 pointed at the degree-of-malignancy column
# (node-caps is at position 3).
NODE_CAPS = X_train.columns.get_loc('node-caps')
# Ratio of the current score to the node-caps weight. Subtracting this ratio
# from the patient's node-caps value brings the linear score to zero,
# i.e. onto the decision boundary.
val / clf.coef_[0][NODE_CAPS]

In theory, shifting the value of node-caps by around 0.65 (the magnitude of the ratio above) should move the score across the decision boundary, so we should observe a change of class. Let's check what happens after sampling such an instance.

In [None]:
# Counterfactual for the first test patient: same features, but node-caps is
# raised to 0.7. The row is rebuilt from X_test so the cell does not depend on
# whatever `instance` currently holds from earlier cells.
instance = X_test.iloc[0].copy()
instance['node-caps'] = 0.7
explanation = explainer.explain_instance(
    instance.values,
    clf.predict_proba,
    num_features=5
)

explanation.show_in_notebook(show_table=True, show_all=False)

### Second instance (most confident no-recurrence prediction)

In [None]:
# Explain the test patient at position 67, showing all seven features.
instance = X_test.iloc[67]
explanation = explainer.explain_instance(
    data_row=instance.values,
    predict_fn=clf.predict_proba,
    num_features=7,
)
explanation.show_in_notebook(show_table=True, show_all=False)

I took the example for which the model predicted the highest probability of 0 (no-recurrence-events). As you can see, basically all features with positive coefficients have low values, while breast (the only one with a negative coefficient) has a high value. Overall, after adding the model's bias (intercept), we obtain a very low score, which is equivalent to the model predicting 0.

In [None]:
# Explain the test patient at position 42, showing all seven features.
instance = X_test.iloc[42]
explanation = explainer.explain_instance(
    data_row=instance.values,
    predict_fn=clf.predict_proba,
    num_features=7,
)
explanation.show_in_notebook(show_table=True, show_all=False)

This is the example for which the model predicted the highest probability of 1 (recurrence-events). As you can see, the values of the most important features are quite high. Node-caps, deg-malig and menopause all take the highest possible value.

In [None]:
# Counterfactual for the test patient at position 42: node-caps is set to 0
# to see whether changing this one feature flips the prediction. The row is
# rebuilt from X_test so the cell does not depend on whatever `instance`
# currently holds from earlier cells.
instance = X_test.iloc[42].copy()
instance['node-caps'] = 0
explanation = explainer.explain_instance(
    instance.values,
    clf.predict_proba,
    num_features=7
)

explanation.show_in_notebook(show_table=True, show_all=False)

For this patient we cannot change the decision of the model by just modifying one feature. As all other features are "working" in favor of predicting 1, we would need to change multiple features to change the prediction.