In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline: logistic regression on the raw (unscaled) features.
#
# Read the CSV (point the path at your own copy), drop the two columns the
# analysis does not use, and encode the label: M (malignant) -> 1,
# B (benign) -> 0.
df = (
    pd.read_csv("Breast_Cancer_Data.csv")
    .drop(columns=["Unnamed: 32", "id"])
    .assign(diagnosis=lambda frame: frame["diagnosis"].map({'M': 1, 'B': 0}))
)

# Feature matrix = every column except the label; target = the label column.
X = df.loc[:, df.columns != "diagnosis"]
y = df["diagnosis"]

# Report how many missing cells remain. A non-zero count in y would also
# reveal diagnosis values that the M/B mapping did not recognise.
print("Missing in X:", X.isna().sum().sum())
print("Missing in y:", y.isna().sum())

# Hold out 20% of the rows; stratify so both splits keep the class ratio,
# and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

# Fit on the unscaled training features. The solver may emit a
# ConvergenceWarning here because the features are on very different scales.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Missing in X: 0
Missing in y: 0
Accuracy: 0.9385964912280702
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95        72
           1       0.97      0.86      0.91        42

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression on standardized features.
#
# The split is done on the RAW features first, and the scaler is fit on the
# training rows only. Fitting the scaler on the full matrix before splitting
# would let the test rows influence the mean/std used for scaling (data
# leakage), which makes the reported test metrics optimistic.
#
# After this cell:
#   X_train, X_test               -> raw (unscaled) splits
#   X_train_scaled, X_test_scaled -> the same splits, standardized
#   scaler                        -> StandardScaler fit on X_train only
#   model                         -> LogisticRegression fit on X_train_scaled

# Same test_size / random_state / stratify as the baseline, so the rows in
# each split are the same as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn mean/std from the training split, then apply those same statistics
# to the test split (transform only - never refit on test data).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kept for backward compatibility with code that reads X_scaled: the full
# feature matrix standardized with the training-split statistics.
X_scaled = scaler.transform(X)

# Train logistic regression on the standardized training features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predict and evaluate on the standardized held-out rows.
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9649122807017544
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        72
           1       0.97      0.93      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



Tune threshold and explain sigmoid function.
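The sigmoid function (also called the logistic function), $\sigma(z) = \frac{1}{1 + e^{-z}}$, maps any real-valued score $z$ to a probability between 0 and 1. Logistic regression applies it to the linear score $z = w^\top x + b$ and predicts class 1 when the resulting probability exceeds the decision threshold (0.5 by default). Lowering the threshold labels more samples as class 1, which typically raises recall for that class at the possible cost of precision.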

In [4]:
from sklearn.preprocessing import StandardScaler

# Estimate per-feature mean/std from the training split only, then apply
# them to that same split.
# NOTE(review): this assumes X_train holds unscaled features; if it was
# already standardized upstream, this is a second scaling pass - confirm.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)


In [5]:
# Apply the statistics already learned by `scaler` to the test split.
# Do not call .fit_transform here: it would re-estimate mean/std from the
# test rows, so train and test would no longer share the same scaling.
X_test_scaled = scaler.transform(X_test)


In [6]:
from sklearn.metrics import classification_report

# Evaluate the classifier with a custom decision threshold instead of the
# default 0.5.
#
# The model is (re)fit here on X_train_scaled so that it is guaranteed to
# have been trained in the same feature space as X_test_scaled (both come
# from the same `scaler`). Relying on a `model` left over from an earlier
# cell risks scoring features that were transformed differently from the
# ones it was trained on.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of class 1 (malignant).
y_probs = model.predict_proba(X_test_scaled)[:, 1]

# Set a threshold (e.g., 0.4): anything at or above it is labelled class 1.
threshold = 0.4
y_pred_custom = (y_probs >= threshold).astype(int)

# Print classification report
print(f"Classification Report (Threshold = {threshold}):")
print(classification_report(y_test, y_pred_custom))


Classification Report (Threshold = 0.4):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        72
           1       0.98      0.95      0.96        42

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

