In [12]:
%%writefile task1_cancer_classification.py
import argparse
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix
)

# -----------------------------
# Utility: Metrics Printer
# -----------------------------
def evaluate(model, X_train, y_train, X_test, y_test, name):
    """Print a metric report for a fitted classifier and return its error rates.

    Predictions are made on both splits. Error is defined as ``1 - accuracy``.
    Precision, recall, F1 and the confusion matrix are computed on the test
    split only.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features passed to ``model.predict``.
        y_train: True training labels.
        X_test: Test features passed to ``model.predict``.
        y_test: True test labels.
        name: Title shown in the report header.

    Returns:
        Tuple ``(train_error, test_error)``.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_error = 1 - accuracy_score(y_train, train_predictions)
    # Test accuracy feeds both the "Accuracy" row and the test error.
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_error = 1 - test_accuracy

    # Labels carry their own padding so the colons line up in the output.
    report_rows = [
        ("Train Error", train_error),
        ("Test Error ", test_error),
        ("Accuracy  ", test_accuracy),
        ("Precision ", precision_score(y_test, test_predictions)),
        ("Recall    ", recall_score(y_test, test_predictions)),
        ("F1-score  ", f1_score(y_test, test_predictions)),
    ]
    matrix = confusion_matrix(y_test, test_predictions)

    print(f"\n===== {name} =====")
    for label, value in report_rows:
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(matrix)

    return train_error, test_error


# -----------------------------
# Main
# -----------------------------
def main():
    """Command-line entry point: train and evaluate tumor classifiers.

    Command-line arguments:
        --data: Path to a headerless CSV. Column 1 must hold the label
            ("M" or "B"); column 0 is dropped as an ID and the remaining
            columns are used as features.
        --test_size: Fraction of rows held out for testing (default 0.2).
        --model: Which model(s) to run: "logistic", "tree" or "both".

    The data is split with stratification on the label, a StandardScaler is
    fit on the training split only, and each selected model is fit and
    reported via ``evaluate``. The per-model verdicts derived from the
    measured train/test errors are printed under the conclusion header,
    followed by the general discussion text.

    Raises:
        ValueError: If the label column contains values other than "M"/"B"
            (for example when the CSV has a header row).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="Path to data.csv")
    parser.add_argument("--test_size", type=float, default=0.2)
    parser.add_argument("--model", choices=["logistic", "tree", "both"], default="both")
    args = parser.parse_args()

    # Dataset has no headers
    df = pd.read_csv(args.data, header=None)

    # Column 1 is label: M or B
    y = df[1].map({"M": 1, "B": 0})

    # Series.map turns unmapped labels into NaN; fail early with a clear
    # message instead of an obscure error from the stratified split.
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, 1].astype(str).unique())
        raise ValueError(
            f"Label column (index 1) must contain only 'M' or 'B'; "
            f"found unexpected value(s): {unexpected}. "
            f"Check that the file has no header row."
        )

    X = df.drop(columns=[0, 1])  # drop ID and label

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=42, stratify=y
    )

    # Scaling (important for Logistic Regression); fit on train only to
    # avoid leaking test statistics into the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Verdicts derived from the measured errors of the models actually run.
    conclusions = []

    # ---------------- Logistic Regression ----------------
    if args.model in ["logistic", "both"]:
        logreg = LogisticRegression(max_iter=500)
        logreg.fit(X_train_scaled, y_train)

        tr_err, te_err = evaluate(
            logreg,
            X_train_scaled, y_train,
            X_test_scaled, y_test,
            "Logistic Regression"
        )

        if tr_err < te_err:
            conclusions.append("Logistic Regression shows slight overfitting.")
        else:
            conclusions.append("Logistic Regression generalizes well with similar train and test error.")

    # ---------------- Decision Tree ----------------
    if args.model in ["tree", "both"]:
        # The tree is trained on unscaled features.
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train, y_train)

        tr_err, te_err = evaluate(
            tree,
            X_train, y_train,
            X_test, y_test,
            "Decision Tree"
        )

        if tr_err < te_err:
            conclusions.append("Decision Tree shows strong overfitting (very low train error, higher test error).")
        elif tr_err > te_err:
            conclusions.append("Decision Tree may be underfitting.")
        else:
            conclusions.append("Decision Tree has moderate generalization.")

    # ---------------- Final Conclusion ----------------
    print("\n===== Conclusion =====")

    # Previously these verdicts were collected but never shown.
    for verdict in conclusions:
        print(f"- {verdict}")
    if conclusions:
        print()

    print(
        "The Logistic Regression model benefits from feature scaling and typically shows "
        "similar training and test error, indicating good generalization and low variance. "
        "It is a linear model, so it may slightly underfit complex patterns but remains stable."
    )

    print(
        "\nDecision Trees can fit training data very closely, often achieving near-zero training error. "
        "If test error is much higher, this indicates overfitting due to high model variance. "
        "This happens because trees memorize noise and small patterns in the training data."
    )

    print(
        "\nRelevant ML issues in this problem:\n"
        "1. Feature scaling: Logistic Regression relies on distance-based optimization, so unscaled features can "
        "cause slow convergence and poor coefficients.\n"
        "2. Feature correlation: Many tumor measurements are highly correlated, which can affect linear models.\n"
        "3. Data leakage risk: Scaling must be fit only on training data, not entire dataset.\n"
        "4. Class imbalance: Benign cases are more than malignant, so accuracy alone can be misleading; "
        "precision and recall are important."
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Overwriting task1_cancer_classification.py


In [13]:
!python task1_cancer_classification.py --data wdbc.data --test_size 0.2 --model both



===== Logistic Regression =====
Train Error: 0.0132
Test Error : 0.0351
Accuracy  : 0.9649
Precision : 0.9750
Recall    : 0.9286
F1-score  : 0.9512
Confusion Matrix:
[[71  1]
 [ 3 39]]

===== Decision Tree =====
Train Error: 0.0000
Test Error : 0.0702
Accuracy  : 0.9298
Precision : 0.9048
Recall    : 0.9048
F1-score  : 0.9048
Confusion Matrix:
[[68  4]
 [ 4 38]]

===== Conclusion =====
The Logistic Regression model benefits from feature scaling and typically shows similar training and test error, indicating good generalization and low variance. It is a linear model, so it may slightly underfit complex patterns but remains stable.

Decision Trees can fit training data very closely, often achieving near-zero training error. If test error is much higher, this indicates overfitting due to high model variance. This happens because trees memorize noise and small patterns in the training data.

Relevant ML issues in this problem:
1. Feature scaling: Logistic Regression relies on distance-bas