In [1]:
from pathlib import Path

import pandas as pd

# Directory holding the WDBC files; change this single constant to run elsewhere.
DATA_DIR = Path('/content')

# Read the wdbc.names file that accompanies the data.
# NOTE: names_content is only loaded here, it is not parsed -- the column
# names used for loading are defined explicitly below.
with open(DATA_DIR / 'wdbc.names', 'r') as f:
    names_content = f.readlines()

# Column layout of the WDBC dataset: an ID, the diagnosis label, then ten
# measurements, each reported three times (mean, standard error, worst).
# Generating the 30 feature names avoids typos in a hand-written list; the
# resulting order is all *_mean columns, then *_se, then *_worst.
MEASUREMENTS = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal_dimension',
]
STAT_SUFFIXES = ['mean', 'se', 'worst']

column_names = ['ID', 'Diagnosis'] + [
    f'{measurement}_{suffix}'
    for suffix in STAT_SUFFIXES
    for measurement in MEASUREMENTS
]
assert len(column_names) == 32, "expected ID + Diagnosis + 30 feature columns"

# Load the wdbc.data dataset using the defined column names
df = pd.read_csv(DATA_DIR / 'wdbc.data', header=None, names=column_names)

# Sanity checks: read_csv does not complain when the file and `names` disagree.
# Extra columns in the file are silently turned into the index, and missing
# columns are filled with NaN, so verify the frame really matches the schema.
assert isinstance(df.index, pd.RangeIndex), (
    "wdbc.data has more columns than column_names; leading columns were used as the index"
)
assert not df.isna().any().any(), (
    "wdbc.data contains missing values or fewer columns than column_names"
)
assert df['Diagnosis'].isin(['M', 'B']).all(), (
    "Diagnosis column contains labels other than 'M' and 'B'"
)

print("Dataset loaded successfully with custom column names.")
print(df.head())

Dataset loaded successfully with custom column names.
         ID Diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Convert 'Diagnosis' column to numerical format (M=1, B=0).
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them, a second execution would map the already-encoded integers to NaN and
# silently destroy the target column.
diagnosis_encoded = df['Diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})
if diagnosis_encoded.isna().any():
    unexpected_labels = df.loc[diagnosis_encoded.isna(), 'Diagnosis'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Diagnosis' (expected 'M' or 'B'): {unexpected_labels}. "
        "Reload the dataset before running this cell again."
    )
df['Diagnosis'] = diagnosis_encoded.astype('int64')

# Define target variable (y) and features (X); the ID carries no predictive signal.
y = df['Diagnosis']
X = df.drop(columns=['ID', 'Diagnosis'])

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified. Passing stratify=y would keep the
# class ratio identical in both partitions, but it would change the recorded
# results, so it is intentionally left as-is here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
assert len(X_train) + len(X_test) == len(X)

# Apply StandardScaler to the feature columns. The scaler is fitted on the
# training partition only, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

print("Target variable encoded, data split, and features scaled successfully.")
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("\nFirst 5 rows of scaled training features:")
print(X_train_scaled.head())
print("\nFirst 5 rows of training target variable:")
print(y_train.head())

Target variable encoded, data split, and features scaled successfully.
Shape of X_train_scaled: (455, 30)
Shape of X_test_scaled: (114, 30)
Shape of y_train: (455,)
Shape of y_test: (114,)

First 5 rows of scaled training features:
     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
68     -1.440753     -0.435319       -1.362085  -1.139118         0.780573   
181     1.974096      1.733026        2.091672   1.851973         1.319843   
63     -1.399982     -1.249622       -1.345209  -1.109785        -1.332645   
248    -0.981797      1.416222       -0.982587  -0.866944         0.059390   
60     -1.117700     -1.010259       -1.125002  -0.965942         1.269511   

     compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
68           0.718921        2.823135            -0.119150       1.092662   
181          3.426275        2.013112             2.665032       2.127004   
63          -0.307355       -0.365558            -0.696502       1.9

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Build the classifier and fit it to the scaled training data in one step
# (fit() returns the fitted estimator).
log_reg_model = LogisticRegression(solver='liblinear', random_state=42).fit(
    X_train_scaled, y_train
)

print("Logistic Regression model trained successfully.")

# Predict on both partitions so training and test error can be compared.
y_train_pred, y_test_pred = (
    log_reg_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Accuracy on each partition; error is its complement.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
train_error, test_error = 1 - train_accuracy, 1 - test_accuracy

# Rows are true classes, columns are predicted classes (0 = benign, 1 = malignant).
conf_matrix = confusion_matrix(y_test, y_test_pred)

print(
    f"\nTraining Error: {train_error:.4f}",
    f"Test Error: {test_error:.4f}",
    sep="\n",
)

print(
    f"\nTest Set Performance:",
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {precision_score(y_test, y_test_pred):.4f}",
    f"Recall: {recall_score(y_test, y_test_pred):.4f}",
    f"F1-Score: {f1_score(y_test, y_test_pred):.4f}",
    sep="\n",
)

print("\nConfusion Matrix (Test Set):", conf_matrix, sep="\n")


Logistic Regression model trained successfully.

Training Error: 0.0132
Test Error: 0.0263

Test Set Performance:
Accuracy: 0.9737
Precision: 0.9762
Recall: 0.9535
F1-Score: 0.9647

Confusion Matrix (Test Set):
[[70  1]
 [ 2 41]]


In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Decision tree with default hyperparameters and a fixed seed, trained on the
# same scaled features as the logistic regression for a like-for-like comparison.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

print("Decision Tree Classifier model trained successfully.")

# Predictions for the training and the held-out partition.
y_train_pred_dt = dt_model.predict(X_train_scaled)
y_test_pred_dt = dt_model.predict(X_test_scaled)

# Training accuracy/error and test accuracy/error.
train_accuracy_dt = accuracy_score(y_train, y_train_pred_dt)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)
train_error_dt = 1 - train_accuracy_dt
test_error_dt = 1 - test_accuracy_dt

# Rows are true classes, columns are predicted classes (0 = benign, 1 = malignant).
conf_matrix_dt = confusion_matrix(y_test, y_test_pred_dt)

print(
    f"\nDecision Tree Training Error: {train_error_dt:.4f}",
    f"Decision Tree Test Error: {test_error_dt:.4f}",
    sep="\n",
)

print(
    f"\nTest Set Performance (Decision Tree):",
    f"Accuracy: {test_accuracy_dt:.4f}",
    f"Precision: {precision_score(y_test, y_test_pred_dt):.4f}",
    f"Recall: {recall_score(y_test, y_test_pred_dt):.4f}",
    f"F1-Score: {f1_score(y_test, y_test_pred_dt):.4f}",
    sep="\n",
)

print("\nConfusion Matrix (Test Set - Decision Tree):", conf_matrix_dt, sep="\n")

Decision Tree Classifier model trained successfully.

Decision Tree Training Error: 0.0000
Decision Tree Test Error: 0.0526

Test Set Performance (Decision Tree):
Accuracy: 0.9474
Precision: 0.9302
Recall: 0.9302
F1-Score: 0.9302

Confusion Matrix (Test Set - Decision Tree):
[[68  3]
 [ 3 40]]



#### Overfitting and Underfitting Analysis:

**Logistic Regression**: The training error (1.32%) is very close to the test error (2.63%). This indicates a good balance between bias and variance. The model generalizes well to unseen data, showing no significant signs of overfitting or underfitting. Its performance on the test set is robust across accuracy, precision, recall, and F1-score, suggesting it captures the underlying patterns without memorizing the training data.

**Decision Tree Classifier**: The training error is 0.00% (perfect accuracy on the training set), while the test error is 5.26%. This significant gap between training and test performance is a clear indication of **overfitting**. The model has learned the training data too well, including its noise, which has reduced its ability to generalize to new, unseen data. While it still performs reasonably well on the test set, it is notably weaker than the Logistic Regression model on both error types: 3 false positives and 3 false negatives, compared with 1 false positive and 2 false negatives for Logistic Regression.

#### Conclusion for Logistic Regression Model:

The Logistic Regression model demonstrated strong and consistent performance with low training (1.32%) and test (2.63%) errors, indicating excellent generalization and no significant overfitting. Its high accuracy (97.37%), precision (97.62%), and recall (95.35%) make it a reliable choice for this medical diagnosis task. Because false negatives (malignant cases predicted as benign) are the most critical error in this setting, recall is the key metric to watch: the model missed 2 of the 43 malignant cases in the test set.

#### Conclusion for Decision Tree Classifier:

With a perfect 0% training error but a higher 5.26% test error, the Decision Tree Classifier showed clear signs of overfitting, memorizing training data rather than generalizing. Its performance on unseen data was weaker than Logistic Regression, highlighting the need for hyperparameter tuning (e.g., pruning or limiting depth) to improve generalization.


*   **Relevant Machine Learning Issues**:
    *   **Feature Scaling**: Was successfully applied, which is crucial for algorithms like Logistic Regression.
    *   **Class Imbalance**: A mild imbalance was noted (approximately 63% Benign, 37% Malignant in training), highlighting the importance of metrics beyond accuracy.
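    *   **Feature Correlation**: High correlations among features in medical datasets were identified as a potential issue (here, for example, radius, perimeter, and area are geometrically related). This was not handled in this analysis: standardization rescales each feature individually but does not remove correlation between features.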


