# Day 01 - Project 2

## Logistic Regression on Breast Cancer Dataset

### Step 1: Import Required Libraries
We import the libraries needed to load the data, split it into train and test sets, build the model, and evaluate it.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, RocCurveDisplay

### Step 2: Load the Dataset
Load the Breast Cancer dataset from sklearn.

In [3]:
# Fetch the Breast Cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Quick overview, one item per line: available keys, feature-matrix shape,
# class names, and feature names. The full dataset description is available
# as cancer.DESCR if more background is needed.
print(
    cancer.keys(),
    cancer.data.shape,
    cancer.target_names,
    cancer.feature_names,
    sep="\n",
)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
(569, 30)
['malignant' 'benign']
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


### Step 3: Create a DataFrame
We create a DataFrame for better exploration and handling.

In [None]:
# Features go into a labelled DataFrame (one column per feature name);
# the class labels stay in a separate array.
X, y = (
    pd.DataFrame(data=cancer.data, columns=cancer.feature_names),
    cancer.target,
)

# Preview the first rows of the feature table.
X.head()

### Step 4: Split Data into Train and Test Sets
We split the dataset to evaluate model generalization.

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Step 5: Build and Train the Logistic Regression Model

In [None]:
# NOTE(review): max_iter is set far above the library default, presumably so
# the solver converges on the unscaled features - confirm.
model = LogisticRegression(max_iter=10_000)

# Fit on the training split only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

### Step 6: Predict on Test Set
Use the trained model to make predictions.

In [None]:
# Hard class predictions (0/1 labels) for the held-out test samples.
y_pred = model.predict(X=X_test)

### Step 7: Evaluate the Model
We evaluate using Accuracy, Precision, Recall, F1 Score, and ROC AUC.

In [None]:
# Threshold-based metrics are computed from the hard class predictions.
# With scikit-learn's default pos_label=1 the "positive" class here is
# 'benign' (target_names is ['malignant', 'benign']).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Malignant tumors are encoded as label 0, so the recall that matters for
# "not missing a cancer" has to be requested explicitly.
print("Recall (malignant):", recall_score(y_test, y_pred, pos_label=0))

# ROC AUC is a ranking metric and needs continuous scores, not 0/1 labels.
# Passing y_pred would evaluate a single threshold only and disagree with the
# ROC curve, so use the predicted probability of class 1 instead.
y_score = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, y_score))

### Step 8: Confusion Matrix
We visualize the confusion matrix.

In [None]:
# Rows are the actual classes and columns the predicted classes, both in
# label order (0, 1), which matches the order of cancer.target_names.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
heatmap = ax.imshow(conf_mat, cmap='Blues')
fig.colorbar(heatmap, ax=ax)

# Put one tick per class and label it with the class name, so the figure can
# be read without knowing the 0/1 encoding.
tick_positions = np.arange(len(cancer.target_names))
ax.set_xticks(tick_positions)
ax.set_xticklabels(cancer.target_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(cancer.target_names)

# Write the count inside every cell; switch to white text on dark cells so
# the numbers stay legible.
dark_cell_cutoff = conf_mat.max() / 2
for (row, col), count in np.ndenumerate(conf_mat):
    ax.text(
        col,
        row,
        str(count),
        ha='center',
        va='center',
        color='white' if count > dark_cell_cutoff else 'black',
    )

ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

### Step 9: Plot ROC Curve
We visualize the trade-off between sensitivity and specificity.

In [None]:
# Draw the ROC curve for the fitted model on the held-out test split.
RocCurveDisplay.from_estimator(estimator=model, X=X_test, y=y_test)

# Title the axes the display just drew on, then render the figure.
plt.gca().set_title('ROC Curve')
plt.show()

### 📚 Final Notes
- Logistic Regression is great for binary classification tasks.
- Focus on Precision or Recall depending on business needs.
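- High Recall is crucial in medical diagnosis to avoid missing positive cases (malignant tumors). Note that this dataset encodes malignant as 0 and benign as 1, so scikit-learn's default recall (`pos_label=1`) measures the benign class; pass `pos_label=0` to measure recall on malignant tumors.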