# Task: Load the Breast Cancer dataset and create a basic logistic regression model without preprocessing.

In [None]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd

# Load the dataset and pull out the feature matrix and label vector
data = load_breast_cancer()
X, y = data.data, data.target  # labels: 0 = malignant, 1 = benign

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a baseline logistic regression on the raw (unscaled) features.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions: confusion matrix and overall accuracy
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", cm, sep="\n")

Accuracy: 0.9561
Confusion Matrix:
[[39  4]
 [ 1 70]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



### Explanation:
- load_breast_cancer() loads a binary classification dataset
- X contains 30 features about tumor characteristics
- y contains binary labels (0 = malignant, 1 = benign)
- LogisticRegression is used for binary classification
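- confusion_matrix shows (rows = actual class, columns = predicted class; label 0 = malignant is treated as "negative" and label 1 = benign as "positive"):

    [True Negative, False Positive]

    [False Negative, True Positive]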



## Notes:
1- Logistic Regression is for classification, Linear Regression for regression

2- Outputs probabilities between 0 and 1 using sigmoid function

3- Perfect for binary classification problems

4- Interpretable and fast for small to medium datasets

5- Requires feature scaling for best performance

6- Can be extended to multi-class problems

# Task: Calculate and interpret precision, recall, and F1-score from the confusion matrix.

In [None]:
from sklearn.metrics import classification_report

# Unpack the 2x2 confusion matrix row by row: the first row holds (tn, fp),
# the second row holds (fn, tp). "Positive" here is label 1, i.e. benign,
# so the metrics below describe how well benign tumours are identified.
(tn, fp), (fn, tp) = cm

predicted_positive = tp + fp  # everything the model labelled as class 1
actual_positive = tp + fn     # everything that truly is class 1

precision = tp / predicted_positive
recall = tp / actual_positive
# F1 is the harmonic mean of precision and recall
f1 = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Cross-check with scikit-learn's per-class report (covers both classes)
report = classification_report(y_test, y_pred, target_names=['Malignant', 'Benign'])
print("\nDetailed Classification Report:")
print(report)

Precision: 0.9459
Recall: 0.9859
F1-Score: 0.9655

Detailed Classification Report:
              precision    recall  f1-score   support

   Malignant       0.97      0.91      0.94        43
      Benign       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114




- Precision: How many predicted positives are actually positive
- Recall: How many actual positives are correctly predicted
- F1-Score: Harmonic mean of precision and recall
- Macro Average: Simple (unweighted) average over both classes, e.g. for precision: (0.97 + 0.95) / 2 = 0.96

- Weighted Average: Average weighted by support, e.g. for precision: (43×0.97 + 71×0.95) / 114 ≈ 0.96

Interpretation:
- High precision: Few false alarms
- High recall: Misses few positive cases
- Good model should have both high precision and recall


Key Takeaways


1.   Precision: Quality of positive predictions → "How trustworthy are our 'yes' answers?"

2. Recall: Coverage of actual positives → "How many actual 'yes' cases did we find?"

3. F1-Score: Balanced measure when both matter equally

4. Context Matters: In healthcare, recall is often more important than precision

5. Trade-offs: Improving one metric often worsens another


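For the breast cancer model (and medical models in general):

  1. High recall on the cancer (malignant) class means you're good at detecting actual cancer cases (safety first!). Note that in this dataset the positive label 1 is *benign*, so the recall of 0.9859 computed above is for benign tumors; the recall for malignant cases is 0.91 (see the classification report).

  2. High precision means you're not causing unnecessary worry with false alarms

  3. Your balanced F1-score suggests good overall performance for medical use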

# Task: Apply logistic regression to classify three types of iris flowers.

In [None]:
from sklearn.datasets import load_iris

# Load iris dataset (three flower species, see iris.target_names)
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Split the data: 30% held out for testing, fixed seed for reproducibility
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42
)

# Create multi-class logistic regression model.
# With the default lbfgs solver, scikit-learn already fits a multinomial
# (softmax) model when the target has more than two classes, so no extra
# argument is needed. Passing multi_class='multinomial' explicitly is
# deprecated since scikit-learn 1.5 and only triggers a FutureWarning.
model_iris = LogisticRegression(random_state=42)
model_iris.fit(X_train_iris, y_train_iris)

# Predictions on the held-out samples
y_pred_iris = model_iris.predict(X_test_iris)

# Evaluate: overall accuracy and the 3x3 confusion matrix
accuracy_iris = accuracy_score(y_test_iris, y_pred_iris)
cm_iris = confusion_matrix(y_test_iris, y_pred_iris)

print(f"Iris Dataset Accuracy: {accuracy_iris:.4f}")
print("Iris Confusion Matrix:")
print(cm_iris)
print(f"Class names: {iris.target_names}")



Iris Dataset Accuracy: 1.0000
Iris Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Class names: ['setosa' 'versicolor' 'virginica']



## Multi-class Classification:

- Softmax function generalizes sigmoid for multiple classes. Example with raw scores [2, 1, 0.1]:

  Step 1: exponentiate all scores → [e², e¹, e⁰·¹] = [7.39, 2.72, 1.11]

  Step 2: sum all exponentials → 7.39 + 2.72 + 1.11 = 11.22

  Step 3: divide each by sum → [7.39/11.22, 2.72/11.22, 1.11/11.22] ≈ [0.66, 0.24, 0.10]
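- multi_class='multinomial' uses Softmax regression (this is already the default behaviour for multi-class targets with the lbfgs solver, and the parameter itself is deprecated since scikit-learn 1.5)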
- Model outputs probabilities for each class
- Final prediction is class with highest probability

### The confusion matrix for 3 classes shows:
- Diagonal: Correct predictions for each class
- Off-diagonal: Misclassifications between classes


# Task: Demonstrate the importance of feature scaling for logistic regression.

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Build a synthetic two-feature dataset whose columns live on very different scales.
# The draws happen in the same order as the columns: large-scale first, then small-scale.
np.random.seed(42)
large_feature = np.random.normal(100, 10, 100)  # mean=100, std=10
small_feature = np.random.normal(0, 1, 100)     # mean=0, std=1
X_uneven = np.column_stack([large_feature, small_feature])

# Label is 1 whenever the two features add up to more than 100
y_uneven = (large_feature + small_feature > 100).astype(int)

# 70/30 train/test split with a fixed seed
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(
    X_uneven, y_uneven, test_size=0.3, random_state=42
)

# Baseline: train directly on the raw features
model_no_scale = LogisticRegression(random_state=42).fit(X_train_u, y_train_u)
pred_no_scale = model_no_scale.predict(X_test_u)
accuracy_no_scale = accuracy_score(y_test_u, pred_no_scale)

# Standardize: learn mean/std on the training split only, then reuse them on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_u)
X_test_scaled = scaler.transform(X_test_u)

# Same model, now trained on the standardized features
model_scaled = LogisticRegression(random_state=42).fit(X_train_scaled, y_train_u)
pred_scaled = model_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test_u, pred_scaled)

print(f"Accuracy without scaling: {accuracy_no_scale:.4f}")
print(f"Accuracy with scaling: {accuracy_scaled:.4f}")
print(f"Improvement: {accuracy_scaled - accuracy_no_scale:.4f}")

Accuracy without scaling: 0.9667
Accuracy with scaling: 1.0000
Improvement: 0.0333


## Why Scaling Matters:
1. Logistic regression is fit by iterative, gradient-based optimization (scikit-learn uses the L-BFGS solver by default)
2. Features with larger scales dominate the optimization
3. Scaling ensures all features contribute equally
4. StandardScaler transforms data to mean=0, std=1

StandardScaler formula:
    z = (x - mean) / std

Always:

- fit_transform on training data
- transform on test data (using training parameters)


## Task: Create a complete pipeline for the breast cancer dataset with proper preprocessing.

In [None]:
from sklearn.pipeline import Pipeline

# Assemble the two pipeline stages: standardize the features, then classify.
scaling_step = ('scaler', StandardScaler())
classifier_step = (
    'classifier',
    # max_iter raised to 1000 to give the solver room to converge
    LogisticRegression(random_state=42, max_iter=1000),
)
pipeline = Pipeline([scaling_step, classifier_step])

# Fit on the breast cancer training split and predict the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
y_pred_pipeline = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate the pipeline's predictions
cm_pipeline = confusion_matrix(y_test, y_pred_pipeline)
accuracy_pipeline = accuracy_score(y_test, y_pred_pipeline)

print(f"Pipeline Accuracy: {accuracy_pipeline:.4f}")
print("Pipeline Confusion Matrix:", cm_pipeline, sep="\n")

# Compare against the accuracy of the earlier model trained without scaling
print(f"\nImprovement over non-scaled: {accuracy_pipeline - accuracy:.4f}")