In [9]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Banner confirming that the import cell ran to completion
print(
    "=== BREAST CANCER PREDICTION USING LOGISTIC REGRESSION ===",
    "Libraries imported successfully!",
    sep="\n",
)

=== BREAST CANCER PREDICTION USING LOGISTIC REGRESSION ===
Libraries imported successfully!


In [10]:
# ============================================================================
# STEP 1: DATA COLLECTION & LOADING
# ============================================================================

def load_breast_cancer_data():
    """
    Build a pandas DataFrame from scikit-learn's Breast Cancer dataset.

    The frame gets one column per entry of the dataset's ``feature_names``
    plus a trailing ``label`` column holding the dataset's ``target`` values.
    A confirmation line and the frame's shape are printed as a side effect.

    Returns:
        tuple: ``(data_frame, breast_cancer_dataset)`` - the assembled
        DataFrame and the object returned by ``load_breast_cancer()``.
    """
    from sklearn.datasets import load_breast_cancer

    raw = load_breast_cancer()

    # Feature matrix first, then the target appended as the last column
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).assign(label=raw.target)

    print(" Dataset loaded successfully!")
    print(f"Dataset shape: {frame.shape}")

    return frame, raw

# Build the working DataFrame and keep the raw dataset object alongside it
data_frame, dataset_info = load_breast_cancer_data()

# Closing banner for this step
print(
    f"\n{'=' * 60}",
    "STEP 1 COMPLETED: Data Collection & Loading",
    "=" * 60,
    sep="\n",
)

 Dataset loaded successfully!
Dataset shape: (569, 31)

STEP 1 COMPLETED: Data Collection & Loading


In [11]:
# ============================================================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================
# Prints a plain-text overview of `data_frame`: head/tail, shape, dtypes,
# missing-value counts, summary statistics and the class balance of 'label'.

print("\n EXPLORATORY DATA ANALYSIS")
print("-" * 40)

# Display first five rows
print("\n First 5 rows of the dataset:")
print(data_frame.head())

# Display last five rows
print("\n Last 5 rows of the dataset:")
print(data_frame.tail())

# Dataset shape
print(f"\n Dataset Shape: {data_frame.shape}")
print(f"   - Number of samples: {data_frame.shape[0]}")
print(f"   - Number of features: {data_frame.shape[1] - 1}")  # -1 for target column

# Dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - that appended a stray "None" line.
print("\n Dataset Information:")
data_frame.info()

# Check for missing values (per-column counts; the sum is the grand total)
print("\n Missing Values Check:")
missing_values = data_frame.isnull().sum()
print(f"Total missing values: {missing_values.sum()}")
if missing_values.sum() == 0:
    print(" No missing values found!")
else:
    print(" Missing values detected:")
    print(missing_values[missing_values > 0])

# Summary statistics
print("\n Statistical Summary:")
print(data_frame.describe())

# Target variable analysis
print("\n Target Variable Distribution:")
target_distribution = data_frame['label'].value_counts()
print(target_distribution)
# value_counts() orders rows by frequency, not by label, so the counts are
# looked up by label value (.loc) rather than by position.
print("\nTarget variable mapping:")
print(f"0 → Malignant (Cancerous): {target_distribution.loc[0]} samples")
print(f"1 → Benign (Non-cancerous): {target_distribution.loc[1]} samples")

# Calculate percentages
total_samples = len(data_frame)
malignant_percent = (target_distribution.loc[0] / total_samples) * 100
benign_percent = (target_distribution.loc[1] / total_samples) * 100
print("\nPercentage Distribution:")
print(f"Malignant: {malignant_percent:.1f}%")
print(f"Benign: {benign_percent:.1f}%")

print("\n" + "="*60)
print("STEP 2 COMPLETED: Exploratory Data Analysis")
print("="*60)


 EXPLORATORY DATA ANALYSIS
----------------------------------------

 First 5 rows of the dataset:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1

In [12]:
# ============================================================================
# STEP 3: DATA PREPROCESSING
# ============================================================================
# Splits `data_frame` into the feature matrix X and the target vector Y, then
# lists the feature columns.

print("\n DATA PREPROCESSING", "-" * 30, sep="\n")

# 'label' is the target; every remaining column is a feature
Y = data_frame['label']
X = data_frame.drop(columns='label')

print(
    f" Features (X) shape: {X.shape}",
    f" Target (Y) shape: {Y.shape}",
    sep="\n",
)

# Numbered listing, 1-based, with the number right-aligned to two characters
print("\n Feature columns:")
for i, col in enumerate(X.columns, start=1):
    print(f"{i:2d}. {col}")

print(
    f"\n{'=' * 60}",
    "STEP 3 COMPLETED: Data Preprocessing",
    "=" * 60,
    sep="\n",
)


 DATA PREPROCESSING
------------------------------
 Features (X) shape: (569, 30)
 Target (Y) shape: (569,)

 Feature columns:
 1. mean radius
 2. mean texture
 3. mean perimeter
 4. mean area
 5. mean smoothness
 6. mean compactness
 7. mean concavity
 8. mean concave points
 9. mean symmetry
10. mean fractal dimension
11. radius error
12. texture error
13. perimeter error
14. area error
15. smoothness error
16. compactness error
17. concavity error
18. concave points error
19. symmetry error
20. fractal dimension error
21. worst radius
22. worst texture
23. worst perimeter
24. worst area
25. worst smoothness
26. worst compactness
27. worst concavity
28. worst concave points
29. worst symmetry
30. worst fractal dimension

STEP 3 COMPLETED: Data Preprocessing


In [13]:
# ============================================================================
# STEP 4: SPLITTING THE DATASET
# ============================================================================
# Holds out 20% of the rows for testing and prints the size and per-class
# counts of each split.

print("\n DATASET SPLITTING", "-" * 25, sep="\n")

# 80/20 split. stratify=Y asks for the class proportions of Y to be kept in
# both parts; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2, stratify=Y
)

print(" Dataset split successfully!")
print(f"Training set: {len(X_train)} samples ({len(X_train) / len(data_frame) * 100:.1f}%)")
print(f"Testing set: {len(X_test)} samples ({len(X_test) / len(data_frame) * 100:.1f}%)")

# Per-class counts, sorted by label so 0 (Malignant) precedes 1 (Benign)
train_dist = Y_train.value_counts().sort_index()
print(
    "\n Training set distribution:",
    f"Malignant (0): {train_dist[0]} samples",
    f"Benign (1): {train_dist[1]} samples",
    sep="\n",
)

test_dist = Y_test.value_counts().sort_index()
print(
    "\n Testing set distribution:",
    f"Malignant (0): {test_dist[0]} samples",
    f"Benign (1): {test_dist[1]} samples",
    sep="\n",
)

print(
    f"\n{'=' * 60}",
    "STEP 4 COMPLETED: Dataset Splitting",
    "=" * 60,
    sep="\n",
)


 DATASET SPLITTING
-------------------------
 Dataset split successfully!
Training set: 455 samples (80.0%)
Testing set: 114 samples (20.0%)

 Training set distribution:
Malignant (0): 170 samples
Benign (1): 285 samples

 Testing set distribution:
Malignant (0): 42 samples
Benign (1): 72 samples

STEP 4 COMPLETED: Dataset Splitting


In [14]:
# ============================================================================
# STEP 5: MODEL TRAINING
# ============================================================================
# Fits a LogisticRegression classifier on the training split and prints its
# class name and full parameter set.

print("\n MODEL TRAINING", "-" * 20, sep="\n")

# liblinear solver with a raised iteration cap and a fixed random_state
model = LogisticRegression(solver='liblinear', max_iter=2000, random_state=2)

print("Training the Logistic Regression model...")
model.fit(X_train, Y_train)

print(
    " Model training completed successfully!",
    f"Model type: {model.__class__.__name__}",
    f"Model parameters: {model.get_params()}",
    sep="\n",
)

print(
    f"\n{'=' * 60}",
    "STEP 5 COMPLETED: Model Training",
    "=" * 60,
    sep="\n",
)


 MODEL TRAINING
--------------------
Training the Logistic Regression model...
 Model training completed successfully!
Model type: LogisticRegression
Model parameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 2000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 2, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

STEP 5 COMPLETED: Model Training


In [15]:
# ============================================================================
# STEP 6: MODEL EVALUATION
# ============================================================================
# Scores the fitted `model` on both splits, then prints a classification
# report, the confusion matrix and a train-vs-test accuracy comparison.

print("\n MODEL EVALUATION", "-" * 20, sep="\n")

# Accuracy on the data the model was fitted on
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print(f" Accuracy on training data: {training_data_accuracy:.4f} ({training_data_accuracy*100:.2f}%)")

# Accuracy on the held-out data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print(f" Accuracy on test data: {test_data_accuracy:.4f} ({test_data_accuracy*100:.2f}%)")

# Per-class precision / recall / F1; names follow label order 0, 1
print("\n Detailed Classification Report (Test Data):")
print(classification_report(Y_test, X_test_prediction, target_names=['Malignant', 'Benign']))

# Confusion matrix: rows are the true labels, columns the predicted labels.
# The wording below treats label 1 (Benign) as the "positive" class.
print(" Confusion Matrix (Test Data):")
cm = confusion_matrix(Y_test, X_test_prediction)
print(cm)
print(
    "\nConfusion Matrix Interpretation:",
    f"True Negatives (Correctly predicted Malignant): {cm[0][0]}",
    f"False Positives (Incorrectly predicted Benign): {cm[0][1]}",
    f"False Negatives (Incorrectly predicted Malignant): {cm[1][0]}",
    f"True Positives (Correctly predicted Benign): {cm[1][1]}",
    sep="\n",
)

# Train-minus-test accuracy gap; a gap below 0.05 is reported as acceptable
overfitting_check = training_data_accuracy - test_data_accuracy
print(
    "\n MODEL PERFORMANCE SUMMARY:",
    f"   Training Accuracy: {training_data_accuracy*100:.2f}%",
    f"   Testing Accuracy: {test_data_accuracy*100:.2f}%",
    f"   Overfitting Check: {overfitting_check:.4f}",
    sep="\n",
)
print(
    "    Model shows good generalization (minimal overfitting)"
    if overfitting_check < 0.05
    else "    Model might be overfitting"
)

print(
    f"\n{'=' * 60}",
    "STEP 6 COMPLETED: Model Evaluation",
    "=" * 60,
    sep="\n",
)


 MODEL EVALUATION
--------------------
 Accuracy on training data: 0.9538 (95.38%)
 Accuracy on test data: 0.9737 (97.37%)

 Detailed Classification Report (Test Data):
              precision    recall  f1-score   support

   Malignant       0.95      0.98      0.96        42
      Benign       0.99      0.97      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

 Confusion Matrix (Test Data):
[[41  1]
 [ 2 70]]

Confusion Matrix Interpretation:
True Negatives (Correctly predicted Malignant): 41
False Positives (Incorrectly predicted Benign): 1
False Negatives (Incorrectly predicted Malignant): 2
True Positives (Correctly predicted Benign): 70

 MODEL PERFORMANCE SUMMARY:
   Training Accuracy: 95.38%
   Testing Accuracy: 97.37%
   Overfitting Check: -0.0198
    Model shows good generalization (minimal overfitting)

STEP 6 COMPLETED: Model Evaluation


In [16]:
# ============================================================================
# STEP 7: BUILDING A PREDICTIVE SYSTEM
# ============================================================================

# Section heading followed by a 25-character rule
print("\n PREDICTIVE SYSTEM", "-" * 25, sep="\n")

def predict_breast_cancer(input_data):
    """
    Classify one sample as malignant or benign with the notebook's `model`.

    Relies on two notebook-level names: `model` (the fitted classifier) and
    `X` (the feature frame, used only for its column names).

    Args:
        input_data: array-like of feature values for a single sample, one
            value per column of `X`.

    Returns:
        tuple: ``(result, confidence, code)`` where ``result`` is the
        human-readable diagnosis, ``confidence`` is the predicted class's
        probability as a percentage, and ``code`` is the raw predicted label
        (0 for malignant, anything else is reported as benign).
    """
    # A single sample has to be presented as one row
    single_row = np.asarray(input_data).reshape(1, -1)

    # Attach the feature names; the original author notes this avoids warnings
    sample_frame = pd.DataFrame(single_row, columns=X.columns)

    predicted_class = model.predict(sample_frame)[0]
    class_probabilities = model.predict_proba(sample_frame)[0]

    # Report the probability of whichever class was predicted
    if predicted_class == 0:
        return "Malignant (Cancerous)", class_probabilities[0] * 100, predicted_class
    return "Benign (Non-cancerous)", class_probabilities[1] * 100, predicted_class

# Demonstrates the predictive system twice: once on a hand-entered sample and
# once on a few rows drawn from the held-out test split.

# Test with sample data (provided in requirements)
print(" Testing with sample data:")
input_data = (13.54, 14.36, 87.46, 566.3, 0.09779, 0.08129, 0.06664, 0.04781, 0.1885, 0.05766,
              0.2699, 0.7886, 2.058, 23.56, 0.008462, 0.0146, 0.02387, 0.01315, 0.0198, 0.0023,
              15.11, 19.26, 99.7, 711.2, 0.144, 0.1773, 0.239, 0.1288, 0.2977, 0.07259)

print(f"Input features: {len(input_data)} values")
print(f"Sample input: {input_data[:5]}... (showing first 5 values)")

# Make prediction
result, confidence, prediction_code = predict_breast_cancer(input_data)

print("\n PREDICTION RESULT:")
print(f"   Diagnosis: {result}")
print(f"   Confidence: {confidence:.2f}%")
print(f"   Prediction Code: {prediction_code}")

# Test with a few samples from test set
print("\n Testing with random samples from test set:")
test_samples = 3

# Seeded generator so a fresh "Restart & Run All" shows the same rows every
# time (the seed matches the random_state used elsewhere in this notebook).
# Drawing without replacement keeps the same row from being shown twice.
sample_rng = np.random.default_rng(2)
sample_positions = sample_rng.choice(
    len(X_test), size=min(test_samples, len(X_test)), replace=False
)

for i, sample_idx in enumerate(sample_positions):
    sample_idx = int(sample_idx)  # plain int for positional slicing
    sample_data = X_test.iloc[sample_idx:sample_idx+1]  # Keep as DataFrame
    actual_label = Y_test.iloc[sample_idx]

    # Reuse the predictive system rather than repeating its label/confidence logic
    result, confidence, predicted_label = predict_breast_cancer(sample_data)

    actual_result = "Benign (Non-cancerous)" if actual_label == 1 else "Malignant (Cancerous)"

    print(f"\n   Sample {i+1}:")
    print(f"   Actual: {actual_result}")
    print(f"   Predicted: {result}")
    print(f"   Confidence: {confidence:.2f}%")
    print(f"   Match: {' Correct' if predicted_label == actual_label else ' Incorrect'}")

print("\n" + "="*60)
print("STEP 7 COMPLETED: Predictive System")
print("="*60)


 PREDICTIVE SYSTEM
-------------------------
 Testing with sample data:
Input features: 30 values
Sample input: (13.54, 14.36, 87.46, 566.3, 0.09779)... (showing first 5 values)

 PREDICTION RESULT:
   Diagnosis: Benign (Non-cancerous)
   Confidence: 97.82%
   Prediction Code: 1

 Testing with random samples from test set:

   Sample 1:
   Actual: Benign (Non-cancerous)
   Predicted: Benign (Non-cancerous)
   Confidence: 99.83%
   Match:  Correct

   Sample 2:
   Actual: Malignant (Cancerous)
   Predicted: Malignant (Cancerous)
   Confidence: 70.71%
   Match:  Correct

   Sample 3:
   Actual: Benign (Non-cancerous)
   Predicted: Benign (Non-cancerous)
   Confidence: 99.98%
   Match:  Correct

STEP 7 COMPLETED: Predictive System
