# Breast Cancer Detection - Data Exploration

This notebook provides exploratory data analysis for the Wisconsin Breast Cancer dataset.

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_preprocessing import load_data, get_feature_statistics

# Notebook-wide plotting defaults: seaborn's white-grid look and a wide default figure
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

## Load Data

In [None]:
# Fetch the feature table (X) and the labels (y) through the project's loader.
X, y = load_data()

# Report the table dimensions, then leave the leading rows as the cell's rich output.
print("Dataset shape:", X.shape)
print("\nFirst few rows:")
X.head()

## Target Distribution

In [None]:
# Absolute and relative frequency of each label value
class_counts = y.value_counts()
class_percentages = y.value_counts(normalize=True) * 100

print("Class distribution:")
print(class_counts)
print("\nClass distribution (percentage):")
print(class_percentages)

# Bar chart of the per-class counts, with horizontal tick labels
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', ax=ax, rot=0)
ax.set(
    title='Class Distribution',
    xlabel='Class (0: Malignant, 1: Benign)',
    ylabel='Count',
)
fig.tight_layout()
plt.show()

## Feature Statistics

In [None]:
# Summary statistics from describe(), rounded to two decimals for display.
print("Feature Statistics:")
feature_summary = X.describe()
feature_summary.round(2)

## Missing Values

In [None]:
print("Missing values:")

# Total number of null cells across every column of the feature table
total_missing = X.isnull().sum().sum()
print(total_missing)

# One-line verdict based on that total
if total_missing == 0:
    print("\nNo missing values found!")
else:
    print("Missing values detected")

## Feature Correlations

In [None]:
# Attach the label as an extra column so a single corr() call yields each
# feature's Pearson correlation with it. X itself is left untouched.
X_with_target = X.copy()
X_with_target['target'] = y

# Signed correlation of every feature with the target
target_corr = X_with_target.corr()['target'].drop('target')

# Rank by strength (absolute value), not by signed value. With the
# 0 = malignant / 1 = benign encoding used in this notebook, a feature that
# grows with malignancy has a NEGATIVE coefficient, so a signed descending
# sort would push the most informative features to the bottom of the list.
# The values keep their sign so the direction of each relationship stays visible.
strength_order = target_corr.abs().sort_values(ascending=False).index
correlations = target_corr.reindex(strength_order)

print("Top 10 features correlated with target:")
print(correlations.head(10))

# Plot the 15 strongest features. barh draws the first row at the bottom,
# so reverse the slice to put the strongest correlation at the top.
plt.figure(figsize=(10, 6))
correlations.head(15).iloc[::-1].plot(kind='barh')
plt.title('Feature Correlation with Target')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()

## Feature Distributions

In [None]:
# Histograms of the first six feature columns on a 2x3 grid of panels
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
axes = axes.ravel()  # flatten the grid so each panel can be paired with a column

for ax, col in zip(axes, X.columns[:6]):
    ax.hist(X[col], bins=30, edgecolor='black')
    ax.set(title=col, xlabel='Value', ylabel='Frequency')

fig.tight_layout()
plt.show()

## Summary

Key findings from the exploratory analysis:
- The dataset has 569 samples with 30 features
- Moderate class imbalance: 357 benign (63%) and 212 malignant (37%)
- No missing values
- Features show varying scales and distributions
- Several features show strong correlation with the target