# Breast Cancer Diagnostic Prediction Program
## Purpose
Build a machine learning model that predicts whether a tumor is benign or malignant, using the Wisconsin Breast Cancer dataset (569 samples, 30 features).

## Technology Stack
- Python 3.8
- Main libraries: pandas, scikit-learn, matplotlib, seaborn

## Key Findings
| Indicator | Value |
|---|---|
| Accuracy | 96.5% |
| Most Important Feature | worst concave points (`concave points_worst`) |

## Quick Start
```python
# Run the complete analysis (shell command issued from a Jupyter/IPython cell)
!python breast_cancer_analysis.py
```

> Data source: UCI Machine Learning Repository

In [None]:
import pandas as pd

# Load the raw CSV into a DataFrame and print its first rows as a quick
# sanity check of what was read.
data = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")
print(
    "First five lines: \n",
    data.head(),
)

# 1. Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 2. Data cleaning
# Reload the CSV and discard the two columns that are not used as model
# inputs ('Unnamed: 32' is presumably an empty trailing column produced by
# the CSV export -- TODO confirm against the raw file).
data = (
    pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")
    .drop(columns=['id', 'Unnamed: 32'])
)
# Encode the label numerically: 'M' -> 1, 'B' -> 0. Any value other than
# these two becomes NaN under Series.map.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# 3. Visualization
# Box plot of radius_mean grouped by the encoded diagnosis (0 / 1).
plt.figure(figsize=(8, 5))
axes = sns.boxplot(data=data, x='diagnosis', y='radius_mean')
axes.set_title("The relationship between tumor radius and grade malignancy（0=Benign，1=Malignant）")
plt.show()

# 4. Modeling
# Separate the predictors from the target column.
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Seed both the 80/20 split and the forest so the printed accuracy (and the
# feature importances derived from this model) are identical on every run.
# 42 is the same seed the ROC section of this script uses.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Mean accuracy on the held-out 20 %.
print(f"Model accuracy：{model.score(X_test, y_test):.2%}")

# 5. Feature Importance Analysis
# Pair every predictor column with the importance score reported by the
# fitted model, then rank from most to least important.
importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
)
importances = importances.sort_values(by='Importance', ascending=False)
print("\n5 Most Important Features:\n", importances.head(5))

# 6. ROC curve
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Ensure reproducibility: re-split the data with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prevent data leakage
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scaler and classifier live in one pipeline, so fitting the pipeline fits
# the scaler on the training split only; the test split is merely transformed.
model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=42),
).fit(X_train, y_train)

# Re-evaluate
# Column 1 of predict_proba is the probability of the second class label,
# which is used as the score for both the AUC and the ROC curve.
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)  # thresholds are not needed

print(f"corrected AUC: {auc:.4f}")

# Draw the ROC curve together with the diagonal reference line.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], color='k', linestyle='--')  # unlabeled, so it stays out of the legend
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# Write the figure to disk before displaying it.
plt.savefig('roc_curve.png')
plt.show()
