# Logistic Regression on Breast Cancer Dataset


In [53]:

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc


# Load the Dataset


In [54]:
# Step 2: Load the Dataset
# We use sklearn's built-in breast cancer dataset 
df = pd.read_csv(r"C:\Users\ansar\OneDrive\Documents\Downloads\breast+cancer+wisconsin+diagnostic\wdbc.csv")


# Exploratory Data Analysis (EDA)


In [55]:
# Exploratory Data Analysis (EDA)
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()


Dataset Shape: (568, 32)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 32 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   842302    568 non-null    int64  
 1   M         568 non-null    object 
 2   17.99     568 non-null    float64
 3   10.38     568 non-null    float64
 4   122.8     568 non-null    float64
 5   1001      568 non-null    float64
 6   0.1184    568 non-null    float64
 7   0.2776    568 non-null    float64
 8   0.3001    568 non-null    float64
 9   0.1471    568 non-null    float64
 10  0.2419    568 non-null    float64
 11  0.07871   568 non-null    float64
 12  1.095     568 non-null    float64
 13  0.9053    568 non-null    float64
 14  8.589     568 non-null    float64
 15  153.4     568 non-null    float64
 16  0.006399  568 non-null    float64
 17  0.04904   568 non-null    float64
 18  0.05373   568 non-null    float64
 19  0.01587   568 non-null    float

In [56]:

print("\nSummary Statistics:")
print(df.describe())



Summary Statistics:
             842302       17.99       10.38       122.8         1001  \
count  5.680000e+02  568.000000  568.000000  568.000000   568.000000   
mean   3.042382e+07   14.120491   19.305335   91.914754   654.279754   
std    1.251246e+08    3.523416    4.288506   24.285848   351.923751   
min    8.670000e+03    6.981000    9.710000   43.790000   143.500000   
25%    8.692225e+05   11.697500   16.177500   75.135000   420.175000   
50%    9.061570e+05   13.355000   18.855000   86.210000   548.750000   
75%    8.825022e+06   15.780000   21.802500  103.875000   782.625000   
max    9.113205e+08   28.110000   39.280000  188.500000  2501.000000   

           0.1184      0.2776      0.3001      0.1471      0.2419  ...  \
count  568.000000  568.000000  568.000000  568.000000  568.000000  ...   
mean     0.096321    0.104036    0.088427    0.048746    0.181055  ...   
std      0.014046    0.052355    0.079294    0.038617    0.027319  ...   
min      0.052630    0.019380    0

In [57]:

print("\nMissing Values:")
print(df.isnull().sum())



Missing Values:
842302      0
M           0
17.99       0
10.38       0
122.8       0
1001        0
0.1184      0
0.2776      0
0.3001      0
0.1471      0
0.2419      0
0.07871     0
1.095       0
0.9053      0
8.589       0
153.4       0
0.006399    0
0.04904     0
0.05373     0
0.01587     0
0.03003     0
0.006193    0
25.38       0
17.33       0
184.6       0
2019        0
0.1622      0
0.6656      0
0.7119      0
0.2654      0
0.4601      0
0.1189      0
dtype: int64


## Visualize Class Distribution


In [58]:

# Visualize Class Distribution
sns.countplot(x='target', data=df)
plt.title("Target Class Distribution")
plt.show()


ValueError: Could not interpret value `target` for `x`. An entry with this name does not appear in `data`.

### Correlation Matrix


In [None]:

# Correlation Matrix
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()


# Step 4: Feature Selection and Preprocessing


In [None]:

# Step 4: Feature Selection and Preprocessing
X = df.drop('target', axis=1)
y = df['target']


In [None]:

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Step 5: Model Training


In [None]:

# Step 5: Model Training
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)


In [None]:

# Step 6: Model Evaluation
# Predictions
y_pred = model.predict(X_test)


In [None]:

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:

# ROC Curve
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)


In [None]:

plt.figure()
plt.plot(fpr, tpr, label=f"ROC Curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC)")
plt.legend(loc="lower right")
plt.show()


In [None]:

# Step 7: Conclusion
print("\nConclusion: Logistic Regression performs well on this dataset with high accuracy and good classification metrics.")
