# **DISEASE PREDICTION FROM MEDICAL DATA: BREAST CANCER PREDICTION**

**IMPORT LIBRARIES**

In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

**LOAD THE DATASET**

In [None]:
# 2. Read the breast-cancer CSV into a DataFrame (hard-coded absolute path)
csv_path = "/content/Breast Cancer.csv"
df = pd.read_csv(csv_path)

# Sanity check: print the (rows, columns) tuple, then the leading rows
n_rows_cols = df.shape
print("Shape:", n_rows_cols)
preview = df.head()
print(preview)

**DATA PREPROCESSING**

In [None]:
# Remove the `radius_mean` column from the working frame.
# NOTE(review): `radius_mean` appears to be a regular feature column (it is
# one of the features shown in the boxplots further down), so every model
# trained from this frame never sees it - confirm the exclusion is intended.
df = df.drop("radius_mean", axis=1)

# The `diagnosis` target is intentionally not label-encoded in this cell;
# the generic object-column encoding loop later in the notebook handles it.

In [None]:
# Preliminary feature/target split taken around `compactness_mean`.
# NOTE(review): both `X` and `y` are reassigned by the diagnosis-based split
# later in the notebook before any model reads them - confirm this cell is
# still needed.
target_column = "compactness_mean"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Report the number of missing entries per column
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Label-encode every object-dtype column in place.
# Values are cast to str first; a fresh encoder is fitted per column.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    le = LabelEncoder()
    as_text = df[col].astype(str)
    df[col] = le.fit_transform(as_text)



In [None]:
# 5. Split into features & target
# `diagnosis` is the label and must be present (a missing label column still
# raises KeyError). `id` and `Unnamed: 32` are non-feature columns; they are
# dropped only when present, so a CSV that lacks them does not abort the cell.
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"]).drop(
    columns=["Unnamed: 32", "id"], errors="ignore"
)

In [None]:
# Echo the feature matrix first, then the target vector
for part in (X, y):
    print(part)

In [None]:
# Standardize the features: fit the scaler on X, then transform X with it
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print("Features shape:", X_scaled.shape)
class_counts = y.value_counts()
print("Target distribution:\n", class_counts)

# Persist the fitted scaler next to the notebook
import joblib
joblib.dump(scaler, "scaler.pkl")

**SPLIT THE DATASET**

In [None]:
# Split into train and test sets (80% train, 20% test).
# The standardized matrix `X_scaled` is what gets split: passing the raw `X`
# would leave the scaling step (and the persisted scaler) without any effect
# on the models trained below.
# NOTE(review): the scaler is fitted on all rows before this split, so
# test-set statistics influence the scaling. Fit it on the training portion
# only if a strictly untouched hold-out set is required.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

**TRAIN WITH DIFFERENT MODELS**

In [None]:
# Fit a logistic-regression classifier (liblinear solver, at most 1000 iterations)
log_model = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Class predictions for the test split; the evaluation cells further down
# refer to them under this exact name (`y_prd_log`).
y_prd_log = log_model.predict(X_test)

In [None]:
# 8. Random Forest: 100 trees, seed fixed at 42
rf_settings = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Class predictions for the test split
y_pred_rf = rf_model.predict(X_test)


In [None]:
# 9. Train Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed seed, fitted on the training split
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the test split
y_pred_dt = dt_model.predict(X_test)

In [None]:
# 10. Train Support Vector Machine (SVM)
from sklearn.svm import SVC
# Linear-kernel SVM. probability=True is required because predict_proba is
# called on this model in the probability table later in the notebook.
svm_settings = {"kernel": 'linear', "probability": True, "random_state": 42}
svm_model = SVC(**svm_settings)
svm_model.fit(X_train, y_train)

# Class predictions for the test split
y_pred_svm = svm_model.predict(X_test)

In [None]:
# 11. Train Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the training split
nb_model = GaussianNB().fit(X_train, y_train)

# Class predictions for the test split
y_pred_nb = nb_model.predict(X_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# XGBoost classifier: log-loss as evaluation metric, seed fixed at 42
# (the `use_label_encoder` argument is deliberately not passed)
xgb_settings = {"eval_metric": 'logloss', "random_state": 42}
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Class predictions for the test split
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics for the logistic-regression predictions:
#   y_test    -> true values
#   y_prd_log -> values predicted by the logistic-regression model
# NOTE(review): these are regression metrics. If the labels are 0/1 class
# codes, MAE and MSE both equal the misclassification rate - confirm they are
# the intended metrics for a classifier.
mae, mse, r2 = [
    metric(y_test, y_prd_log)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
]

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)


In [None]:
# Registry of every trained model: display name -> (fitted model, test predictions)
models = dict([
    ("Logistic Regression", (log_model, y_prd_log)),
    ("Random Forest", (rf_model, y_pred_rf)),
    ("Decision Tree", (dt_model, y_pred_dt)),
    ("SVM", (svm_model, y_pred_svm)),
    ("Naive Bayes", (nb_model, y_pred_nb)),
    ("XGBoost", (xgb_model, y_pred_xgb)),
])

# Print accuracy followed by the per-class report for each model
for name, (model, y_pred) in models.items():
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    # zero_division=0 silences the warning for classes that are never predicted
    report = classification_report(y_test, y_pred, zero_division=0)
    print(report)



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Registry of every trained model: display name -> (fitted model, test predictions)
models = {}
models["Logistic Regression"] = (log_model, y_prd_log)
models["Random Forest"] = (rf_model, y_pred_rf)
models["Decision Tree"] = (dt_model, y_pred_dt)
models["SVM"] = (svm_model, y_pred_svm)
models["Naive Bayes"] = (nb_model, y_pred_nb)
models["XGBoost"] = (xgb_model, y_pred_xgb)

for name, (model, y_pred) in models.items():
    # Text summary: accuracy, then the per-class report
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    report = classification_report(y_test, y_pred, zero_division=0)
    print(report)

    # Confusion matrix drawn as an annotated heatmap, one figure per model
    cm = confusion_matrix(y_test, y_pred)
    _, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Registry of every trained model: display name -> (fitted model, test predictions)
models = {
    "Logistic Regression": (log_model, y_prd_log),
    "Random Forest": (rf_model, y_pred_rf),
    "Decision Tree": (dt_model, y_pred_dt),
    "SVM": (svm_model, y_pred_svm),
    "Naive Bayes": (nb_model, y_pred_nb),
    "XGBoost": (xgb_model, y_pred_xgb),
}

# Names and test accuracies, kept in the registry's insertion order
model_names = list(models)
accuracies = [accuracy_score(y_test, preds) for _, preds in models.values()]

for name, acc in zip(model_names, accuracies):
    print(f"{name} Accuracy: {acc:.4f}")

# Bar chart comparing the accuracies
_, ax = plt.subplots(figsize=(8, 5))
ax.bar(model_names, accuracies, color='skyblue')
ax.set_ylim(0, 1)  # accuracy lies between 0 and 1
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracies Comparison')
for i, v in enumerate(accuracies):
    # Write the rounded value slightly above the top of each bar
    ax.text(i, v + 0.02, f"{v:.2f}", ha='center')
plt.show()


**VISUALIZE THROUGH GRAPHS**

In [None]:
import pandas as pd

# Load dataset (fresh copy of the CSV; replaces the preprocessed frame)
df = pd.read_csv('/content/Breast Cancer.csv')

# Identify numerical and categorical columns.
# Integer dtypes are reported as numerical only: selecting 'int' for the
# categorical group as well would list the same integer columns in both
# groups and mislabel them as categorical.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)


**1. Numerical Columns – Histogram**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numerical columns to plot: everything in num_cols except 'id' and 'Unnamed: 32'
excluded_columns = ('id', 'Unnamed: 32')
num_cols_to_plot = [col for col in num_cols if col not in excluded_columns]

# One 20-bin histogram per remaining column, each in its own figure
for col in num_cols_to_plot:
    _, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], bins=20, kde=False, color='skyblue', ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

**2. Numerical Columns – Boxplot**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Four features to compare across the diagnosis classes
features_to_plot = ["radius_mean", "texture_mean", "area_mean", "smoothness_mean"]

# 2x2 grid with one boxplot per feature, grouped and coloured by diagnosis
_, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, feature in zip(axes.ravel(), features_to_plot):
    sns.boxplot(
        x="diagnosis",
        y=feature,
        hue="diagnosis",
        data=df,
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Boxplot of {feature} by Diagnosis")

plt.tight_layout()
plt.show()


**3. Categorical Columns – Pie Chart**

In [None]:
import matplotlib.pyplot as plt

# Count how many rows fall into each diagnosis class
diagnosis_counts = df["diagnosis"].value_counts()

# Derive each slice's label and colour from the class key it belongs to.
# value_counts() orders classes by frequency, so fixed positional labels would
# be attached to the wrong slice whenever the class balance differs.
# Both the raw codes (B/M) and the encoded ones (0/1) are recognised.
label_for = {
    "B": "Benign (0)",
    "M": "Malignant (1)",
    0: "Benign (0)",
    1: "Malignant (1)",
}
color_for = {"Benign (0)": "#66b3ff", "Malignant (1)": "#ff9999"}
slice_labels = [label_for.get(key, str(key)) for key in diagnosis_counts.index]
# Unknown classes (if any) get a neutral grey instead of a misleading colour
slice_colors = [color_for.get(label, "#cccccc") for label in slice_labels]

# Pie chart
plt.figure(figsize=(6, 6))
plt.pie(
    diagnosis_counts,
    labels=slice_labels,
    autopct="%1.1f%%",
    startangle=90,
    colors=slice_colors,
)
plt.title("Distribution of Diagnosis (Benign vs Malignant)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Select the numeric feature columns.
# `id` and `Unnamed: 32` are not features, and `diagnosis_encoded` is the
# helper column created below. If that helper already exists (this cell was
# run before), selecting it here would list it twice further down, and
# corr["diagnosis_encoded"] would then no longer be a single column.
non_feature_cols = ["id", "Unnamed: 32", "diagnosis_encoded"]
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
num_cols = numeric_cols[~numeric_cols.isin(non_feature_cols)]

# Encode diagnosis column to numeric for correlation calculation
le = LabelEncoder()
df['diagnosis_encoded'] = le.fit_transform(df['diagnosis'])

# Compute correlation matrix including the encoded diagnosis column
corr = df[num_cols.tolist() + ['diagnosis_encoded']].corr()

# Select the 10 features most correlated (by absolute value) with diagnosis;
# 11 entries are taken because 'diagnosis_encoded' itself ranks first.
top_features = (
    corr["diagnosis_encoded"].abs().sort_values(ascending=False).head(11).index
)

# Reuse the matrix computed above instead of recomputing the correlations
top_corr = corr.loc[top_features, top_features]

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(top_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Top 10 Features Correlated with Diagnosis")
plt.show()

# **Y-DATA REPORT OR PROFILING**

In [None]:
# Install if not already installed
!pip install ydata-profiling

import pandas as pd
from ydata_profiling import ProfileReport




In [None]:
# Build the profiling report for the current DataFrame (explorative mode on)
report_options = {
    "title": "Breast Cancer Dataset Profiling Report",
    "explorative": True,
}
profile = ProfileReport(df, **report_options)

# Write the report to an HTML file
profile.to_file("profiling_report.html")

# Also render it inline in the notebook
profile.to_notebook_iframe()

# **PREDICTIONS**

In [None]:
# Example: put the true labels and the logistic-regression predictions side by side
y_pred_log = log_model.predict(X_test)

# Start from the true labels, then add the predictions as a second column
pred_df = pd.DataFrame({"Actual": y_test})
pred_df["Predicted (Logistic Regression)"] = y_pred_log

first_rows = pred_df.head(10)
print(first_rows)


In [None]:
# Second column of predict_proba for every model, next to the true labels
# (whose index is reset so it starts at 0)
proba_sources = {
    "Logistic Regression": log_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
    "SVM": svm_model,
    "Naive Bayes": nb_model,
    "XGBoost": xgb_model,
}
pred_proba_df = pd.DataFrame({
    "Actual": y_test.reset_index(drop=True),
    **{
        label: estimator.predict_proba(X_test)[:, 1]
        for label, estimator in proba_sources.items()
    },
})

first_rows = pred_proba_df.head(10)
print(first_rows)


In [None]:
import matplotlib.pyplot as plt

def plot_predictions(y_test, y_pred, model_name, n=50):
    """Draw the first ``n`` true and predicted values as two overlaid series.

    Args:
        y_test: True values; must expose ``.values`` (e.g. a pandas Series).
        y_pred: Predicted values, sliceable with ``[:n]``.
        model_name: Model name inserted into the plot title.
        n: Number of leading samples to show (default 50).
    """
    actual_head = y_test.values[:n]
    predicted_head = y_pred[:n]

    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(actual_head, label="Actual", marker="o")
    ax.plot(predicted_head, label="Predicted", marker="x")
    ax.set_title(f"{model_name} Predictions vs Actual")
    ax.set_xlabel("Sample Index")
    ax.set_ylabel("Class (0=Benign, 1=Malignant)")
    ax.legend()
    plt.show()

# Example usage: one actual-vs-predicted plot per trained model
prediction_sets = (
    (y_prd_log, "Logistic Regression"),
    (y_pred_rf, "Random Forest"),
    (y_pred_dt, "Decision Tree"),
    (y_pred_svm, "SVM"),
    (y_pred_nb, "Naive Bayes"),
    (y_pred_xgb, "XGBoost"),
)
for predictions, label in prediction_sets:
    plot_predictions(y_test, predictions, label)
