# Breast Cancer Coimbra Dataset Analysis
This notebook walks through exploratory data analysis (EDA) and model building for breast cancer prediction on the Breast Cancer Coimbra dataset.

## Step 1: Load and Inspect Data

In [None]:

import pandas as pd

# Load the dataset
csv_file_path = '/mnt/data/breast_cancer_coimbra/dataR2.csv'
df = pd.read_csv(csv_file_path)

# Inspect the data. df.info() prints its summary itself and returns None, so it
# is called on its own instead of being packed into an output tuple (which would
# also force the frames into their plain-text repr).
df.info()

# display() is the notebook built-in that renders each object with its rich repr.
display(df.head())      # first few rows
display(df.describe())  # summary statistics of the numeric columns

# Missing values per column (last expression, shown as the cell output)
df.isnull().sum()


## Step 2: Data Cleaning

In [None]:

# Count the missing entries in every column of df.
# Nothing is imputed in this cell; the counts are only displayed.
missing_values = df.isna().sum()
missing_values


## Step 3: Visualize Data

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Distribution overview: one 20-bin histogram per numerical column of df
hist_options = {"figsize": (10, 8), "bins": 20}
df.hist(**hist_options)

# Tighten the subplot spacing before rendering the figure
plt.tight_layout()
plt.show()


## Step 4: Correlation Matrix

In [None]:

# Pairwise correlations between the columns of df
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap (two decimals per cell),
# drawing on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


## Step 5: Train-Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Separate the label column (y) from the predictor columns (X)
target_column = 'Classification'
y = df[target_column]
X = df.drop(columns=target_column)

# Stage 1: hold out 30% of the rows, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Stage 2: split the hold-out in half into validation and test parts,
# again stratified on the label
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Shapes of the three feature splits, followed by the three label splits
feature_shapes = tuple(part.shape for part in (X_train, X_val, X_test))
label_shapes = tuple(part.shape for part in (y_train, y_val, y_test))
feature_shapes, label_shapes


## Step 6: Train Logistic Regression Model

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train the model.
# The features are fed to the solver unscaled, so the default limit of 100
# iterations may stop before convergence (sklearn then emits a
# ConvergenceWarning). max_iter is raised to give the solver room to finish.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# classification_report returns a multi-line string; printing it keeps the
# table layout, whereas showing it inside a tuple renders literal "\n" escapes.
print(f"Test accuracy: {accuracy:.3f}")
print(report)

# Confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix


## Step 7: Make Recommendations Based on Predictions

In [None]:

# Function to make recommendations based on prediction
def make_recommendation(patient_data, classifier=None, positive_label=2):
    """Predict the class of a single patient and phrase a recommendation.

    Parameters
    ----------
    patient_data : array-like with exactly one row
        Feature values for one patient, in the column order the classifier
        was trained on.
    classifier : object with ``predict`` and ``predict_proba``, optional
        Estimator to use. Defaults to the notebook-level ``model``.
    positive_label : optional
        Value of the target that denotes a cancer patient. The UCI description
        of the Coimbra dataset codes ``Classification`` as 1 = healthy
        controls and 2 = patients, hence the default of 2 (verify against
        your copy of the data if it has been re-encoded).

    Returns
    -------
    tuple
        ``(prediction, probability, recommendation)``: the raw outputs of
        ``predict`` and ``predict_proba`` plus the recommendation text.

    Raises
    ------
    ValueError
        If the prediction does not contain exactly one entry, since a single
        recommendation string cannot describe several patients.
    """
    clf = model if classifier is None else classifier

    # Predict the cancer classification and probability
    prediction = clf.predict(patient_data)
    probability = clf.predict_proba(patient_data)

    if len(prediction) != 1:
        raise ValueError(
            "make_recommendation expects data for exactly one patient, "
            f"got {len(prediction)} rows."
        )

    # Generate recommendation based on the prediction. The high-risk branch
    # must fire on the patient label, not on label 1 (healthy controls).
    if prediction[0] == positive_label:
        recommendation = "Recommendation: The patient is at high risk for breast cancer. Further diagnostic tests are recommended."
    else:
        recommendation = "Recommendation: The patient is at low risk. Continue regular health monitoring."

    return prediction, probability, recommendation

# Example: New patient data for prediction (replace with actual feature values).
# The values must follow the column order of X. Wrapping them in a DataFrame
# with X's column names keeps that order explicit and avoids sklearn's
# "X does not have valid feature names" warning, since the model was fitted
# on a DataFrame.
new_patient = pd.DataFrame(
    [[60, 24.5, 90, 15, 3.4, 12.5, 4.1, 8.3, 550]],  # Replace with real feature values
    columns=X.columns,
)
prediction, probability, recommendation = make_recommendation(new_patient)

prediction, probability, recommendation
