In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# 1. Data Collection & Loading
# Read the dataset from 'breast_cancer_data.csv'. The file must sit in the
# notebook's working directory (or replace the name with a full path).
try:
    data_frame = pd.read_csv('breast_cancer_data.csv')
    print("Dataset loaded successfully from 'breast_cancer_data.csv'")
except FileNotFoundError:
    print("Error: 'breast_cancer_data.csv' not found. Please ensure the CSV file is in the correct directory.")
    # Fallback for demonstration purposes: use the copy bundled with scikit-learn.
    # Its target is already numeric (0 = malignant, 1 = benign), which matches
    # the encoding applied to the CSV labels below.
    from sklearn.datasets import load_breast_cancer
    breast_cancer_data = load_breast_cancer()
    data_frame = pd.DataFrame(breast_cancer_data.data, columns=breast_cancer_data.feature_names)
    data_frame['diagnosis'] = breast_cancer_data.target
    print("Loaded Breast Cancer dataset from sklearn as a fallback.")

# The 'id' column, when present, is dropped so it is not used as a feature.
if 'id' in data_frame.columns:
    data_frame = data_frame.drop(columns='id')

# Encode the 'diagnosis' labels: 'M' (Malignant) -> 0, 'B' (Benign) -> 1.
# This mapping aligns with the prediction output interpretation in the PDF (0 for Malignant).
if pd.api.types.is_numeric_dtype(data_frame['diagnosis']):
    # Already numeric (e.g. the sklearn fallback). Running .map() with string
    # keys here would replace every label with NaN, so the column is left as is.
    print("Diagnosis column is already numerical (0: Malignant, 1: Benign); no mapping applied.")
else:
    diagnosis_codes = data_frame['diagnosis'].map({'M': 0, 'B': 1})
    # A label outside {'M', 'B'} would silently become NaN; fail loudly instead.
    # Genuinely missing labels are left as NaN so the EDA null check reports them.
    unrecognised = diagnosis_codes.isna() & data_frame['diagnosis'].notna()
    if unrecognised.any():
        bad_labels = sorted(data_frame.loc[unrecognised, 'diagnosis'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'diagnosis' column: {bad_labels}; expected 'M' or 'B'.")
    data_frame['diagnosis'] = diagnosis_codes
    print("Diagnosis column mapped to numerical values (M:0, B:1).")

print("\n--- Data Collection & Loading Complete ---")

Dataset loaded successfully from 'breast_cancer_data.csv'
Diagnosis column mapped to numerical values (M:0, B:1).

--- Data Collection & Loading Complete ---


In [2]:
# 2. Exploratory Data Analysis (EDA)

# Peek at both ends of the table.
print("\nFirst 5 rows of the dataset:", data_frame.head(), sep="\n")
print("\nLast 5 rows of the dataset:", data_frame.tail(), sep="\n")

# Table dimensions as (rows, columns).
print(f"\nNumber of rows and columns (shape): {data_frame.shape}")

# Column dtypes and non-null counts; info() prints its own report.
print("\nInformation about the dataset (info()):")
data_frame.info()

# Count of missing entries in each column.
missing_per_column = data_frame.isnull().sum()
print("\nMissing values per column:", missing_per_column, sep="\n")

# Descriptive statistics for the numeric columns.
summary_statistics = data_frame.describe()
print("\nSummary statistics of the dataset (describe()):", summary_statistics, sep="\n")

# Class balance of the target column.
diagnosis_counts = data_frame['diagnosis'].value_counts()
print(
    "\nDistribution of the target variable ('diagnosis'):",
    diagnosis_counts,
    "Value 0 represents Malignant tumors, Value 1 represents Benign tumors.",
    sep="\n",
)

print("\n--- Exploratory Data Analysis Complete ---")


First 5 rows of the dataset:
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          0        17.99         10.38          122.80     1001.0   
1          0        20.57         17.77          132.90     1326.0   
2          0        19.69         21.25          130.00     1203.0   
3          0        11.42         20.38           77.58      386.1   
4          0        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.241

In [3]:
# 3. Data Preprocessing

# The target (Y) is the 'diagnosis' column; every other column is a feature (X).
Y = data_frame['diagnosis']
X = data_frame.drop(columns=['diagnosis'])

print(f"\nShape of features (X): {X.shape}")
print(f"Shape of target (Y): {Y.shape}")
print("\nFirst 5 rows of Features (X):", X.head(), sep="\n")
print("\nFirst 5 values of Target (Y):", Y.head(), sep="\n")

print("\n--- Data Preprocessing Complete ---")


Shape of features (X): (99, 30)
Shape of target (Y): (99,)

First 5 rows of Features (X):
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

 

In [4]:
# 4. Splitting the Dataset

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 2
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

print(f"\nShape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of Y_train: {Y_train.shape}")
print(f"Shape of Y_test: {Y_test.shape}")

print("\n--- Dataset Splitting Complete ---")

print("\n--- Applying Feature Scaling ---")

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both partitions so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully.")
print(f"Shape of X_train_scaled: {X_train_scaled.shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")

print("\n--- Feature Scaling Complete ---")


Shape of X_train: (79, 30)
Shape of X_test: (20, 30)
Shape of Y_train: (79,)
Shape of Y_test: (20,)

--- Dataset Splitting Complete ---

--- Applying Feature Scaling ---
Features scaled successfully.
Shape of X_train_scaled: (79, 30)
Shape of X_test_scaled: (20, 30)

--- Feature Scaling Complete ---


In [5]:
# 5. Model Training

# Generous iteration cap so the solver has room to converge.
# For very large datasets, a solver such as 'saga' or 'liblinear' with a
# specific penalty may be worth considering instead.
MAX_ITERATIONS = 2000
model = LogisticRegression(max_iter=MAX_ITERATIONS)

# Fit on the scaled training features and their labels.
model.fit(X_train_scaled, Y_train)

print("\nLogistic Regression model trained successfully.")

print("\n--- Model Training Complete ---")


Logistic Regression model trained successfully.

--- Model Training Complete ---


In [6]:
# 6. Model Evaluation

# Predict on both partitions. The model was trained on scaled features, so it
# must be given the SCALED arrays here as well.
X_train_prediction = model.predict(X_train_scaled)
X_test_prediction = model.predict(X_test_scaled)

# Fraction of correctly classified rows in each partition.
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

print(f"\nAccuracy on training data = {training_data_accuracy:.4f}")
print(f"Accuracy on test data     = {test_data_accuracy:.4f}")

print("\n--- Model Evaluation Complete ---")


Accuracy on training data = 0.9873
Accuracy on test data     = 0.9500

--- Model Evaluation Complete ---


In [8]:
# 7. Building a Predictive System


def _scale_sample(sample, fitted_scaler):
    """Convert one raw sample into the scaled single-row input the model expects.

    Parameters
    ----------
    sample : sequence of numbers
        Raw feature values for a single data point, in the same column order
        as the data the scaler was fitted on.
    fitted_scaler : fitted scaler object
        The scaler that was fitted on the training features; reusing it keeps
        the sample on the same scale as the training data.

    Returns
    -------
    The result of ``fitted_scaler.transform`` for the one-row input.

    Raises
    ------
    ValueError
        If the sample does not contain as many values as the scaler was fitted on.
    """
    # One data point -> a 2-D array with a single row.
    row = np.asarray(sample, dtype=float).reshape(1, -1)

    expected_count = getattr(fitted_scaler, "n_features_in_", None)
    if expected_count is not None and row.shape[1] != expected_count:
        raise ValueError(
            f"Sample has {row.shape[1]} features, but the scaler was fitted on {expected_count}."
        )

    # When the scaler was fitted on a DataFrame it remembers the column names.
    # Re-attaching them avoids scikit-learn's "X does not have valid feature
    # names" warning that a bare array would trigger.
    feature_names = getattr(fitted_scaler, "feature_names_in_", None)
    if feature_names is not None:
        row = pd.DataFrame(row, columns=feature_names)

    return fitted_scaler.transform(row)


# Input a sample data point (ensure it has 30 features as per the dataset)
# This sample data point corresponds to a 'B' (Benign) diagnosis from your CSV (id 8510426)
input_data = (13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,
              0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,
              15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259)

# IMPORTANT: scale the sample with the SAME scaler that was fitted on the training data
input_data_scaled = _scale_sample(input_data, scaler)

# Predict the output using the trained model (it expects scaled input)
prediction = model.predict(input_data_scaled)
print(f"\nRaw prediction output: {prediction[0]}")

# Interpret the prediction: 0 is Malignant, anything else is reported as Benign
if prediction[0] == 0:
    print('The Breast Cancer is Malignant')
else:
    print('The Breast Cancer is Benign')

# Test with a known Malignant sample from your CSV (e.g., id 842302)
malignant_sample = (17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,
                    1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,
                    25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189)

# IMPORTANT: scale the malignant sample with the SAME scaler
malignant_sample_scaled = _scale_sample(malignant_sample, scaler)

malignant_prediction = model.predict(malignant_sample_scaled)

print(f"\nRaw prediction for a known Malignant sample: {malignant_prediction[0]}")
if malignant_prediction[0] == 0:
    print('The Breast Cancer is Malignant (Correctly Predicted)')
else:
    print('The Breast Cancer is Benign (Incorrectly Predicted)')

print("\n--- Predictive System Complete ---")


Raw prediction output: 1
The Breast Cancer is Benign

Raw prediction for a known Malignant sample: 0
The Breast Cancer is Malignant (Correctly Predicted)

--- Predictive System Complete ---


