In [None]:
!rm -f data.csv

from google.colab import files

# Opens a file picker in Google Colab so you can upload files from your computer
uploaded = files.upload()

# Loop through all uploaded files
for fn in uploaded.keys():
    # Print the filename and its size in bytes
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])
    ))


In [None]:
# Import pandas for data handling and io for wrapping the uploaded text
import pandas as pd
import io

# Name of the upload this notebook expects to find in `uploaded`
csv_name = 'data.csv'

# Fail early with an actionable message when the expected file is missing,
# instead of a bare KeyError on the lookup below.
if csv_name not in uploaded:
    raise KeyError(
        "Expected an uploaded file named {!r}; got: {}".format(
            csv_name, sorted(uploaded)
        )
    )

# Read the uploaded CSV file into a pandas DataFrame
# - uploaded[csv_name] is the uploaded file content (in bytes)
# - .decode('utf-8') converts the bytes into a string
# - io.StringIO(...) makes the string behave like a file
# - pd.read_csv(...) loads the CSV into a structured DataFrame
data = pd.read_csv(io.StringIO(uploaded[csv_name].decode('utf-8')))

# Display the first 5 rows of the dataset to verify it loaded correctly
data.head()


In [None]:
# Importing seaborn for data visualization
import seaborn as sns

# Count plot (bar chart) of the 'diagnosis' column: one bar per category.
# The column is passed by name (x=..., data=...) rather than as a positional
# Series, which is the calling form seaborn documents for long-form data.
ax = sns.countplot(x='diagnosis', data=data)
ax.set(xlabel='Diagnosis', ylabel='Count', title='Diagnosis class counts')

# value_counts() orders its result by frequency, not by label, so each class
# is looked up by its own label instead of being unpacked positionally.
# .get(..., 0) keeps the cell working if one class is absent from the data.
diagnosis_counts = data['diagnosis'].value_counts()
B = diagnosis_counts.get('B', 0)
M = diagnosis_counts.get('M', 0)

# Prints the count of Benign cases
print('Benign', B)

# Prints the count of Malignant cases
print('Malignant', M)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# -----------------------------
# Splitting dataset into Features (X) and Labels (y)
# -----------------------------

# X: all rows, and all columns from index 2 onwards (skips 'id' and 'diagnosis'),
#    i.e. the numeric feature columns such as radius_mean, texture_mean, etc.
#    Columns that are empty in every row are dropped first, so a column with no
#    values cannot end up in the feature matrix (the network built later expects
#    exactly 30 inputs). When no such column exists this is a no-op.
#    NOTE(review): some CSV exports end each line with a trailing separator,
#    which is read as an extra value-less column; confirm whether this file does.
X = data.iloc[:, 2:].dropna(axis=1, how='all').values

# y: selecting the diagnosis column (index 1).
#    This contains 'M' (Malignant) or 'B' (Benign) for each patient.
y = data.iloc[:, 1].values


# -----------------------------
# Encoding categorical labels
# -----------------------------
from sklearn.preprocessing import LabelEncoder

# Encoder that turns the text labels into integer codes:
#   'B' (Benign)    → 0
#   'M' (Malignant) → 1
labelencoder_X_1 = LabelEncoder()

# Learn the label set first, then replace y with its encoded form
labelencoder_X_1.fit(y)
y = labelencoder_X_1.transform(y)


# -----------------------------
# Splitting the dataset into Training and Test sets
# -----------------------------
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training:
#   X_train, y_train → used to fit the model
#   X_test, y_test   → used only to evaluate it
# random_state=0 pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# -----------------------------
# Feature Scaling
# -----------------------------
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling from the training rows only, so nothing about
# the test rows leaks into the preprocessing.
sc = StandardScaler().fit(X_train)

# Standardize both sets with those training statistics. Afterwards large-valued
# features (e.g. 'area_mean') no longer dominate small-valued ones
# (e.g. 'smoothness_mean').
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [None]:
# Inspect the standardized training features
X_train

In [None]:
# Inspect the test features after applying the training-set scaling
X_test

In [None]:
!pip install keras

In [None]:
import keras

In [None]:
# Import necessary libraries from Keras for building a neural network
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.utils import set_random_seed

# Seed the Python, NumPy and backend random generators before any layer is
# created, so weight initialisation, dropout masks and batch shuffling repeat
# from one fresh run to the next. (Some accelerator kernels may still add
# small run-to-run differences.)
set_random_seed(0)

# -----------------------------
# Building the Neural Network model
# -----------------------------

# Initialize a Sequential model (a linear stack of layers)
classifier = Sequential()

# Input layer: expects input vectors of size 30 (one value per feature column)
classifier.add(Input(shape=(30,)))

# First hidden layer:
# - Dense = fully connected layer with 16 neurons
# - kernel_initializer='uniform' initializes weights randomly with a uniform distribution
# - activation='relu' introduces non-linearity (ReLU is common in hidden layers)
classifier.add(Dense(units=16, kernel_initializer='uniform', activation='relu'))

# Dropout layer:
# - Randomly drops 50% of the neurons during training
# - Helps prevent overfitting (important since dataset is relatively small, ~569 rows)
classifier.add(Dropout(rate=0.5))


In [None]:
# NOTE: this cell appends to the existing `classifier`; running it again
# without first re-creating the model stacks these layers a second time.

# Second hidden layer: same shape as the first (16 ReLU units)
classifier.add(
    Dense(units=16, kernel_initializer='uniform', activation='relu')
)

# Dropout after the second hidden layer; 0.5 is the upper end of the
# 0.2–0.5 range that is usually used in practice
classifier.add(
    Dropout(rate=0.5)
)

# Output layer: a single sigmoid unit for the binary decision
classifier.add(
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid')
)


In [None]:
# Configure the model for training:
# - optimizer: Adam (adaptive learning rate)
# - loss: binary cross-entropy, matching the two-class (Malignant vs Benign) target
# - metrics: accuracy is reported during training and evaluation
classifier.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Fit the network on the scaled training features and encoded labels
# (0 = Benign, 1 = Malignant):
# - batch_size=100: samples processed per weight update
# - epochs=150: full passes over the training set
classifier.fit(X_train, y_train, batch_size=100, epochs=150)


In [None]:
# Show the scaled test features again, right before they are used for prediction
X_test


In [None]:
# -----------------------------
# Predicting the Test set results
# -----------------------------

# The sigmoid output is a probability between 0 and 1 for each test sample.
# Thresholding it yields the class decision in one step:
#   probability >  0.5 → True  (class 1, Malignant)
#   probability <= 0.5 → False (class 0, Benign)
y_pred = classifier.predict(X_test) > 0.5


In [None]:
# -----------------------------
# Making the Confusion Matrix
# -----------------------------

from sklearn.metrics import confusion_matrix

# Tabulate the actual test labels against the model's predicted labels
# to evaluate the classifier's performance
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [None]:
# -----------------------------
# Visualizing the Confusion Matrix
# -----------------------------

# Use an explicit figure/axes pair so the figure that is drawn is exactly the
# one that gets saved, independent of matplotlib's "current figure" state.
fig, ax = plt.subplots()

# Plot the confusion matrix as a heatmap.
# - annot=True writes the counts (TN, FP, FN, TP) inside the cells
# - fmt='d' prints them as plain integers; the default format switches to
#   scientific notation once a count reaches three digits
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# Rows are the actual classes and columns the predicted ones
ax.set(
    xlabel='Predicted label (0 = Benign, 1 = Malignant)',
    ylabel='True label (0 = Benign, 1 = Malignant)',
    title='Confusion matrix',
)

# Save the plotted heatmap as an image file 'h.png'
fig.savefig('h.png')
