# Breast Cancer Diagnostic Neural Network Practice 

## Importing the Required Packages

In [None]:
import pandas as pd  # contains functions for working with datasets
from sklearn.model_selection import train_test_split  # allows for the splitting of the dataset into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # contains functions for preprocessing the dataset, such as normalization and encoding categorical variables
import tensorflow as tf  # provides a comprehensive library for building and training neural networks and other machine learning models
from tensorflow.keras.models import Sequential  # a Keras API for creating neural network models layer by layer
from tensorflow.keras.layers import Dense  # provides the Dense layer, which is a fully connected neural network layer
from tensorflow.keras.optimizers import Adam  # includes the Adam optimizer, a popular optimization algorithm for training neural networks

## Exploring the Dataset

Exploring the dataset is critical to understanding the data being worked with. It is important to know the dimensions of the dataset, as well as whether there are missing or duplicate values. 

### Loading the Dataset 

In [6]:
# Load the data from a CSV file (the path is relative to the working directory)
file_path = 'brca.csv'
data = pd.read_csv(file_path)
# Preview the first rows; as the last expression of the cell it is shown as the cell output
data.head()

Unnamed: 0.1,Unnamed: 0,x.radius_mean,x.texture_mean,x.perimeter_mean,x.area_mean,x.smoothness_mean,x.compactness_mean,x.concavity_mean,x.concave_pts_mean,x.symmetry_mean,...,x.texture_worst,x.perimeter_worst,x.area_worst,x.smoothness_worst,x.compactness_worst,x.concavity_worst,x.concave_pts_worst,x.symmetry_worst,x.fractal_dim_worst,y
0,1,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,...,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259,B
1,2,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,0.1967,...,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183,B
2,3,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,...,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773,B
3,4,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,B
4,5,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,...,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409,B


### Checking for Missing Values 

This is a function that I have written previously that should work on any dataset. It counts the missing values in each column of the dataset, so any column with a non-zero count contains missing data. 

In [7]:
def check_missing_values(data: pd.DataFrame) -> pd.Series:
    """Count the missing values in each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.

    Returns
    -------
    pandas.Series
        Number of null entries per column, indexed by column name.
    """
    # isna() marks every missing cell; summing the boolean mask counts them per column
    return data.isna().sum()
# Check for missing values: print the number of missing entries found in each column
missing_values_summary = check_missing_values(data)
print(missing_values_summary)

Unnamed: 0             0
x.radius_mean          0
x.texture_mean         0
x.perimeter_mean       0
x.area_mean            0
x.smoothness_mean      0
x.compactness_mean     0
x.concavity_mean       0
x.concave_pts_mean     0
x.symmetry_mean        0
x.fractal_dim_mean     0
x.radius_se            0
x.texture_se           0
x.perimeter_se         0
x.area_se              0
x.smoothness_se        0
x.compactness_se       0
x.concavity_se         0
x.concave_pts_se       0
x.symmetry_se          0
x.fractal_dim_se       0
x.radius_worst         0
x.texture_worst        0
x.perimeter_worst      0
x.area_worst           0
x.smoothness_worst     0
x.compactness_worst    0
x.concavity_worst      0
x.concave_pts_worst    0
x.symmetry_worst       0
x.fractal_dim_worst    0
y                      0
dtype: int64


### Checking for Duplicate Values 

This is another function that I have written previously. It counts how many rows of the dataset are exact duplicates of an earlier row. 

In [None]:
def check_duplicate_values(data: pd.DataFrame):
    """Count the rows that exactly repeat an earlier row of the DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.

    Returns
    -------
    integer
        Number of duplicated rows; the first occurrence of each row is not counted.
    """
    # Boolean mask: True for every row that repeats a row seen earlier
    is_repeat = data.duplicated()
    return is_repeat.sum()
# Check for duplicate values: report how many rows of the dataset are duplicates
duplicate_values_summary = check_duplicate_values(data)
print(f"Number of duplicate rows: {duplicate_values_summary}")

## Preparing the Dataset 

Now that the data has been explored, it is time to prepare it for the model. <br>
First, we can drop the column that is unnamed and just contains the patient numbers. <br>
Then, we encode the target variable (turn it into either a 0 or 1) so that the model can work with it numerically. <br>
Then, we define what are the features and what is the target variable. <br>
Next, we split the data into training (80%) and testing (20%) sets and standardize the features, fitting the scaler on the training set only so that no information from the test set leaks into training. <br>
Finally, setting a random seed helps the random processes involved in the training of models produce the same results each time you run the code. This includes processes like the initialization of neural network weights, shuffling of data, and splitting of datasets (the split uses its own `random_state=42`). The number 42 is arbitrary. 

In [9]:
# Single seed shared by the train/test split and the model's random processes (42 is arbitrary)
SEED = 42

# Drop the unnamed column that just contains numbers.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so on a second
# run the column is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Encode the target variable as integers so the model receives a numeric target.
# LabelEncoder assigns codes in sorted label order; inspect label_encoder.classes_
# to see which original label maps to 0 and which to 1.
label_encoder = LabelEncoder()
data['y'] = label_encoder.fit_transform(data['y'])

# Separate features and target
X = data.drop(columns=['y'])
y = data['y']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Normalize the feature columns.
# The scaler is fitted on the training set only and then applied to the test set,
# so no statistics from the test data leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Set random seeds for reproducibility.
# set_random_seed seeds Python's `random`, NumPy and TensorFlow together, whereas
# tf.random.set_seed alone leaves the other generators unseeded.
tf.keras.utils.set_random_seed(SEED)

##  Defining the Neural Network Model

This code snippet defines a neural network model using the Sequential API from Keras. The Sequential model is a linear stack of layers, where each layer has exactly one input tensor and one output tensor. <br>

In [10]:
# Define the neural network model: three fully connected ReLU layers (64 -> 32 -> 16 units)
# followed by a single sigmoid unit whose output lies between 0 and 1.
model = Sequential([
    # Declare the input width explicitly (one value per feature column) instead of passing
    # input_shape to the first Dense layer, which current Keras versions warn against.
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # Output layer for binary classification
])

In [11]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
LEARNING_RATE = 0.001  # step size passed to the Adam optimizer

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

In [12]:
# Train the model
EPOCHS = 50                # full passes over the training data
BATCH_SIZE = 32            # samples per gradient update
VALIDATION_FRACTION = 0.2  # share of the training data held out for validation

history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_FRACTION,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
# Evaluate the model on the test set (verbose=0 suppresses the progress output)
test_metrics = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = test_metrics  # loss first, then the 'accuracy' metric

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 0.14455313980579376
Test Accuracy: 0.9736841917037964


In [15]:
# Save the trained model to disk (the path is relative to the working directory).
# NOTE(review): the .h5 extension presumably selects Keras' legacy HDF5 format; newer Keras
# releases recommend the native .keras format - confirm nothing depends on this filename before switching.
model.save('breastcancernn.h5')