## **Final Python Notebook 1: Data Understanding, Cleaning and Preparation**


**Author:** Ashalya Nelson Paranagama

**Code Peer Reviewer:** Dinuli Hasara Hendawitharana

**Date of Peer Review:** 25 February 2025

### Code reused from: Seminar Session 1 and 2 – Data Loading and Libraries

In [None]:
# Importing necessary libraries for data manipulation, visualization, and modeling

# Data handling and analysis
import pandas as pd

# Numerical operations
import numpy as np

# Data visualization
import matplotlib.pyplot as plt

# Statistical data visualization
import seaborn as sns

# Import Plotly Express for interactive visualizations
import plotly.express as px

# Splitting the data into training and testing
from sklearn.model_selection import train_test_split

# Data scaling and encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Metrics for evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Additional sklearn preprocessing
from sklearn import preprocessing

# Normalization
from sklearn.preprocessing import MinMaxScaler

### Code reused from: Seminar Session 1 and 2 – Data Loading and Initial Exploration

In [None]:
# Read the raw breast cancer coursework dataset from Google Drive into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/ML/CW/Coursework Dataset(25012025v6.0).csv')

### Code reused from: Seminar Session 1 and 2 – Display the first five rows of the dataset (index 0 to 4)

In [None]:
# Preview the five leading rows of the raw data
df.head(n=5)

Unnamed: 0,Patient_ID,Month_of_Birth,Age,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Mortality_Status
0,A0012,12,68.0,Female,Teaching,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,A0013,12,50.0,Female,Medical,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,A0014,11,58.0,Female,Engineering,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,A0015,3,58.0,Female,Technology,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2.0,1,84,Alive
4,A0016,1,47.0,Female,Multimedia,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3.0,1,50,Alive


### Code reused from: Seminar Session 1 and 2 – Display the names of the columns in the dataset

In [None]:
# List every column label as a plain Python list
df.columns.tolist()

['Patient_ID',
 'Month_of_Birth',
 'Age',
 'Sex',
 'Occupation',
 'T_Stage',
 'N_Stage',
 '6th_Stage',
 'Differentiated',
 'Grade',
 'A_Stage',
 'Tumor_Size',
 'Estrogen_Status',
 'Progesterone_Status',
 'Regional_Node_Examined',
 'Reginol_Node_Positive',
 'Survival_Months',
 'Mortality_Status']

### Code reused from: Seminar Session 1 and 2 – Summary of the dataset that includes the data type of each column

In [None]:
# Print a structural summary of the raw DataFrame: one line per column with its
# non-null count and dtype, which makes columns with missing entries easy to spot
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Patient_ID              4024 non-null   object 
 1   Month_of_Birth          4024 non-null   int64  
 2   Age                     4015 non-null   float64
 3   Sex                     4020 non-null   object 
 4   Occupation              43 non-null     object 
 5   T_Stage                 4024 non-null   object 
 6   N_Stage                 4024 non-null   object 
 7   6th_Stage               4024 non-null   object 
 8   Differentiated          4024 non-null   object 
 9   Grade                   4024 non-null   int64  
 10  A_Stage                 4024 non-null   object 
 11  Tumor_Size              4021 non-null   float64
 12  Estrogen_Status         4024 non-null   object 
 13  Progesterone_Status     4024 non-null   object 
 14  Regional_Node_Examined  4023 non-null   

### Code reused from: Seminar Session 1 and 2 – Summarise the central tendency, dispersion, and shape of a dataset’s distribution, excluding NaN values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The min/max rows are worth reading closely: they expose impossible or extreme
# entries (e.g. a negative Age and Tumor_Size) that are dealt with later on.
df.describe()

Unnamed: 0,Month_of_Birth,Age,Grade,Tumor_Size,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months
count,4024.0,4015.0,4024.0,4021.0,4023.0,4024.0,4024.0
mean,6.481362,54.107098,2.150596,30.419299,14.373602,4.158052,71.472167
std,3.475442,11.715528,0.638234,21.16108,8.129293,5.109331,25.361855
min,1.0,-50.0,1.0,-75.0,1.0,1.0,1.0
25%,3.0,47.0,2.0,16.0,9.0,1.0,56.0
50%,6.0,54.0,2.0,25.0,14.0,2.0,73.0
75%,10.0,61.0,3.0,38.0,19.0,5.0,90.0
max,12.0,502.0,4.0,140.0,61.0,46.0,760.0


### Code reused from: Seminar Session 1 and 2 – Display the count, unique, top, and freq of the categorical columns

In [None]:
# Summarise the text (object) columns: count, number of distinct values, mode and its frequency
df.describe(include=['object'])

Unnamed: 0,Patient_ID,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,A_Stage,Estrogen_Status,Progesterone_Status,Mortality_Status
count,4024,4020,43,4024,4024,4024,4024,4024,4024,4024,4024
unique,4024,2,40,4,3,5,4,2,2,2,7
top,A4035,Female,House Person,T2,N1,IIA,Moderately differentiated,Regional,Positive,Positive,Alive
freq,1,4001,2,1786,2732,1305,2351,3932,3755,3326,3399


### Code reused from: Seminar Session 1 and 2 – Display the number of rows (instances) and columns (features)

In [None]:
# Size of the raw dataset as a (rows, columns) tuple
df.shape

(4024, 18)

### Code reused from: Seminar Session 1 and 2 – Handle Missing Values

In [None]:
# Count the missing values in each column of the raw data.
# No cleaning has been applied at this point, so the label says "before cleaning".
print("Missing values before cleaning:", df.isnull().sum())

Missing values after cleaning: Patient_ID                   0
Month_of_Birth               0
Age                          9
Sex                          4
Occupation                3981
T_Stage                      0
N_Stage                      0
6th_Stage                    0
Differentiated               0
Grade                        0
A_Stage                      0
Tumor_Size                   3
Estrogen_Status              0
Progesterone_Status          0
Regional_Node_Examined       1
Reginol_Node_Positive        0
Survival_Months              0
Mortality_Status             0
dtype: int64


In [None]:
# Share of missing entries in each column, expressed as a percentage of all rows
missing_data = df.isna().sum().div(len(df)).mul(100)
print(f"Missing Data Percentage per Feature: \n{missing_data}")

Missing Data Percentage per Feature: 
Patient_ID                 0.000000
Month_of_Birth             0.000000
Age                        0.223658
Sex                        0.099404
Occupation                98.931412
T_Stage                    0.000000
N_Stage                    0.000000
6th_Stage                  0.000000
Differentiated             0.000000
Grade                      0.000000
A_Stage                    0.000000
Tumor_Size                 0.074553
Estrogen_Status            0.000000
Progesterone_Status        0.000000
Regional_Node_Examined     0.024851
Reginol_Node_Positive      0.000000
Survival_Months            0.000000
Mortality_Status           0.000000
dtype: float64


### Code reused from: Seminar Session 1 and 2 – Removing unnecessary variables from the analysis

In [None]:
# Remove the patient identifier, the birth month and the almost entirely empty
# 'Occupation' column, then preview the reduced frame
columns_to_remove = ['Patient_ID', 'Month_of_Birth', 'Occupation']
df.drop(columns=columns_to_remove, inplace=True)
df.head()

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Mortality_Status
0,68.0,Female,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,50.0,Female,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,58.0,Female,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,58.0,Female,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2.0,1,84,Alive
4,47.0,Female,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3.0,1,50,Alive


### Code reused from: Seminar Session 1 and 2 – Drop the rows that have null values

In [None]:
# Discard every row that still contains at least one missing value, then report the new size
df.dropna(axis=0, how='any', inplace=True)
df.shape

(4007, 15)

### Code reused from: Seminar Session 1 and 2 – Getting a specific value of a variable

In [None]:
# Inspect which distinct labels occur in the 'Sex' column
df.loc[:, 'Sex'].unique()

array(['Female', '1'], dtype=object)

### Code reused from: Seminar Session 1 and 2 –  Renaming categorical values (labels)

In [None]:
# Turn 'Sex' into integer codes: the raw label 'Female' becomes 0 and the raw label '1' becomes 1
sex_codes = {'Female': 0, '1': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# Confirm that only the numeric codes remain
df['Sex'].unique()

array([0, 1])

In [None]:
# List the distinct spellings present in 'Mortality_Status' (the casing is inconsistent)
df.loc[:, 'Mortality_Status'].unique()

array(['Alive', 'Dead', 'ALIVE', 'DEAD', 'ALive', 'alive', 'dead'],
      dtype=object)

In [None]:
# Count how often each spelling of 'Mortality_Status' occurs
df.loc[:, 'Mortality_Status'].value_counts()

Unnamed: 0_level_0,count
Mortality_Status,Unnamed: 1_level_1
Alive,3383
Dead,597
DEAD,10
dead,8
ALIVE,5
alive,3
ALive,1


### Code reused from: Seminar Session 1 and 2 – Encoding categorical values for each variables

In [None]:
# Replace the tumour stage labels with their ordinal number: 'T1' -> 1, ..., 'T4' -> 4
t_stage_codes = {f'T{level}': level for level in range(1, 5)}
df['T_Stage'] = df['T_Stage'].map(t_stage_codes)

# Confirm the column now contains only the numeric codes
df['T_Stage'].unique()

array([1, 2, 3, 4])

In [None]:
# Replace the nodal stage labels with their ordinal number: 'N1' -> 1, 'N2' -> 2, 'N3' -> 3
n_stage_codes = {f'N{level}': level for level in range(1, 4)}
df['N_Stage'] = df['N_Stage'].map(n_stage_codes)

# Confirm the column now contains only the numeric codes
df['N_Stage'].unique()

array([1, 2, 3])

In [None]:
# Encode '6th_Stage' as an ordinal variable whose codes follow the stage order
# IIA < IIB < IIIA < IIIB < IIIC (1..5). Numbering the stages in order of first
# appearance would give IIB a larger code than IIIA/IIIC, which misleads any model
# that treats the column as numeric (it is standardised later in this notebook).
# NOTE(review): this changes the codes written to the saved CSVs; anything that
# decodes '6th_Stage' with the old appearance-order mapping must be updated too.
sixth_stage_order = ['IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC']
sixth_stage_codes = {stage: code for code, stage in enumerate(sixth_stage_order, start=1)}
df['6th_Stage'] = df['6th_Stage'].map(sixth_stage_codes)

# Display the unique numeric values in the '6th_Stage' column to verify the encoding
df['6th_Stage'].unique()

array([1, 2, 3, 4, 5])

In [None]:
# Map each differentiation label to its numeric code:
# Undifferentiated (0), Poorly (1), Moderately (2), Well (3)
differentiation_codes = {
    'Undifferentiated': 0,
    'Poorly differentiated': 1,
    'Moderately differentiated': 2,
    'Well differentiated': 3,
}
df['Differentiated'] = df['Differentiated'].map(differentiation_codes)

# Confirm the column now contains only the numeric codes
df['Differentiated'].unique()

array([1, 2, 3, 0])

In [None]:
# Build a scikit-learn LabelEncoder, which assigns one integer code per distinct label
label_encoder = preprocessing.LabelEncoder()

# Learn the labels present in 'A_Stage', then replace the column with their integer codes
label_encoder.fit(df['A_Stage'])
df['A_Stage'] = label_encoder.transform(df['A_Stage'])

# Confirm the column now contains only the numeric codes
df['A_Stage'].unique()

array([1, 0])

In [None]:
# Binary-encode the estrogen receptor status: 'Negative' -> 0, 'Positive' -> 1
estrogen_codes = {'Negative': 0, 'Positive': 1}
df['Estrogen_Status'] = df['Estrogen_Status'].map(estrogen_codes)

# Confirm the column now contains only the numeric codes
df['Estrogen_Status'].unique()

array([1, 0])

In [None]:
# Binary-encode the progesterone receptor status: 'Negative' -> 0, 'Positive' -> 1
progesterone_codes = {'Negative': 0, 'Positive': 1}
df['Progesterone_Status'] = df['Progesterone_Status'].map(progesterone_codes)

# Confirm the column now contains only the numeric codes
df['Progesterone_Status'].unique()

array([1, 0])

In [None]:
# Normalise the inconsistent casing first ('ALIVE', 'ALive', 'DEAD', ...),
# then encode the target: 'alive' -> 0, 'dead' -> 1
mortality_lowercase = df['Mortality_Status'].str.lower()
df['Mortality_Status'] = mortality_lowercase.map({'alive': 0, 'dead': 1})

# Confirm the column now contains only the numeric codes
df['Mortality_Status'].unique()

array([0, 1])

### Code reused from: Seminar Session 1 and 2 – Save Cleaned Data

In [None]:
# Persist the encoded dataset to Google Drive (without the index column) for the next steps
df.to_csv(path_or_buf='/content/drive/MyDrive/ML/CW/prepared_breast_cancer_data.csv', index=False)

In [None]:
# Read the encoded dataset back from Google Drive into a fresh DataFrame
prepared_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/ML/CW/prepared_breast_cancer_data.csv')

### Code reused from: Seminar Session 1 and 2 – Data Exploration and Visualisation (Histograms)

In [None]:
# Histogram showing how the 'Age' values are distributed
Age_fig = px.histogram(data_frame=prepared_df, x='Age')

# Render the figure
Age_fig.show()

In [None]:
# Histogram showing how the encoded 'Sex' values are distributed
Sex_fig = px.histogram(data_frame=prepared_df, x='Sex')

# Render the figure
Sex_fig.show()

In [None]:
# Histogram showing how the encoded 'T_Stage' values are distributed
T_Stage_fig = px.histogram(data_frame=prepared_df, x='T_Stage')

# Render the figure
T_Stage_fig.show()

In [None]:
# Histogram showing how the encoded 'N_Stage' values are distributed
N_Stage_fig = px.histogram(data_frame=prepared_df, x='N_Stage')

# Render the figure
N_Stage_fig.show()

In [None]:
# Histogram showing how the encoded '6th_Stage' values are distributed
sixth_Stage_fig = px.histogram(data_frame=prepared_df, x='6th_Stage')

# Render the figure
sixth_Stage_fig.show()

In [None]:
# Histogram showing how the encoded 'Differentiated' values are distributed
Differentiated_fig = px.histogram(data_frame=prepared_df, x='Differentiated')

# Render the figure
Differentiated_fig.show()

In [None]:
# Histogram showing how the 'Grade' values are distributed
Grade_fig = px.histogram(data_frame=prepared_df, x='Grade')

# Render the figure
Grade_fig.show()

In [None]:
# Histogram showing how the encoded 'A_Stage' values are distributed
A_Stage_fig = px.histogram(data_frame=prepared_df, x='A_Stage')

# Render the figure
A_Stage_fig.show()

In [None]:
# Histogram showing how the 'Tumor_Size' values are distributed
Tumor_Size_fig = px.histogram(data_frame=prepared_df, x='Tumor_Size')

# Render the figure
Tumor_Size_fig.show()

In [None]:
# Histogram showing how the encoded 'Estrogen_Status' values are distributed
Estrogen_Status_fig = px.histogram(data_frame=prepared_df, x='Estrogen_Status')

# Render the figure
Estrogen_Status_fig.show()

In [None]:
# Histogram showing how the encoded 'Progesterone_Status' values are distributed
Progesterone_Status_fig = px.histogram(data_frame=prepared_df, x='Progesterone_Status')

# Render the figure
Progesterone_Status_fig.show()

In [None]:
# Histogram showing how the 'Regional_Node_Examined' values are distributed
RNE_fig = px.histogram(data_frame=prepared_df, x='Regional_Node_Examined')

# Render the figure
RNE_fig.show()

In [None]:
# Histogram showing how the 'Reginol_Node_Positive' values are distributed
# (the column name is spelled 'Reginol' in the dataset itself)
RNP_fig = px.histogram(data_frame=prepared_df, x='Reginol_Node_Positive')

# Render the figure
RNP_fig.show()

In [None]:
# Histogram showing how the 'Survival_Months' values are distributed
SM_fig = px.histogram(data_frame=prepared_df, x='Survival_Months')

# Render the figure
SM_fig.show()

In [None]:
# Histogram showing how the encoded 'Mortality_Status' target is distributed
MS_fig = px.histogram(data_frame=prepared_df, x='Mortality_Status')

# Render the figure
MS_fig.show()

### Code reused from: Seminar Session 1 and 2 – Scatter plot

In [None]:
# Scatter plot of 'Tumor_Size' (vertical axis) against 'Age' (horizontal axis)
age_values = prepared_df['Age']
tumor_size_values = prepared_df['Tumor_Size']
Age_Tumor_Size_Association = px.scatter(x=age_values, y=tumor_size_values)

# Render the figure
Age_Tumor_Size_Association.show()

In [None]:
# Scatter plot of 'Survival_Months' (vertical axis) against 'Age' (horizontal axis)
Age_Survival_Months_Association = px.scatter(data_frame=prepared_df, x="Age", y="Survival_Months")

# Render the figure
Age_Survival_Months_Association.show()

### Code reused from: Seminar Session 1 and 2 – Finding Outliers using data visualisations

In [None]:
# Box plot of 'Age'; points beyond the whiskers are candidate outliers
Age_fig = px.box(data_frame=prepared_df, x="Age")

# Render the figure
Age_fig.show()

In [None]:
# Box plot of 'Grade'; points beyond the whiskers are candidate outliers
Grade_fig = px.box(data_frame=prepared_df, x="Grade")

# Render the figure
Grade_fig.show()

In [None]:
# Box plot of 'Tumor_Size'; points beyond the whiskers are candidate outliers
Tumor_Size_fig = px.box(data_frame=prepared_df, x="Tumor_Size")

# Render the figure
Tumor_Size_fig.show()

In [None]:
# Box plot of 'Regional_Node_Examined'; points beyond the whiskers are candidate outliers
RNE_fig = px.box(data_frame=prepared_df, x="Regional_Node_Examined")

# Render the figure
RNE_fig.show()

In [None]:
# Box plot of 'Reginol_Node_Positive'; points beyond the whiskers are candidate outliers
RNP_fig = px.box(data_frame=prepared_df, x="Reginol_Node_Positive")

# Render the figure
RNP_fig.show()

In [None]:
# Box plot of 'Survival_Months'; points beyond the whiskers are candidate outliers
SM_fig = px.box(data_frame=prepared_df, x="Survival_Months")

# Render the figure
SM_fig.show()

### Code reused from: Seminar Session 1 and 2 – Using IQR statistical method to find outliers

In [None]:
def find_outliers_IQR(prepared_df):
  """Return the values that the IQR rule flags as outliers.

  A value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR,
  where Q1/Q3 are the 25th/75th percentiles and IQR = Q3 - Q1.

  Parameters
  ----------
  prepared_df : pandas.Series
      Numeric column to inspect. Despite its name, every call in this notebook
      passes a single column (a Series), not a whole DataFrame.

  Returns
  -------
  pandas.Series
      The flagged values only, keeping their original index labels.
  """
  lower_quartile = prepared_df.quantile(0.25)
  upper_quartile = prepared_df.quantile(0.75)
  spread = upper_quartile - lower_quartile

  # Fences of the IQR rule
  lower_fence = lower_quartile - 1.5 * spread
  upper_fence = upper_quartile + 1.5 * spread

  is_outlier = (prepared_df < lower_fence) | (prepared_df > upper_fence)
  return prepared_df[is_outlier]

### Code reused from: Seminar Session 1 and 2 – Calling IQR function for Age variable to find outliers

In [None]:
# Collect the 'Age' values flagged by the IQR rule
age_column = prepared_df['Age']
outliers = find_outliers_IQR(age_column)

# Report how many 'Age' values were flagged
print('Number of outliers (Age):', outliers.shape[0])

Number of outliers (Age): 4


In [None]:
# Run the IQR rule on 'Age' once more and report the count as a single formatted string
outliers = find_outliers_IQR(prepared_df['Age'])

# Report how many 'Age' values were flagged
print(f"No. of outliers (Age): {len(outliers)}")

No. of outliers (Age): 4


### Code reused from: Seminar Session 1 and 2 – Using drop function to Remove outliers of Age

In [None]:
# Remove the rows whose 'Age' is flagged by the IQR rule.
# The rows are taken from the detector's result instead of hard-coded positions
# (previously 139, 209, 512 and 829), so the cell keeps removing the right rows
# even if the upstream cleaning changes which rows exist or how they are ordered.
age_outliers = find_outliers_IQR(prepared_df['Age'])
prepared_df.drop(index=age_outliers.index, inplace=True)

# Verify that the outliers were removed by checking the shape of the data
print("Updated dataset shape:", prepared_df.shape)

Updated dataset shape: (4003, 15)


### Code reused from: Seminar Session 1 and 2 – Calling IQR function for Grade variable to find outliers

In [None]:
# Collect the 'Grade' values flagged by the IQR rule
grade_column = prepared_df['Grade']
outliers = find_outliers_IQR(grade_column)

# Report how many 'Grade' values were flagged
print('Number of outliers (Grade):', outliers.shape[0])

Number of outliers (Grade): 0


In [None]:
# Run the IQR rule on 'Grade' once more and report the count as a single formatted string
outliers = find_outliers_IQR(prepared_df['Grade'])

# Report how many 'Grade' values were flagged
print(f"No. of outliers (Grade): {len(outliers)}")

No. of outliers (Grade): 0


In [None]:
# Redraw the 'Age' box plot after the outlier rows were removed
Age_fig = px.box(data_frame=prepared_df, x="Age")

# Render the figure
Age_fig.show()

### Code reused from: McKinney, W. (2018). Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython. 2nd ed. O'Reilly Media.

In [None]:
def cap_outliers_IQR(series):
    """Cap the outliers of a numeric pandas Series using the IQR rule.

    Values below Q1 - 1.5*IQR are raised to that lower bound and values above
    Q3 + 1.5*IQR are lowered to that upper bound. Capping reduces the impact of
    extreme values without removing any rows.

    Parameters
    ----------
    series : pandas.Series
        Numeric column to cap.

    Returns
    -------
    pandas.Series
        A new Series with the same index; the input Series is left unmodified.
    """
    # First and third quartiles (25th and 75th percentiles)
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)

    # Interquartile range and the resulting fences
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Series.clip returns a new object and handles the dtype itself. Writing the
    # (float) bounds into an integer Series with a boolean-mask assignment is a
    # deprecated incompatible-dtype setitem in recent pandas, which is why callers
    # previously had to cast integer columns to float before capping.
    return series.clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Collect the 'Tumor_Size' values flagged by the IQR rule
tumor_size_column = prepared_df['Tumor_Size']
outliers = find_outliers_IQR(tumor_size_column)

# Report how many 'Tumor_Size' values were flagged
print('Number of outliers (Tumor_Size):', outliers.shape[0])

Number of outliers (Tumor_Size): 217


### Code reused from: Seminar Session 1 and 2 – Using the drop function to remove the extreme outlier of Tumor_Size

In [None]:
# Drop the single row labelled 207, identified as an extreme 'Tumor_Size' outlier, then report the new size.
# NOTE(review): the label is hard-coded, so re-running this cell raises KeyError once the
# row is gone; consider selecting the row by its 'Tumor_Size' value instead.
prepared_df.drop(index=[207], inplace=True)
prepared_df.shape

(4002, 15)

### Code reused from: Leys, C., Ley, C., Klein, O., Bernard, P. and Licata, L., 2013. Detecting outliers: Do not use standard deviation around the mean, use absolute deviation around the median. Journal of Experimental Social Psychology, 49(4), pp.764–766. https://doi.org/10.1016/j.jesp.2013.03.013

In [None]:
# Pull the remaining 'Tumor_Size' outliers back to the IQR fences and write the result into the column
capped_tumor_size = cap_outliers_IQR(prepared_df['Tumor_Size'])
prepared_df.loc[:, 'Tumor_Size'] = capped_tumor_size

In [None]:
# Redraw the 'Tumor_Size' box plot to check the effect of the capping
Tumor_Size_fig = px.box(data_frame=prepared_df, x="Tumor_Size")

# Render the figure
Tumor_Size_fig.show()

### Code reused from: Seminar Session 1 and 2 – Calling IQR function for Regional_Node_Examined variable to find outliers

In [None]:
# Collect the 'Regional_Node_Examined' values flagged by the IQR rule
outliers = find_outliers_IQR(prepared_df['Regional_Node_Examined'])

# Report how many values were flagged, as a single formatted string
print(f"No. of outliers (Regional_Node_Examined): {len(outliers)}")

No. of outliers (Regional_Node_Examined): 73


### Code reused from: Tukey, J.W., 1977. Exploratory Data Analysis. Reading, MA: Addison-Wesley.

In [None]:
# Pull the 'Regional_Node_Examined' outliers back to the IQR fences (no rows are removed)
# and write the result into the column
capped_nodes_examined = cap_outliers_IQR(prepared_df['Regional_Node_Examined'])
prepared_df.loc[:, 'Regional_Node_Examined'] = capped_nodes_examined

In [None]:
# Redraw the 'Regional_Node_Examined' box plot to check the effect of the capping
RNE_fig = px.box(data_frame=prepared_df, x="Regional_Node_Examined")

# Render the figure
RNE_fig.show()

### Code reused from: Seminar Session 1 and 2 – Calling IQR function for Reginol_Node_Positive variable to find outliers

In [None]:
# Collect the 'Reginol_Node_Positive' values flagged by the IQR rule
nodes_positive_column = prepared_df['Reginol_Node_Positive']
outliers = find_outliers_IQR(nodes_positive_column)

# Report how many 'Reginol_Node_Positive' values were flagged
print('Number of outliers (Reginol_Node_Positive):', outliers.shape[0])

Number of outliers (Reginol_Node_Positive): 341


In [None]:
# Run the IQR rule on 'Reginol_Node_Positive' once more and report the count as a single formatted string
outliers = find_outliers_IQR(prepared_df['Reginol_Node_Positive'])

# Report how many 'Reginol_Node_Positive' values were flagged
print(f"No. of outliers (Reginol_Node_Positive): {len(outliers)}")

No. of outliers (Reginol_Node_Positive): 341


In [None]:
# Box plot of 'Reginol_Node_Positive' for a visual check of its spread
# and of the values lying beyond the whiskers
RNP_fig = px.box(data_frame=prepared_df, x="Reginol_Node_Positive")

# Render the figure
RNP_fig.show()

### Code reused from: Seminar Session 1 and 2 – Calling IQR function for Survival_Months variable to find outliers

In [None]:
# Collect the 'Survival_Months' values flagged by the IQR rule;
# extreme values here could distort later analysis or modelling
survival_column = prepared_df['Survival_Months']
outliers = find_outliers_IQR(survival_column)

# Report how many 'Survival_Months' values were flagged
print('Number of outliers (Survival_Months):', outliers.shape[0])

Number of outliers (Survival_Months): 19


In [None]:
# Run the IQR rule on 'Survival_Months' once more and report the count
# as a single formatted string
outliers = find_outliers_IQR(prepared_df['Survival_Months'])

# Report how many 'Survival_Months' values were flagged
print(f"No. of outliers (Survival_Months): {len(outliers)}")

No. of outliers (Survival_Months): 19


### Code reused from: Seminar Session 1 and 2 – Removing the extreme outlier of Survival_Months

In [None]:
# Remove the rows holding the extreme 'Survival_Months' value (760) so that it does
# not skew the distribution or the regression model trained on this column.
# .copy() makes the filtered result an independent DataFrame; without it, the column
# assignments in the following cells operate on a slice of the old frame and can
# raise pandas' SettingWithCopyWarning.
prepared_df = prepared_df[prepared_df['Survival_Months'] != 760].copy()
prepared_df.shape

(4001, 15)

### Code reused from: McKinney, W. (2018) Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython. 2nd ed. O'Reilly Media.

In [None]:
# Store 'Survival_Months' as float first, so the (possibly fractional) capping bounds fit the column
survival_as_float = prepared_df['Survival_Months'].astype(float)
prepared_df['Survival_Months'] = survival_as_float

# Pull the remaining 'Survival_Months' outliers back to the IQR fences and write the result into the column
capped_survival = cap_outliers_IQR(prepared_df['Survival_Months'])
prepared_df.loc[:, 'Survival_Months'] = capped_survival

In [None]:
# Redraw the 'Survival_Months' box plot to check the distribution
# and look for outliers that remain after cleaning
SM_fig = px.box(data_frame=prepared_df, x="Survival_Months")

# Render the figure
SM_fig.show()

In [None]:
# Persist the outlier-treated dataset to Google Drive for later modelling or analysis.
# The index is left out so the file contains only the data columns.
prepared_df.to_csv(path_or_buf='/content/drive/MyDrive/ML/CW/cleaned_breast_cancer_data.csv', index=False)

In [None]:
# Read the cleaned coursework dataset back from Google Drive
clean_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/ML/CW/cleaned_breast_cancer_data.csv')

### Code reused from: Seminar Session 1 and 2 – Creating the Classification Dataset from the clean coursework dataset

In [None]:
# Build the classification dataset: everything except 'Survival_Months',
# which is the regression target rather than a classification input.
# errors='ignore' means a missing column is skipped silently instead of raising.
classification_data = clean_df.drop(labels=['Survival_Months'], axis='columns', errors='ignore')

In [None]:
# Write the classification dataset to Google Drive as 'Classification_Mortality_Dataset.csv'
# (index omitted) for later use or modelling
classification_data.to_csv(path_or_buf='/content/drive/MyDrive/ML/CW/Classification_Mortality_Dataset.csv', index=False)

In [None]:
# Build the regression dataset: everything except 'Mortality_Status'.
# It is used to predict 'Survival_Months'; a missing column is skipped silently.
regression_data = clean_df.drop(labels=['Mortality_Status'], axis='columns', errors='ignore')

In [None]:
# Write the regression dataset to Google Drive (index omitted) for the 'Survival_Months' model
regression_data.to_csv(path_or_buf='/content/drive/MyDrive/ML/CW/Regression_Survival_Dataset.csv', index=False)

In [None]:
# Read the classification dataset back from Google Drive into a DataFrame
classification_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/ML/CW/Classification_Mortality_Dataset.csv')

### Code reused from: Seminar Session 1 and 2 – Convert the Scaled Data Back to a DataFrame

In [None]:
# Standard Scaling for the Classification dataset (features only).
#
# The class label 'Mortality_Status' must NOT be standardized: scaling turns the encoded
# classes into arbitrary continuous values, which scikit-learn classifiers reject as a
# continuous target. Only the feature columns are therefore passed to the scaler, while
# the label keeps its original encoded values.
target_column = 'Mortality_Status'

# Boolean mask over the columns: True for every feature, False for the target label
is_feature = classification_df.columns != target_column

# Creating a StandardScaler object (it is fitted on the feature columns only)
classification_scaler = StandardScaler()

# Start from an unscaled float copy so the result keeps the same shape and column order
# as classification_df; the target column is left exactly as loaded.
classification_data_scaled = classification_df.to_numpy(dtype=float, copy=True)

# Overwrite the feature columns with their standardized (zero mean, unit variance) values
classification_data_scaled[:, is_feature] = classification_scaler.fit_transform(
    classification_df.loc[:, is_feature]
)

# Display the result (a NumPy array with one column per column of classification_df)
classification_data_scaled

array([[ 1.5622439 , -0.06907584, -1.02440544, ...,  1.27146641,
        -0.61765496, -0.4257715 ],
       [-0.4463554 , -0.06907584,  0.28464795, ..., -0.03042408,
         0.16705171, -0.4257715 ],
       [ 0.4463554 , -0.06907584,  1.59370134, ..., -0.03042408,
         0.55940505, -0.4257715 ],
       ...,
       [ 1.5622439 , 14.47684176,  0.28464795, ..., -0.42099123,
        -0.22530162, -0.4257715 ],
       [ 0.4463554 , -0.06907584,  0.28464795, ..., -0.42099123,
        -0.61765496, -0.4257715 ],
       [-0.8927108 , -0.06907584,  0.28464795, ..., -0.94174743,
        -0.42147829, -0.4257715 ]])

In [None]:
# Converting the scaled NumPy array back to a DataFrame and restoring the column names
# for easy interpretation.
# The names are taken from classification_df, the frame the scaled array was built from,
# so the labels always line up with the array's columns and this cell does not depend on
# the earlier in-memory classification_data variable still being defined.
classification_data_scaled = pd.DataFrame(classification_data_scaled, columns=classification_df.columns)

# Displaying the scaled classification dataset
classification_data_scaled

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Mortality_Status
0,1.562244,-0.069076,-1.024405,-0.631305,-1.127311,-1.329218,1.329218,0.151697,-1.428434,0.268476,0.458102,1.271466,-0.617655,-0.425771
1,-0.446355,-0.069076,0.284648,0.812038,-0.325325,0.239573,-0.239573,0.151697,0.322778,0.268476,0.458102,-0.030424,0.167052,-0.425771
2,0.446355,-0.069076,1.593701,2.255382,0.476662,0.239573,-0.239573,0.151697,1.904517,0.268476,0.458102,-0.030424,0.559405,-0.425771
3,0.446355,-0.069076,-1.024405,-0.631305,-1.127311,-1.329218,1.329218,0.151697,-0.637564,0.268476,0.458102,-1.592693,-0.617655,-0.425771
4,-0.781122,-0.069076,0.284648,-0.631305,1.278648,-1.329218,1.329218,0.151697,0.661722,0.268476,0.458102,-1.462504,-0.617655,-0.425771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3996,0.892711,-0.069076,-1.024405,-0.631305,-1.127311,0.239573,-0.239573,0.151697,-1.145980,0.268476,0.458102,-1.722882,-0.617655,-0.425771
3997,0.223178,-0.069076,0.284648,0.812038,-0.325325,0.239573,-0.239573,0.151697,0.944175,0.268476,0.458102,-0.030424,0.755582,-0.425771
3998,1.562244,14.476842,0.284648,-0.631305,1.278648,0.239573,-0.239573,0.151697,-0.411601,0.268476,-2.182918,-0.420991,-0.225302,-0.425771
3999,0.446355,-0.069076,0.284648,-0.631305,1.278648,0.239573,-0.239573,0.151697,0.831194,0.268476,0.458102,-0.420991,-0.617655,-0.425771


In [None]:
# Preview the scaled classification dataset: show its first five rows
classification_data_scaled.head(n=5)

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Mortality_Status
0,1.562244,-0.069076,-1.024405,-0.631305,-1.127311,-1.329218,1.329218,0.151697,-1.428434,0.268476,0.458102,1.271466,-0.617655,-0.425771
1,-0.446355,-0.069076,0.284648,0.812038,-0.325325,0.239573,-0.239573,0.151697,0.322778,0.268476,0.458102,-0.030424,0.167052,-0.425771
2,0.446355,-0.069076,1.593701,2.255382,0.476662,0.239573,-0.239573,0.151697,1.904517,0.268476,0.458102,-0.030424,0.559405,-0.425771
3,0.446355,-0.069076,-1.024405,-0.631305,-1.127311,-1.329218,1.329218,0.151697,-0.637564,0.268476,0.458102,-1.592693,-0.617655,-0.425771
4,-0.781122,-0.069076,0.284648,-0.631305,1.278648,-1.329218,1.329218,0.151697,0.661722,0.268476,0.458102,-1.462504,-0.617655,-0.425771


In [None]:
# Summary statistics of the scaled classification dataset:
# count, mean, std, min, the quartiles (25%, 50%, 75%) and max for every column
classification_data_scaled.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Mortality_Status
count,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0
mean,3.551826e-18,-1.065548e-17,6.570878000000001e-17,-5.327739e-18,8.701973000000001e-17,1.2165e-16,-1.2165e-16,3.596224e-16,4.0846000000000006e-17,1.900227e-16,3.551826e-17,2.219891e-18,-7.54763e-18,-5.327739e-17
std,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125,1.000125
min,-2.678132,-0.06907584,-1.024405,-0.6313048,-1.127311,-2.898008,-1.808364,-6.592083,-1.597906,-3.724729,-2.182918,-1.722882,-0.617655,-0.4257715
25%,-0.781122,-0.06907584,-1.024405,-0.6313048,-1.127311,-1.329218,-0.2395729,0.1516971,-0.7505453,0.2684759,0.4581025,-0.6813693,-0.617655,-0.4257715
50%,0.0,-0.06907584,0.284648,-0.6313048,-0.3253247,0.2395729,-0.2395729,0.1516971,-0.2421292,0.2684759,0.4581025,-0.03042408,-0.4214783,-0.4257715
75%,0.781122,-0.06907584,0.284648,0.8120384,1.278648,0.2395729,1.329218,0.1516971,0.4922497,0.2684759,0.4581025,0.6205212,0.1670517,-0.4257715
max,1.673833,14.47684,2.902755,2.255382,2.080635,1.808364,2.898008,0.1516971,2.356442,0.2684759,0.4581025,2.573357,8.210295,2.348678


In [None]:
# Write the standardized (scaled) classification dataset to a CSV file for future modelling.
# index=False keeps the DataFrame index out of the file.
classification_data_scaled.to_csv(
    path_or_buf='/content/drive/MyDrive/ML/CW/Scaled_Classification_Mortality_Dataset.csv',
    index=False,
)

In [None]:
# Read the saved regression dataset (used for survival months prediction) into a DataFrame.
reg_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/ML/CW/Regression_Survival_Dataset.csv'
)

In [None]:
# Create a MinMaxScaler that maps each column onto the range [0, 1] and fit it on the
# regression dataset, learning every column's minimum and maximum.
# NOTE(review): every column of reg_df is fitted and transformed, so if the prediction
# target is one of those columns it is normalized as well - confirm this is intended.
reg_scaler = MinMaxScaler(feature_range=(0, 1)).fit(reg_df)

# Apply the fitted Min-Max scaling to the regression dataset
reg_data_scaled = reg_scaler.transform(reg_df)

# Show the scaled values (a NumPy array)
reg_data_scaled

array([[0.97435897, 0.        , 0.        , ..., 0.6969697 , 0.        ,
        0.53921569],
       [0.51282051, 0.        , 0.33333333, ..., 0.39393939, 0.08888889,
        0.55882353],
       [0.71794872, 0.        , 0.66666667, ..., 0.39393939, 0.13333333,
        0.68627451],
       ...,
       [0.97435897, 1.        , 0.33333333, ..., 0.3030303 , 0.04444444,
        0.62745098],
       [0.71794872, 0.        , 0.33333333, ..., 0.3030303 , 0.        ,
        0.65686275],
       [0.41025641, 0.        , 0.33333333, ..., 0.18181818, 0.02222222,
        0.93137255]])

In [None]:
# Wrap the scaled NumPy array in a DataFrame again,
# labelling its columns with the names of the original regression dataset (reg_df)
reg_data_scaled = pd.DataFrame(
    data=reg_data_scaled,
    columns=reg_df.columns,
)

# Show the scaled regression dataset
reg_data_scaled

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months
0,0.974359,0.0,0.000000,0.0,0.00,0.333333,0.666667,1.0,0.042857,1.0,1.0,0.696970,0.000000,0.539216
1,0.512821,0.0,0.333333,0.5,0.25,0.666667,0.333333,1.0,0.485714,1.0,1.0,0.393939,0.088889,0.558824
2,0.717949,0.0,0.666667,1.0,0.50,0.666667,0.333333,1.0,0.885714,1.0,1.0,0.393939,0.133333,0.686275
3,0.717949,0.0,0.000000,0.0,0.00,0.333333,0.666667,1.0,0.242857,1.0,1.0,0.030303,0.000000,0.774510
4,0.435897,0.0,0.333333,0.0,0.75,0.333333,0.666667,1.0,0.571429,1.0,1.0,0.060606,0.000000,0.441176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3996,0.820513,0.0,0.000000,0.0,0.00,0.666667,0.333333,1.0,0.114286,1.0,1.0,0.000000,0.000000,0.431373
3997,0.666667,0.0,0.333333,0.5,0.25,0.666667,0.333333,1.0,0.642857,1.0,1.0,0.393939,0.155556,0.627451
3998,0.974359,1.0,0.333333,0.0,0.75,0.666667,0.333333,1.0,0.300000,1.0,0.0,0.303030,0.044444,0.627451
3999,0.717949,0.0,0.333333,0.0,0.75,0.666667,0.333333,1.0,0.614286,1.0,1.0,0.303030,0.000000,0.656863


In [None]:
# Preview the first five rows of the scaled regression dataset
# as a quick check that the scaling was applied and the table structure is intact
reg_data_scaled.head(n=5)

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months
0,0.974359,0.0,0.0,0.0,0.0,0.333333,0.666667,1.0,0.042857,1.0,1.0,0.69697,0.0,0.539216
1,0.512821,0.0,0.333333,0.5,0.25,0.666667,0.333333,1.0,0.485714,1.0,1.0,0.393939,0.088889,0.558824
2,0.717949,0.0,0.666667,1.0,0.5,0.666667,0.333333,1.0,0.885714,1.0,1.0,0.393939,0.133333,0.686275
3,0.717949,0.0,0.0,0.0,0.0,0.333333,0.666667,1.0,0.242857,1.0,1.0,0.030303,0.0,0.77451
4,0.435897,0.0,0.333333,0.0,0.75,0.333333,0.666667,1.0,0.571429,1.0,1.0,0.060606,0.0,0.441176


In [None]:
# Summary statistics of the scaled (normalized) regression variables:
# count, mean, standard deviation, minimum, quartiles (25%, 50%, 75%) and maximum per column
reg_data_scaled.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months
count,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0,4001.0
mean,0.615385,0.004749,0.260851,0.218695,0.351412,0.615763,0.384237,0.977506,0.404088,0.932767,0.826543,0.401021,0.069966,0.649681
std,0.22981,0.068756,0.254669,0.346461,0.311765,0.212504,0.212504,0.148303,0.252918,0.250457,0.378689,0.232791,0.113291,0.224664
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.435897,0.0,0.0,0.0,0.0,0.333333,0.333333,1.0,0.214286,1.0,1.0,0.242424,0.0,0.5
50%,0.615385,0.0,0.333333,0.0,0.25,0.666667,0.333333,1.0,0.342857,1.0,1.0,0.393939,0.022222,0.666667
75%,0.794872,0.0,0.333333,0.5,0.75,0.666667,0.666667,1.0,0.528571,1.0,1.0,0.545455,0.088889,0.833333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Write the scaled regression dataset (for predicting Survival Months) to a CSV file.
# index=False keeps the DataFrame index out of the file.
reg_data_scaled.to_csv(
    path_or_buf='/content/drive/MyDrive/ML/CW/Scaled_Regression_Survival_Dataset.csv',
    index=False,
)