<h1>Table of Contents</h1>

<div class="alert alert-block alert-info" style="margin-top: 20px">
    <ol>
        <li><a href="#import_libraries">Import Libraries</a></li>
        <li><a href="#import_dataset">Import "Pima Indians Diabetes" Dataset</a></li>
        <li><a href="#information">Information about the Dataset</a></li>
        <li><a href="#pre-processing">Pre-processing</a></li>
        <li><a href="#final_dataset">Final dataset after Pre-processing</a></li>
        <li><a href="#feature_selection">Feature Selection</a></li>
        <li><a href="#classification">Classification</a></li>        
    </ol>
</div>
<br>
<hr>

<div id="import_libraries"> 
    <h2>Import Libraries</h2>    
</div>

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
import seaborn as sns  
import matplotlib.pyplot as plt  
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression  
from sklearn.feature_selection import RFE 
from sklearn.feature_selection import mutual_info_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

import warnings
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter, so deprecation and chained-assignment
# warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

<div id="import_dataset"> 
    <h2>Import "Pima Indians Diabetes" Dataset</h2>         
</div>

**About the dataset:**
<ul>
       <li> The "Pima Indians Diabetes" dataset includes medical data for 768 women of Pima Indian descent, aimed at predicting diabetes onset. It consists of features such as the number of pregnancies, glucose levels, blood pressure, skin thickness, insulin levels, body mass index (BMI), diabetes pedigree function, and age. The target variable indicates whether an individual has diabetes (1) or not (0).</li>
        <br>
        <br>
        <li> This dataset is commonly used in machine learning for developing predictive models and understanding diabetes risk factors.</li>
        <br>
        <br>
        <li> By analyzing this dataset, researchers can identify significant predictors of diabetes, enhancing early detection and informing healthcare strategies for prevention and intervention in at-risk populations.</li>
</ul> 

In [None]:
# The source CSV has no header row, so the column labels are supplied explicitly (file order)
column_names = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Read the raw data straight from the remote URL and show it
pid_df = pd.read_csv(url, names=column_names, header=None)
display(pid_df)

<div id="information"> 
    <h2>Information about the Dataset</h2>    
</div>

In [None]:
# Descriptive statistics of every numeric column:
# count, mean, standard deviation, minimum, quartiles (25%, 50%, 75%) and maximum
data_describe = pid_df.describe()

print('\nThe dataset description:\n')
display(data_describe)

In [None]:
# Concise structural summary of the dataset:
# index dtype, column dtypes, non-null counts and memory usage
print('\nMore information about the dataset:\n')

# DataFrame.info() writes its report to stdout and returns None, so it is called for
# its side effect only; capturing and displaying the return value would just show "None".
pid_df.info()

In [None]:
# (number of rows, number of columns) of the raw dataframe
shape_of_the_dataset = (len(pid_df.index), len(pid_df.columns))
print("\nThe shape of the dataset -->", shape_of_the_dataset)

In [None]:
# Count the distinct values held by each column
unique_data = pid_df.nunique(axis=0)

print('\nNumber of unique data in the dataset:\n')
print(unique_data)


<div id="pre-processing"> 
    <h2>Pre-processing</h2>    
</div>
<div>
    <ol>
        <li><a href="#duplicates">Duplicate Tuples</a></li>
        <li><a href="#outliers">Detecting Outliers (Noise)</a></li>
        <li><a href="#missing_values">Handling Missing Values</a></li>
        <li><a href="#standardization">Standardization</a></li>
        <li><a href="#data_imbalance">Handling Data Imbalance</a></li>      
    </ol>
</div>
<br>
<hr>

<div id="duplicates"> 
    <h2>Duplicate Tuples</h2>    
</div>

In [None]:
# A row is flagged True when it repeats an earlier row exactly; summing the flags counts them
duplicate_flags = pid_df.duplicated()
Num_of_duplicate_rows = duplicate_flags.sum()
print("\nThe number of duplicate rows -->", Num_of_duplicate_rows)

<div id="outliers"> 
    <h2>Detecting Outliers (Noise)</h2>    
</div>
<div>
    <ol>
        <li><a href="#iqr">Interquartile Range (IQR) method</a></li>          
    </ol>
</div>
<br>
<hr>

<div id="iqr"> 
    <h2>Interquartile Range (IQR) method</h2>    
</div>

In [8]:
# Column-wise first and third quartiles and the interquartile range between them
Q1, Q3 = pid_df.quantile(0.25), pid_df.quantile(0.75)
IQR = Q3 - Q1

# Outlier fences: values further than 1.5 * IQR outside the quartiles count as outliers
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

In [None]:
# Flag every cell that falls outside its column's [lower_bound, upper_bound] fence
below_fence = pid_df < lower_bound
above_fence = pid_df > upper_bound

# Keep only the rows in which no column is flagged as an outlier
outlier_mask = ~(below_fence | above_fence).any(axis=1)

# Take an explicit copy: df_iqr is modified by later cells, and an independent frame
# keeps those writes from being treated as assignments on a slice of pid_df
df_iqr = pid_df[outlier_mask].copy()
display(df_iqr)

In [10]:
# Validate the IQR method
# The label is the 'Outcome' column; every other column is a feature
y = df_iqr['Outcome']
x = df_iqr.drop(columns='Outcome')

# 80/20 train/test partition with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.2
)

In [None]:
# 1-nearest-neighbour classifier used to score the outlier-filtered data
clf_iqr = KNeighborsClassifier(n_neighbors=1)

# 10-fold cross-validation on the training portion; report the mean fold accuracy
fold_scores_iqr = cross_val_score(clf_iqr, x_train, y_train, cv=10, scoring='accuracy')
accuracy = np.mean(fold_scores_iqr)
print(f'\nCross-validated accuracy after IQR outlier removal: {accuracy:.4f}\n')

In [None]:
# Compare the (rows, columns) shape of the raw frame with the IQR-filtered frame
# to show how many rows the outlier filter removed
print("\nDataset shape before deleting the outliers -->", pid_df.shape)
print("\nDataset shape after deleting the outliers -->", df_iqr.shape)
print("\n")

<div id="missing_values"> 
    <h2>Handling Missing Values</h2>    
</div>
<div>
    <ol>
        <li><a href="#mean">Mean Imputation</a></li>
        <li><a href="#iterative">Iterative Imputation</a></li>
        <li><a href="#knn">K-Nearest Neighbors (KNN) Imputation</a></li>
        <li><a href="#output">Output the results</a></li>    
    </ol>
</div>
<br>
<hr>

In [None]:
# Descriptive statistics of the outlier-filtered frame
# (count, mean, standard deviation, minimum, quartiles and maximum per numeric column)
summary_after_outliers = df_iqr.describe()

print('\nThe data set description after detecting outliers:\n')
display(summary_after_outliers)

In [None]:
# Per-column count of missing (NaN) entries, presented as a one-column frame
isna = df_iqr.isna().sum(axis=0).to_frame()
print(isna)

In [None]:
# Narrative output: no NaN values were counted, but zeros in some columns still need handling
# NOTE(review): the column names are hard-coded in the message; confirm they match
# the columns actually processed by the following cell
print('\nThere are no NaN values in the dataset \n')
print('\nBut according to the description, some variables cannot be zero. So they must be handled')
print('They are --> SkinThickness and Insulin \n')

In [None]:
# Columns in which a recorded 0 is treated as a missing value
zero_as_missing_cols = ['SkinThickness', 'Insulin']

# Build a new frame instead of writing into df_iqr in place: df_iqr was produced by
# filtering pid_df, and assigning columns on such a slice triggers pandas'
# chained-assignment (SettingWithCopy) warning
df_iqr = df_iqr.assign(
    **{col: df_iqr[col].replace(0, np.nan) for col in zero_as_missing_cols}
)

# Check for missing values after replacement
isna = pd.DataFrame(df_iqr.isna().sum(axis=0))
print(isna)

<div id="mean"> 
    <h2>Mean Imputation</h2>    
</div>

In [17]:
# Work on an independent (deep) copy so df_iqr keeps its NaN markers
DfMean = df_iqr.copy()

# Imputer that replaces each NaN with the mean of its column
MeanImputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [None]:
# Fit the imputer on DfMean and write the filled values back into the same frame
mean_filled_values = MeanImputer.fit_transform(DfMean)
DfMean.iloc[:, :] = mean_filled_values

# Show the first rows of the mean-imputed data
print('\nPreview the data after mean imputation: \n')
display(DfMean.head())

<div id="iterative"> 
    <h2>Iterative Imputation</h2>    
</div>

In [19]:
# Work on an independent (deep) copy so df_iqr keeps its NaN markers
DfIterative = df_iqr.copy()

# Iterative imputer: samples from the posterior, never imputes below 0, fixed seed
imputer_ite = IterativeImputer(
    missing_values=np.nan,
    sample_posterior=True,
    min_value=0,
    random_state=0,
)

In [None]:
# Perform the imputation and write the filled values back into the same frame
iterative_filled_values = imputer_ite.fit_transform(DfIterative)
DfIterative.iloc[:, :] = iterative_filled_values

# Show the first rows of the iteratively imputed data
print('\nPreview the data after iterative imputation: \n')
display(DfIterative.head())

<div id="knn"> 
		<h2>K-Nearest Neighbors (KNN) Imputation</h2>    
</div>

In [None]:
# Work on an independent (deep) copy so df_iqr keeps its NaN markers
Df_knn = df_iqr.copy(deep=True)

# Imputer that fills each NaN from the 5 nearest neighbouring rows
imputer_knn = KNNImputer(n_neighbors=5)

In [None]:
# Fit the imputer and transform the dataset (returns a plain NumPy array)
imputed_data_knn = imputer_knn.fit_transform(Df_knn)

# Convert back to a dataframe, restoring both the column labels and the original row
# index so the result stays row-aligned with DfMean and DfIterative, which keep
# df_iqr's index
Df_imputed_knn = pd.DataFrame(
    imputed_data_knn, columns=Df_knn.columns, index=Df_knn.index
)

In [None]:
# Show the first five rows of the KNN-imputed data
print('\nPreview the data after KNN imputation: \n')
display(Df_imputed_knn.head(5))

<div id="output"> 
    <h2>Output the results</h2>    
</div>

Compare the different Imputation Methods using **Kernel Density Estimation (KDE) Plots**

In [None]:
# 'SkinThickness' column: overlay the density of the un-imputed data and of each imputation result
plt.figure(figsize=(14, 10))

# (frame, legend label) pairs, drawn in this order
skin_thickness_variants = [
    (df_iqr, 'Baseline'),
    (DfMean, 'Mean Imputation'),
    (DfIterative, 'Iterative Imputation'),
    (Df_imputed_knn, 'KNN Imputation'),
]
for frame, legend_label in skin_thickness_variants:
    sns.kdeplot(frame['SkinThickness'], label=legend_label, fill=False, bw_adjust=0.5)

# Titles, axis labels, legend and grid
plt.title('KDE Plot comparison of SkinThickness across Imputation Methods')
plt.xlabel('SkinThickness')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# 'Insulin' column: overlay the density of the un-imputed data and of each imputation result
plt.figure(figsize=(14, 10))

# (frame, legend label) pairs, drawn in this order
insulin_variants = [
    (df_iqr, 'Baseline'),
    (DfMean, 'Mean Imputation'),
    (DfIterative, 'Iterative Imputation'),
    (Df_imputed_knn, 'KNN Imputation'),
]
for frame, legend_label in insulin_variants:
    sns.kdeplot(frame['Insulin'], label=legend_label, fill=False, bw_adjust=0.5)

# Titles, axis labels, legend and grid
plt.title('KDE Plot comparison of Insulin across Imputation Methods')
plt.xlabel('Insulin')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Announce that the iteratively imputed frame is the one carried forward, then show it
# NOTE(review): the choice is stated in the message; no code in this cell derives it from the plots above
print("\nContinue working with iterative imputation after comparing different imputation methods:\n")
display(DfIterative)


<div id="standardization"> 
    <h2>Standardization</h2>    
</div>
<div>
    <ol>
        <li><a href="#z-score">Z-Score Standardization (Standard Scaling)</a></li>
        <li><a href="#min-max">Min-Max Scaling (Normalization)</a></li> 
        <li><a href="#output">Output the results</a></li>     
    </ol>
</div>
<br>
<hr>


<div id="z-score"> 
    <h2>Z-Score Standardization (Standard Scaling)</h2>    
</div>

In [24]:
# Z-score standardization: fit the scaler on every column of DfIterative and transform it
Z_scaler = StandardScaler()
Z_Scaled = Z_scaler.fit_transform(DfIterative)

# Wrap the scaled array in a dataframe that reuses the original column labels
df_Z_Scaled = pd.DataFrame(Z_Scaled, columns=DfIterative.columns.tolist())

In [None]:
# Keep the standardized feature columns only ...
scaled_features = df_Z_Scaled.drop(columns='Outcome')

# ... and attach the original, unscaled 'Outcome' labels (by position, as a plain list)
df_Z_Scaled_final = scaled_features.assign(Outcome=DfIterative['Outcome'].tolist())
display(df_Z_Scaled_final.head())

In [26]:
# Validate the Z-score standardization
# The label is the 'Outcome' column; every other column is a feature
y_z = df_Z_Scaled_final['Outcome']
x_z = df_Z_Scaled_final.drop(columns='Outcome')

# 80/20 train/test partition with a fixed seed
x_train_z, x_test_z, y_train_z, y_test_z = train_test_split(
    x_z, y_z, random_state=0, test_size=0.2
)

In [None]:
# 10-nearest-neighbour classifier used to score the Z-scaled data
clf_z = KNeighborsClassifier(n_neighbors=10)

# 10-fold cross-validation on the training portion; report the mean fold accuracy
fold_scores_z = cross_val_score(clf_z, x_train_z, y_train_z, cv=10, scoring='accuracy')
accuracy_z = np.mean(fold_scores_z)
print(f'\nCross-validated accuracy after the Z-standard scaling: {accuracy_z:.4f}\n')

<div id="min-max"> 
    <h2>Min-Max Scaling (Normalization)</h2>    
</div>

In [None]:
# Min-max scaling: fit the scaler on every column of DfIterative and transform it
MM_scaler = MinMaxScaler()
Min_Max_Scaled = MM_scaler.fit_transform(DfIterative)

# Wrap the scaled array in a dataframe that reuses the original column labels
df_Min_Max_Scaled_final = pd.DataFrame(
    Min_Max_Scaled, columns=DfIterative.columns.tolist()
)
display(df_Min_Max_Scaled_final.head())

In [29]:
# Validate the Min Max Scaler
# The label is the 'Outcome' column; every other column is a feature
y_mm = df_Min_Max_Scaled_final['Outcome']
x_mm = df_Min_Max_Scaled_final.drop(columns='Outcome')

# 80/20 train/test partition with a fixed seed
x_train_mm, x_test_mm, y_train_mm, y_test_mm = train_test_split(
    x_mm, y_mm, random_state=0, test_size=0.2
)

In [None]:
# 10-nearest-neighbour classifier used to score the min-max-scaled data
clf_mm = KNeighborsClassifier(n_neighbors=10)

# 10-fold cross-validation on the training portion; report the mean fold accuracy
fold_scores_mm = cross_val_score(clf_mm, x_train_mm, y_train_mm, cv=10, scoring='accuracy')
accuracy_mm = np.mean(fold_scores_mm)
print(f'\nCross-validated accuracy after the Min Max scaling: {accuracy_mm:.4f}\n')

<div id="output"> 
    <h2>Output the results</h2>    
</div>

In [None]:
# Side-by-side summary of the two scaling experiments
# (both values are mean 10-fold cross-validated accuracies computed in the cells above)
print('\nZ-standard scaling result:', accuracy_z)  # Mean CV accuracy with Z-score standardization
print('\nMin Max scaling result:', accuracy_mm)    # Mean CV accuracy with min-max scaling

In [None]:
# The min-max-scaled frame is the one carried forward (same object, new name)
df_Scaled = df_Min_Max_Scaled_final

print("\nContinue working with the dataset scaled by the Min Max Scaler after comparing different scaling methods:\n")
display(df_Scaled.head())

<div id="data_imbalance"> 
    <h2>Handling Data Imbalance</h2>    
</div>
<div>
    <ol>
        <li><a href="#oversampling">Oversampling</a></li>
        <li><a href="#undersampling">Undersampling</a></li> 
        <li><a href="#output">Output the results</a></li>   
    </ol>
</div>
<br>
<hr>

In [None]:
# How many rows belong to each class of the 'Outcome' label
outcome_column = df_Scaled['Outcome']
outcome_counts = outcome_column.value_counts()
print("Outcome distribution:\n", outcome_counts)

In [34]:
# The label is the 'Outcome' column; every other column is a feature
y = df_Scaled['Outcome']
X = df_Scaled.drop(columns='Outcome')

# 80/20 train/test partition, stratified on the label, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, stratify=y, random_state=0, test_size=0.2
)

<div id="oversampling"> 
    <h2>Oversampling</h2>    
</div>

In [None]:
# Initialize SMOTE with a fixed seed so the synthetic samples, and every score that
# depends on them, are the same on each run
sm = SMOTE(random_state=0)

# Print class distribution before oversampling
print("\nClass 1 before oversampling --> ", sum(Y_train == 1))
print("\nClass 0 before oversampling --> ", sum(Y_train == 0))

In [36]:
# Oversample the training split only; the test split is left untouched
# X_train_OS / Y_train_OS are X_train / Y_train after oversampling
resampled_os = sm.fit_resample(X_train, Y_train)
X_train_OS, Y_train_OS = resampled_os

In [None]:
# Shapes of the oversampled training features and labels
print("\nThe shape of X after oversampling -->", X_train_OS.shape)
print("\nThe shape of Y after oversampling -->", Y_train_OS.shape)

In [None]:
# Number of training rows per class after oversampling
positives_after_os = (Y_train_OS == 1).sum()
negatives_after_os = (Y_train_OS == 0).sum()
print("\nClass 1 after oversampling --> ", positives_after_os)
print("\nClass 0 after oversampling --> ", negatives_after_os)
print("\n")

In [None]:
# Validate oversampling
# 1-nearest-neighbour classifier used to score the oversampled training data
clf_os = KNeighborsClassifier(n_neighbors=1)

# 10-fold cross-validation; report the mean fold accuracy
fold_scores_os = cross_val_score(clf_os, X_train_OS, Y_train_OS, cv=10, scoring='accuracy')
accuracy_os = np.mean(fold_scores_os)
print(f'\nCross-validated accuracy after oversampling: {accuracy_os:.4f}\n')

<div id="undersampling"> 
    <h2>Undersampling</h2>    
</div>

In [None]:
# NearMiss undersampler with its default settings
nr = NearMiss()

# Number of training rows per class before undersampling
positives_before_us = (Y_train == 1).sum()
negatives_before_us = (Y_train == 0).sum()
print("\nClass 1 before undersampling --> ", positives_before_us)
print("\nClass 0 before undersampling --> ", negatives_before_us)

In [41]:
# Undersample the training split only; the test split is left untouched
# X_train_US / Y_train_US are X_train / Y_train after undersampling
resampled_us = nr.fit_resample(X_train, Y_train)
X_train_US, Y_train_US = resampled_us

In [None]:
# Shapes of the undersampled training features and labels
print("\nThe shape of X after undersampling -->", X_train_US.shape)  
print("\nThe shape of Y after undersampling -->", Y_train_US.shape)  

In [None]:
# Number of training rows per class after undersampling
positives_after_us = (Y_train_US == 1).sum()
negatives_after_us = (Y_train_US == 0).sum()
print("\nClass 1 after undersampling --> ", positives_after_us)
print("\nClass 0 after undersampling --> ", negatives_after_us)
print("\n")

In [None]:
# Validate undersampling
# 1-nearest-neighbour classifier used to score the undersampled training data
clf_us = KNeighborsClassifier(n_neighbors=1)

# 10-fold cross-validation; report the mean fold accuracy
accuracy_us = np.mean(cross_val_score(clf_us, X_train_US, Y_train_US, scoring='accuracy', cv=10))
# Message corrected: it previously read "zacuracy"
print(f'\nCross-validated accuracy after undersampling: {accuracy_us:.4f}\n')

<div id="output"> 
    <h2>Output the results</h2>    
</div>

In [None]:
# Side-by-side summary of the two resampling experiments
# (both values are mean 10-fold cross-validated accuracies computed in the cells above)
print('\nOversampling result:', accuracy_os)     # Mean CV accuracy on the SMOTE-oversampled training data
print('\nUndersampling result:', accuracy_us)    # Mean CV accuracy on the NearMiss-undersampled training data

In [None]:
# Announce that the oversampled data is carried forward
# NOTE(review): the choice is stated in the message; it is not computed from accuracy_os / accuracy_us
print("\nContinue working with the dataset handled by oversampling after comparing different methods\n")

<div id="final_dataset"> 
    <h2>Final dataset after Pre-processing</h2>    
</div>

In [None]:
# Training set: oversampled features and labels wrapped as dataframes
X = pd.DataFrame(X_train_OS)
Y = pd.DataFrame(Y_train_OS, columns=['Outcome'])

# Place the label column to the right of the feature columns
training_parts = [X, Y]
df_train_final = pd.concat(training_parts, axis=1)
display(df_train_final.head())              # Preview the training set

In [None]:
# Testing set: place the label column to the right of the (non-resampled) test features
testing_parts = [X_test, Y_test]
df_test_final = pd.concat(testing_parts, axis=1)
display(df_test_final.head())              # Preview the testing set

In [None]:
# Shapes (rows, columns) of the final training and testing frames, label column included
print('\nThe shape of training dataset -->', df_train_final.shape)
print('\nThe shape of testing dataset -->', df_test_final.shape)

In [None]:
# The training and testing set together (training rows first, testing rows after them)
# df_final_adp --> df_final_after data preprocessing
# ignore_index=True gives the stacked frame a fresh 0..n-1 index: the two inputs carry
# independent row indexes, so keeping them would leave duplicate row labels and make
# any label-based alignment or lookup on the combined frame ambiguous
df_final_adp = pd.concat([df_train_final, df_test_final], ignore_index=True)
display(df_final_adp.head())                 # Preview the combined dataset

In [None]:
# Shape (rows, columns) of the stacked training + testing frame
print('\nThe shape of the combined dataset -->', df_final_adp.shape)
print('\n')

<div id="feature_selection"> 
    <h2>Feature Selection</h2>    
</div>
<div>
    <ol>
        <li><a href="#fm">Filter Method (Correlation Analysis)</a></li>
        <li><a href="#rfe">Recursive Feature Elimination (RFE)</a></li>         
        <li><a href="#output">Output the results</a></li> 		
    </ol>
</div>
<br>
<hr>

<div id="fm"> 
    <h2>Filter Method (Correlation Analysis)</h2>    
</div>

In [None]:
# Pairwise correlation between all columns of the combined frame
corr = df_final_adp.corr()
print('\nCorrelation between the features in the dataset:\n')

# Temporarily lift pandas' row/column display limits so the whole matrix is shown
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(corr)

In [None]:
# Draw the correlation matrix as an annotated heatmap
plt.figure(figsize=(15,15))
print('\nVisualizing the correlation of the dataset:\n')

# Square cells, colour bar, two-decimal annotations in a blue palette
heatmap_options = {
    'cbar': True,
    'square': True,
    'fmt': '.2f',
    'annot': True,
    'annot_kws': {'size':10},
    'cmap': 'Blues',
}
sns.heatmap(corr, **heatmap_options)

# Title and render
plt.title('Feature Correlation Heatmap', fontsize=18)
plt.show()

In [None]:
# Correlation of every column with 'Outcome', strongest positive first
print("\nThe correlation of 'Outcome' with other features:\n")
corr_with_outcome = df_final_adp.corr()['Outcome']
outcome_corr = corr_with_outcome.sort_values(ascending=False)
print(outcome_corr)

In [None]:
# Minimum absolute correlation with 'Outcome' for a feature to be kept
CORR_THRESHOLD = 0.2

# Leave the target itself out of the candidates (it correlates perfectly with itself)
feature_corr = outcome_corr.drop('Outcome')

# Compare the magnitude of the correlation so that a strongly negative predictor is
# kept as well; thresholding the signed value would silently discard it
significant_features_fm = feature_corr[feature_corr.abs() >= CORR_THRESHOLD].index.tolist()
print("\nChoosing features that have |correlation| >= 0.2:\n", significant_features_fm)

In [56]:
# Validate filter method
# Label column of the combined frame
y_fm = df_final_adp['Outcome']

# Features: the columns picked by the correlation filter ('Outcome' is dropped if it slipped in)
selected_by_filter = df_final_adp[significant_features_fm]
x_fm = selected_by_filter.drop(columns=['Outcome'], errors='ignore')

In [None]:
# 1-nearest-neighbour classifier used to score the filter-selected features
clf_fm = KNeighborsClassifier(n_neighbors=1)

# 10-fold cross-validation; report the mean fold accuracy
fold_scores_fm = cross_val_score(clf_fm, x_fm, y_fm, cv=10, scoring='accuracy')
accuracy_fm = np.mean(fold_scores_fm)
print(f"\nCross-validated accuracy after filter method: {accuracy_fm:.4f}")

<div id="rfe"> 
    <h2>Recursive Feature Elimination (RFE)</h2>    
</div>

In [58]:
# The label is the 'Outcome' column; every other column is a feature
y = df_final_adp['Outcome']
X = df_final_adp.drop(columns='Outcome')

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [None]:
# Base estimator whose coefficients rank the features
# (max_iter raised to 1000 to give the solver room to converge)
reg_model = LogisticRegression(max_iter=1000)

# Recursive feature elimination down to 6 features
rfe = RFE(reg_model, n_features_to_select=6)

# Fit the selector on the training portion
rfe.fit(X_train, y_train)

In [None]:
# Column labels that the fitted selector kept (rfe.support_ is its boolean mask)
significant_features_rfe = X.columns[rfe.support_]

selected_names_rfe = significant_features_rfe.tolist()
print("Selected features using RFE:")
print(selected_names_rfe)

In [61]:
# Validate RFE
# Label column of the combined frame
y = df_final_adp['Outcome']

# Features: only the columns kept by RFE
X_rfe = df_final_adp[significant_features_rfe]

In [None]:
# 1-nearest-neighbour classifier used to score the RFE-selected features
clf_rfe = KNeighborsClassifier(n_neighbors=1)

# 10-fold cross-validation; report the mean fold accuracy
fold_scores_rfe = cross_val_score(clf_rfe, X_rfe, y, cv=10, scoring='accuracy')
accuracy_rfe = np.mean(fold_scores_rfe)
print(f"\nCross-validated accuracy after RFE: {accuracy_rfe:.4f}")

<div id="output"> 
    <h2>Output the results</h2>    
</div>  

In [None]:
# Side-by-side summary of the two feature-selection experiments
# (both values are mean 10-fold cross-validated accuracies computed in the cells above)
print('\nFilter method result:', accuracy_fm)     # Mean CV accuracy with the correlation-filtered features
print('\nRFE result:', accuracy_rfe)              # Mean CV accuracy with the RFE-selected features

In [None]:
# Announce that the RFE feature set is carried forward
# NOTE(review): the "better accuracy" claim is hard-coded in the message; it is not
# checked against accuracy_fm / accuracy_rfe
print('\nUsing features that obtained from Recursive Feature Elimination (RFE) because it had better accuracy \n')

**Final dataset** after feature selection

In [None]:
# Final dataset after feature selection (Recursive Feature Elimination (RFE))
# Print the column labels that RFE kept (a pandas Index, so its repr is shown)
print('\nselected features:\n', significant_features_rfe)

In [None]:
# Create the final dataset: the RFE-selected feature columns followed by the target.
# All columns are selected in a single step and copied, so df_final is an independent
# frame; adding 'Outcome' afterwards to a column-slice of df_final_adp would be an
# assignment on a slice (SettingWithCopy) and would rely on index alignment
final_columns = list(significant_features_rfe) + ['Outcome']
df_final = df_final_adp[final_columns].copy()

# Display the final dataset
print("Final dataset with selected features and target column:")
display(df_final)

<div id="classification"> 
    <h2>Classification</h2>    
</div>
<div>
    <ol>
        <li><a href="#nb">Naive Bayes</a></li>   
    </ol>
</div>
<br>
<hr>

In [67]:
# Separate features and target variable
X = df_final.drop('Outcome', axis=1)              # Features
y = df_final['Outcome']                           # Target variable

# Recover the original train/test partition instead of drawing a new random split.
# df_final keeps the row order of df_final_adp, which stacks df_train_final (the
# SMOTE-oversampled training rows) first and df_test_final (the untouched test rows)
# after it. Splitting the combined frame at random would place synthetic samples in
# the test set and original test rows in the training set, inflating the test scores.
n_train_rows = len(df_train_final)
X_train, X_test = X.iloc[:n_train_rows], X.iloc[n_train_rows:]
Y_train, Y_test = y.iloc[:n_train_rows], y.iloc[n_train_rows:]

In [None]:
# Shapes of the train/test features and labels used by the classifier below
print('\nThe shape of the X_train dataset -->', X_train.shape)
print('\nThe shape of the Y_train dataset -->', Y_train.shape)
print('\nThe shape of the X_test dataset -->', X_test.shape)
print('\nThe shape of the Y_test dataset -->', Y_test.shape)
print('\n')

<div id="nb">   
    <h2>Naive Bayes</h2>    
</div>  
<div>  
    <ol>  
        <li>  
            <a href="#valid">Validating</a>  
            <ol>   
                <li><a href="#holdout">Holdout</a></li>   
                <li><a href="#rrs">Repeated Random Sampling</a></li>    
            </ol>  
        </li>  
        <li><a href="#test">Testing</a></li>  
        <li><a href="#roc">ROC plot and AUC score</a></li> 
        <li><a href="#output">Output the results</a></li> 
    </ol>  
</div>  
<br>  
<hr>

<div id="holdout"> 
    <h2>Holdout</h2>    
</div>

In [None]:
# Holdout validation
# Carve a 20% validation set out of the training data (fixed seed)
x_train, x_val, y_train, y_val = train_test_split(
    X_train, Y_train, random_state=0, test_size=0.2
)

# Gaussian Naive Bayes classifier for the holdout run
clf_nb_h = GaussianNB()

# Fit on the reduced training set
clf_nb_h.fit(x_train, y_train)

In [None]:
# Score the holdout model on the validation set
y_predict = clf_nb_h.predict(x_val)
accuracy_score_holdout = accuracy_score(y_val, y_predict)

# Report the validation accuracy
print('\nHoldout result:')
print('\nAccuracy  -->', accuracy_score_holdout)
print('\n')

<div id="rrs"> 
    <h2>Repeated Random Sampling</h2>    
</div>

In [71]:
# Repeated random sampling

Accuracy = []             # Validation accuracy of each repetition
num_repeats = 10          # Number of times to repeat random sampling

# Perform repeated random sampling
for i in range(num_repeats):

    # Draw a different 80/20 train/validation split on every repetition: the loop index
    # is the seed, so the splits differ from each other yet are reproducible. A constant
    # seed would produce the same split, and so the same accuracy, ten times over.
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=i)

    # Train a fresh Naive Bayes classifier on this repetition's training part
    clf_nb_rrs = GaussianNB()
    clf_nb_rrs.fit(x_train, y_train)

    # Score it on this repetition's validation part
    y_predict = clf_nb_rrs.predict(x_val)
    Accuracy.append(accuracy_score(y_val, y_predict))

In [None]:
# Tabulate the per-repetition accuracies and average them
df_Accuracy = pd.DataFrame({'Accuracy': Accuracy})

print('\nAccuracy in 10 iterations for different train and validation sets:\n')
display(df_Accuracy)

accuracy_score_rrs = df_Accuracy['Accuracy'].mean()
print('\nThe mean of different accuracies for validating the model -->', accuracy_score_rrs)
print('\n')

<div id="test"> 
    <h2>Testing</h2>    
</div>

In [73]:
# Testing
# Fit a Gaussian Naive Bayes classifier on the full training set
# (fit returns the estimator itself, so construction and fitting are chained)
clf_nb = GaussianNB().fit(X_train, Y_train)

# Predicted labels for the test set
Y_predict = clf_nb.predict(X_test)

In [None]:
# Test-set metrics of the final Naive Bayes model
print('\nTesting the model:\n')

accuracy_score_nb_testing = accuracy_score(Y_test, Y_predict)
test_recall = recall_score(Y_test, Y_predict)
test_precision = precision_score(Y_test, Y_predict)
test_f1 = f1_score(Y_test, Y_predict)

print('\nAccuracy  -->', accuracy_score_nb_testing)
print('\nRecall or Sensitivity or TPR --->', test_recall)
print('\nPrecision -->', test_precision)
print('\nF1_score -->', test_f1)
print('\n')

In [None]:
# Per-class precision / recall / F1 table for the test set
report_text = classification_report(Y_test, Y_predict)
print('\nClassification report:\n', report_text)

In [None]:
# Confusion matrix of the test predictions.
# The predictions are passed as the first argument on purpose, so rows correspond to
# the predicted class and columns to the actual class.
# The result is stored under its own name so the imported `confusion_matrix` function
# is not overwritten.
conf_mat = metrics.confusion_matrix(Y_predict, Y_test)

# Label the matrix for this dataset (Outcome 0 = no diabetes, 1 = diabetes);
# the previous "benign"/"malignant" labels described a different dataset
confusion_matrix_dataframe = pd.DataFrame(
    conf_mat,
    columns=['actual non-diabetic', 'actual diabetic'],
    index=['predicted non-diabetic', 'predicted diabetic'],
)
print("\nConfusion matrix:\n")
display(confusion_matrix_dataframe)
print('\n')

<div id="roc"> 
    <h2>ROC plot and AUC score</h2>    
</div>

In [77]:
# ROC
def plot_roc_curve(y_test, y_prid):
    """Draw the ROC curve for the given true labels and predictions on the current axes.

    Parameters
    ----------
    y_test : true labels, passed to ``roc_curve`` as its first argument.
    y_prid : predictions or scores, passed to ``roc_curve`` as its second argument.

    Returns
    -------
    None. The curve is added to the active matplotlib figure.
    """
    # roc_curve also returns the thresholds, which are not needed for the plot
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_prid)

    plt.plot(false_positive_rate, true_positive_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
# ROC plot and AUC score
# Rank the test samples by the predicted probability of the positive class (column 1
# of predict_proba). Hard 0/1 predictions would collapse the ROC curve to a single
# operating point and make the AUC depend on one arbitrary threshold.
Y_score = clf_nb.predict_proba(X_test)[:, 1]
plot_roc_curve(Y_test, Y_score)

# The scorer is called through the `metrics` module because the result is stored under
# the name `roc_auc_score` (read by the summary cell below), which hides the imported
# function; a bare call would fail when this cell is run a second time
roc_auc_score = metrics.roc_auc_score(Y_test, Y_score)
print('\nAUC score:', roc_auc_score)
print('\n')

<div id="output"> 
    <h2>Output the results</h2>    
</div>

In [None]:
# Final summary: validation accuracies, test accuracy and AUC of the Naive Bayes model
# NOTE: `roc_auc_score` here is the numeric AUC assigned in the ROC cell above,
# not the scikit-learn function of the same name
print('\nHoldout result:', accuracy_score_holdout)                       # Accuracy on the single holdout validation split
print('\nRepeated random sampling result:', accuracy_score_rrs)          # Mean accuracy over the repeated validation splits
print('\nNaive Bayes testing result:', accuracy_score_nb_testing)        # Accuracy on the test set
print('\nAUC score:', roc_auc_score)                                     # AUC on the test set