# Testing a model using neural networks

In [None]:
# Packages used for developing models, as well as processing information

from IPython import get_ipython
from IPython.display import display
# %%
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from warnings import filterwarnings
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer

filterwarnings('ignore')

In [None]:

# Load the data set. The file is ';'-delimited and malformed rows are skipped
# instead of aborting the read.
# Some values were removed from the data set on purpose, so that the notebook
# can demonstrate filling in empty fields and turning text data into numbers.

Dataset = pd.read_csv(
    'breast_cancer.csv',
    sep=';',
    on_bad_lines='skip',
)
# Quick inspection helpers (uncomment when needed):
# Dataset.info()
# Dataset.describe()

# Checking Some Measurements

In [None]:
# Configure seaborn's global font scale and figure size, then draw one
# histogram per column to inspect how the data are distributed.
sns.set(
    font_scale=1.5,
    rc={'figure.figsize': (20, 20)},
)
eixo = Dataset.hist(bins=20, color='red')

# Processing Some Variables

In [None]:
# Convert decimal-comma text columns into float values
def float_values(base, textos):
    """Parse decimal-comma text columns of ``base`` into floats, in place.

    Args:
        base: DataFrame to convert. It is modified and also returned.
        textos: Iterable of column names holding numbers written as text
            with ',' as the decimal separator. The columns 'target' and
            'Family Case' are never converted, even when listed.

    Returns:
        The same ``base`` object, with the converted columns as float.

    Raises:
        ValueError: If a listed column holds text that is not a number.
    """
    skipped = ('target', 'Family Case')
    for column in textos:
        if column in skipped:
            continue
        # regex=False: ',' is replaced literally, independent of the pandas
        # version's default for the ``regex`` argument.
        as_text = base[column].str.replace(',', '.', regex=False)
        base[column] = as_text.astype(float)
    return base

# Encode a text target as a binary indicator
def target_variable(base):
    """Replace the 'target' column with 1 for 'Malignant' and 0 otherwise.

    The frame is modified in place and also returned.
    """
    is_malignant = base['target'].eq('Malignant')
    base['target'] = np.where(is_malignant, 1, 0)
    return base

# One-hot encode selected feature columns
def dummy_df(base, X, c):
    """One-hot encode the columns of ``X`` whose names appear in ``c``.

    Args:
        base: DataFrame whose columns, except the last one, name the columns
            of ``X`` in order (the last column is treated as the target).
        X: 2-D array of feature values, one column per feature name.
        c: Collection of column names to encode. Names that are not feature
            columns (for example the last column of ``base``) are ignored.

    Returns:
        A tuple ``(X, Lista_variaveis)``. ``X`` holds the untouched features
        followed by the dummy columns, and ``Lista_variaveis`` names its
        columns in the same order. When nothing has to be encoded, ``X`` is
        returned unchanged together with the original feature names.
    """
    feature_names = list(base.columns[:-1])
    # Restrict to real feature columns: X has no column for the target, so
    # looking it up would fail.
    objetos = [name for name in feature_names if name in c]

    if not objetos:
        # The name list must still describe X, otherwise later steps that
        # label the columns of X cannot work.
        return X, feature_names

    X_df = pd.DataFrame(X, columns=feature_names)
    dummies = [
        pd.get_dummies(X_df[name], prefix=name, dtype='int')
        for name in objetos
    ]
    # Untouched features first, then the dummy columns in encoding order.
    X_df = pd.concat([X_df.drop(columns=objetos), *dummies], axis=1)
    return X_df.values, list(X_df.columns)

def normalize_df(X, Lista_variaveis):
    """Min-max scale every column of ``X``.

    Args:
        X: 2-D array of feature values.
        Lista_variaveis: Column names for ``X``, one per column.

    Returns:
        The scaled values as returned by ``MinMaxScaler.fit_transform``.
        The fitted scaler itself is not returned.
    """
    labelled = pd.DataFrame(data=X, columns=Lista_variaveis)
    return MinMaxScaler().fit_transform(labelled)



In [None]:
# Columns stored as text in the raw file (decimal-comma numbers and labels).
textos = Dataset.select_dtypes(include=['object']).columns
Dataset = float_values(Dataset, textos)
Dataset = target_variable(Dataset)

# Only the columns that are STILL text after the conversions above are
# categorical. Taking this list before the conversions would also include the
# columns that were just turned into floats, and the target column.
c = Dataset.select_dtypes(include=['object']).columns

# Features are every column but the last; the last column is the target.
X = Dataset.iloc[:, :-1].values
y = Dataset.iloc[:, -1].values

In [None]:
# Fill the missing values with the mean of their column.
# Positions (not names) are collected because X is a plain NumPy array.
missing_cols_indices = [
    position
    for position, has_missing in enumerate(Dataset.isna().any())
    if has_missing
]

# Fit on the affected columns only, then write the imputed values back.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(
    X[:, missing_cols_indices]
)
X[:, missing_cols_indices] = imputer.transform(X[:, missing_cols_indices])


In [None]:
X, Lista_variaveis = dummy_df(Dataset, X, c)

# Fit the scaler at notebook level (normalize_df creates its scaler locally
# and discards it) so that a later cell can call
# min_max_scaler.inverse_transform to recover the original feature scale.
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(pd.DataFrame(data=X, columns=Lista_variaveis))


# Model Development and Results

In [None]:
# Neural network model

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Integer labels avoid the "unknown label type" error.
ytrain = ytrain.astype(int)
ytest = ytest.astype(int)

num_neu = len(Lista_variaveis)            # one input unit per feature
targ = 2                                  # number of output classes
ocult_neu = int((num_neu * (2 / 3)) + 2)  # hidden width: 2/3 of the inputs plus 2

# Two ReLU layers, each followed by 50% dropout, and a softmax output layer.
neuro = keras.Sequential([
    Dense(
        num_neu,
        input_shape=(num_neu,),
        activation='relu',
        kernel_initializer='he_normal',
    ),
    Dropout(0.5),
    Dense(ocult_neu, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.5),
    Dense(targ, activation='softmax'),
])

In [None]:
# Training settings
neuro.compile(
    optimizer=keras.optimizers.Adam(),
    # Sparse categorical cross-entropy takes the integer class labels directly.
    # The original note mentions binary_crossentropy as an alternative;
    # NOTE(review): that would presumably need a single-unit sigmoid output
    # layer instead of the two-unit softmax — confirm before switching.
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters passed to neuro.fit as batch_size and epochs.
BATCH_SIZE = 4
EPOCHS = 15

In [None]:
# Train on the training split. The call is left as a bare expression, so the
# returned History object is shown as the cell output.
neuro.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=EPOCHS)

Epoch 1/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5218 - loss: 0.9539
Epoch 2/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6217 - loss: 0.6992
Epoch 3/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7059 - loss: 0.5618
Epoch 4/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7503 - loss: 0.5066
Epoch 5/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8649 - loss: 0.3812
Epoch 6/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8315 - loss: 0.3703
Epoch 7/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8838 - loss: 0.3126
Epoch 8/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9036 - loss: 0.2646
Epoch 9/15
[1m114/114[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7be1382a1210>

In [None]:
# Evaluate on the held-out split. The cell output is the list
# [loss, accuracy], accuracy being the metric configured in neuro.compile.
neuro.evaluate(Xtest, ytest)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9808 - loss: 0.0768  


[0.09611456096172333, 0.9649122953414917]

In [None]:
# KS and AUC on the validation (test) split and on the development (train) split.
# The KS statistic of a binary classifier is the largest gap between the
# cumulative score distributions of the two classes, which equals
# max(TPR - FPR) along the ROC curve. Passing the fpr/tpr arrays themselves to
# stats.ks_2samp would instead compare the ROC coordinates as if they were two
# independent samples, which is a different quantity.
probs = np.round(neuro.predict(Xtest, verbose=0)[:, 1], 7)
fpr, tpr, thresholds = roc_curve(ytest, probs)
print('KS  Validation: {0:.2f}% e AUC: {1:.2f}%'.format(np.max(tpr - fpr) * 100.0, auc(fpr, tpr) * 100))
# Range of the predicted probabilities of class 1.
print((probs.max(), probs.min()))

probs = np.round(neuro.predict(Xtrain, verbose=0)[:, 1], 7)
fpr, tpr, thresholds = roc_curve(ytrain, probs)
print('KS Development: {0:.2f}% e AUC: {1:.2f}%'.format(np.max(tpr - fpr) * 100.0, auc(fpr, tpr) * 100))
print((probs.max(), probs.min()))



# Scorer with the (estimator, X, y) signature used by permutation_importance
def scoring_fn(estimator, X, y):
    """Return the accuracy of ``estimator`` on ``(X, y)``.

    The output of ``estimator.predict`` is read as one row of per-class
    scores per sample; the index of the largest score in each row is taken
    as the predicted class.
    """
    class_scores = estimator.predict(X, verbose=0)
    predicted_labels = np.argmax(class_scores, axis=1)
    return accuracy_score(y, predicted_labels)

# Permutation feature importance on the held-out split, scored by scoring_fn.
result = permutation_importance(
    neuro,
    Xtest,
    ytest,
    scoring=scoring_fn,
    n_repeats=10,
    random_state=0,
)

# One row per feature, most important first.
# Assumes Lista_variaveis lists the feature names in the column order of Xtest.
feature_importances = pd.DataFrame(
    {"feature": Lista_variaveis, "importance": result.importances_mean}
).sort_values(by="importance", ascending=False)


# Prints every feature; use feature_importances.head(5) for the top five only.
print(feature_importances)

In [None]:
# Build a table with the features on their original scale next to the true
# class ('Tarq') and the predicted class ('Pred') for every row.
# NOTE(review): relies on a fitted `min_max_scaler` being available at
# notebook scope — confirm it is defined before running this cell.
y_pred = list(np.argmax(neuro.predict(X, verbose=0), axis=1))
X1 = min_max_scaler.inverse_transform(X)
df = pd.DataFrame(data=X1, columns=Lista_variaveis)
df2 = pd.DataFrame({'Tarq': y})
df3 = pd.DataFrame({'Pred': y_pred})
df = pd.concat([df, df2, df3], axis=1)
df

# Testing Pipelines
