# Testing a model using neural networks

In [162]:
# Packages used for developing models, as well as processing information

from IPython import get_ipython
from IPython.display import display
# %%
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from warnings import filterwarnings
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer

filterwarnings('ignore')

In [178]:

#Reading the database and checking some information
#I purposely eliminated some information from the data set so that I could apply methods to fill
#in empty fields and transform text data into numeric values.

Dataset = pd.read_csv('breast_cancer.csv',sep=';', on_bad_lines='skip')
#Dataset.info()
#Dataset.describe()

# Checking Some Measurements

In [None]:
sns.set(font_scale=1.5,rc={'figure.figsize':(20,20)}) #usando a biblioteca sns posso verificar algumas distribuições dos meus dados
eixo=Dataset.hist(bins=20,color='red')

# Processing Some Variables

In [179]:
class processing_data:
  def __init__(self,base=None,label='Processing Dataset'):
    self.base = base
    self.label = label

  # select objetc columns and transform into float values
  def float_values(self,str_var):
    textos =self.base.select_dtypes(include=['object']).columns
    for column in textos:
      if column not in str_var:
        self.base[column] = self.base[column].str.replace(',', '.').astype(float)
    self.base.select_dtypes(include=['object']).columns
    return self.base

  #if my target is a text I transform in dummy value
  def target_variable(self,target,variavel):
    #base = pd.Series(base)
    self.base[target] = np.where(self.base[target]==variavel, 1, 0)
    return self.base

  # transform in dummies some variables
  def dummy_df(self,X,dummies):
      X = pd.get_dummies(X, prefix=dummies, columns=dummies,dtype='int')
      return X

  def normalize_df(self,X,Lista_variaveis):
    #Normalize the data
    X =pd.DataFrame(data=X, columns=Lista_variaveis)
    min_max_scaler =MinMaxScaler()
    X = min_max_scaler.fit_transform(X)
    return X



In [180]:
processor = processing_data(base=Dataset)
processor.float_values(['Family Case','target'])
processor.target_variable('target','Malignant')
processor.dummy_df(Dataset,['Family Case'])
processor.base=processor.dummy_df(processor.base,['Family Case'])

column_to_move = processor.base.pop("target")
processor.base['target'] = column_to_move

X=processor.base.iloc[:,:-1].values
y=processor.base.iloc[:,-1].values


In [181]:
#In this step I select my data set and transform other text variables into numeric ones to facilitate the processing of my model.
# Get the indices of columns with missing values
missing_cols_indices = [Dataset.columns.get_loc(col) for col in Dataset.columns[Dataset.isna().any()]]

# Impute missing values using these indices
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, missing_cols_indices])
X[:, missing_cols_indices] = imputer.transform(X[:, missing_cols_indices])


In [169]:
Lista_variaveis=Lista_variaveis = processor.base.columns[:-1]
X=processor.normalize_df(X,Lista_variaveis)


# Model Development and Results

In [172]:
#Neural Network Model

Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.2,random_state=0)
# Convert ytrain and ytest to integer labels before fitting
ytrain = ytrain.astype(int) # Convert to integer type to avoid "unknown label type" error.
ytest = ytest.astype(int) # Convert to integer type to avoid "unknown label type" error.

num_neu= len(Lista_variaveis)
targ=2
ocult_neu=int((num_neu*(2/3))+2)

neuro=tf.keras.models.Sequential([tf.keras.layers.Dense(num_neu,input_shape=(len(Lista_variaveis),)
                                               ,activation='relu',kernel_initializer='he_normal'),
                           tf.keras.layers.Dropout(0.5),
                           tf.keras.layers.Dense(ocult_neu,activation='relu',kernel_initializer='he_normal'),
                           tf.keras.layers.Dropout(0.5),
                           tf.keras.layers.Dense(2, activation='softmax')])

In [173]:
#Train setings
neuro.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), #or binary_crossentropy
              metrics=['accuracy'])

In [174]:
BATCH_SIZE = 4
EPOCHS = 15

In [175]:
neuro.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=EPOCHS)

Epoch 1/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5054 - loss: 0.7777
Epoch 2/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6166 - loss: 0.6835
Epoch 3/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7615 - loss: 0.5296
Epoch 4/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7987 - loss: 0.4380
Epoch 5/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8273 - loss: 0.3946
Epoch 6/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8562 - loss: 0.3512
Epoch 7/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8409 - loss: 0.3411
Epoch 8/15
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9072 - loss: 0.2709
Epoch 9/15
[1m114/114[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7b76626e3250>

In [176]:
neuro.evaluate(Xtest, ytest)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9787 - loss: 0.0753  


[0.10502660274505615, 0.9649122953414917]

In [177]:
probs = np.round(neuro.predict(Xtest, verbose=0)[:,1], 7)
fpr, tpr, thresholds = roc_curve(ytest, probs)
#Performs the two-sample Kolmogorov-Smirnov test for goodness of fit.
#This test compares the underlying continuous distributions F(x) and G(x) of two independent samples
print('KS  Validation: {0:.2f}% e AUC: {1:.2f}%'.format(stats.ks_2samp(fpr, tpr)[0] * 100.0, auc(fpr, tpr) * 100))
print((probs.max(),probs.min()))

probs = np.round(neuro.predict(Xtrain, verbose=0)[:,1], 7)
fpr, tpr, thresholds = roc_curve(ytrain, probs)
print('KS Development: {0:.2f}% e AUC: {1:.2f}%'.format(stats.ks_2samp(fpr, tpr)[0] * 100.0, auc(fpr, tpr) * 100))
print((probs.max(),probs.min()))



# Define a scoring function for permutation_importance
def scoring_fn(estimator, X, y):
    y_pred = np.argmax(estimator.predict(X, verbose=0), axis=1)  # Get predicted classes
    return accuracy_score(y, y_pred)  # Calculate accuracy

# Calculate permutation feature importance using the scoring function
result = permutation_importance(
    neuro, Xtest, ytest, n_repeats=10, random_state=0, scoring=scoring_fn
)
# Create a DataFrame to store the results
feature_importances = pd.DataFrame(
    {
        "feature": Lista_variaveis,  # Assuming Lista_variaveis contains feature names
        "importance": result.importances_mean,
    }
).sort_values("importance", ascending=False)


print(feature_importances) # Print the top 5 most important features or change the number to see others

KS  Validation: 66.67% e AUC: 99.36%
(np.float32(0.9999983), np.float32(1e-07))
KS Development: 80.00% e AUC: 99.39%
(np.float32(0.9999956), np.float32(3e-07))
                       feature  importance
30              Family Case_no    0.078947
6               mean concavity    0.028947
27        worst concave points    0.019298
0                  mean radius    0.014035
32             Family Case_yes    0.013158
7          mean concave points    0.011404
23                  worst area    0.009649
15           compactness error    0.007895
28              worst symmetry    0.006140
29     worst fractal dimension    0.005263
22             worst perimeter    0.004386
31  Family Case_no information    0.004386
12             perimeter error    0.004386
9       mean fractal dimension    0.002632
10                radius error    0.001754
8                mean symmetry    0.001754
13                  area error    0.001754
3                    mean area    0.000877
19     fractal dimensio

In [None]:
y_pred=neuro.predict(X, verbose=0)
y_pred = [np.argmax(v) for v in y_pred]
X1 = min_max_scaler.inverse_transform(X)
df = pd.DataFrame(data=X1, columns=Lista_variaveis)
df2 = pd.DataFrame(data=y,columns=['Tarq'])
df3 = pd.DataFrame(data=y_pred,columns=['Pred'])
df = pd.concat([df,df2,df3],axis=1)
df

# Testing Pipelines
