In [1]:
# Dependencies and setup
import pandas as pd
from pathlib import Path
# from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

In [2]:
# File to load
to_load = Path("cancer_patient_datasets.csv")

In [3]:
# Create the dataframe
lung_df = pd.read_csv(to_load)
lung_df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [4]:
# check the data types
lung_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     1000 non-null   int64 
 1   Patient Id                1000 non-null   object
 2   Age                       1000 non-null   int64 
 3   Gender                    1000 non-null   int64 
 4   Air Pollution             1000 non-null   int64 
 5   Alcohol use               1000 non-null   int64 
 6   Dust Allergy              1000 non-null   int64 
 7   OccuPational Hazards      1000 non-null   int64 
 8   Genetic Risk              1000 non-null   int64 
 9   chronic Lung Disease      1000 non-null   int64 
 10  Balanced Diet             1000 non-null   int64 
 11  Obesity                   1000 non-null   int64 
 12  Smoking                   1000 non-null   int64 
 13  Passive Smoker            1000 non-null   int64 
 14  Chest Pain               

In [5]:
# drop unnecessary columns index and Patient Id
lung_df = lung_df.drop(columns=['index', 'Patient Id'])
lung_df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [6]:
nn_df = lung_df


In [None]:
# transform levels to int ( fit_transform gives value 1 = low, 2 = medium, 0 = high)
# label_encoding = LabelEncoder()
# lung_df['Level'] = label_encoding.fit_transform(lung_df['Level'])

# use rename to reorder
lung_df.Level = lung_df.Level.replace("Low", 0)
lung_df.Level = lung_df.Level.replace("Medium", 1)
lung_df.Level = lung_df.Level.replace("High", 2)

lung_df.Level = lung_df.Level.astype("int64")



lung_df.head()

In [11]:
lung_df.describe()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,37.174,1.402,3.84,4.563,5.165,4.84,4.58,4.38,4.491,4.465,...,3.856,3.855,4.24,3.777,3.746,3.923,3.536,3.853,2.926,1.062
std,12.005493,0.490547,2.0304,2.620477,1.980833,2.107805,2.126999,1.848518,2.135528,2.124921,...,2.244616,2.206546,2.285087,2.041921,2.270383,2.388048,1.832502,2.039007,1.474686,0.815365
min,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,27.75,1.0,2.0,2.0,4.0,3.0,2.0,3.0,2.0,3.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
50%,36.0,1.0,3.0,5.0,6.0,5.0,5.0,4.0,4.0,4.0,...,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,1.0
75%,45.0,2.0,6.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,...,5.0,6.0,6.0,5.0,5.0,5.0,5.0,6.0,4.0,2.0
max,73.0,2.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,7.0,...,9.0,8.0,9.0,8.0,8.0,9.0,7.0,7.0,7.0,2.0


In [None]:
lung_df.hist(figsize=(12,12))
plt.show()

In [None]:
plt.figure(figsize=(16,16))
sns.heatmap(lung_df.corr(),annot=True,fmt=".0%")
plt.savefig('heatmap.png')
plt.show()

In [None]:
# Create boxplots to show any outliers
boxplot_lung_df = lung_df.drop('Level', axis = 1)
for column in boxplot_lung_df:
  plt.figure(figsize=(4, 2))
  sns.boxplot(x=boxplot_lung_df[column])
  plt.title(f"Boxplot of {column}")
  plt.show()

## Training the models

In [None]:
X = lung_df.drop('Level', axis = 1)
y = lung_df['Level'].values

In [7]:
nn_df.Level = nn_df.Level.replace("Low", 0)
nn_df.Level = nn_df.Level.replace("Medium", 1)
nn_df.Level = nn_df.Level.replace("High", 2)

nn_df.Level = nn_df.Level.astype("int64")

X = nn_df.drop("Level", axis = 1)
y = pd.get_dummies(nn_df["Level"])

In [8]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# run log regression on pre scaled models
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

ValueError: y should be a 1d array, got an array of shape (750, 3) instead.

In [None]:
# run log regression on scaled models
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [15]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','softmax'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=23))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=3, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [16]:

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 02s]
val_accuracy: 0.9399999976158142

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 00m 56s


In [18]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 7,
 'num_layers': 2,
 'units_0': 7,
 'units_1': 7,
 'units_2': 9,
 'units_3': 3,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0014',
 'units_4': 1,
 'units_5': 9}

In [19]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8/8 - 0s - 9ms/step - accuracy: 0.3800 - loss: 9.4523
Loss: 9.45230484008789, Accuracy: 0.3799999952316284


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  model.build_from_config(
  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 =  30
hidden_nodes_layer2 = 20

nn1 = tf.keras.models.Sequential()

In [None]:
# First hidden layer
nn1.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn1.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn1.add(tf.keras.layers.Dense(units=3, activation="softmax"))

# Check the structure of the model
nn1.summary()

In [None]:
# Compile the model
nn1.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

In [None]:
# Train the model use the callback
fit_model = nn1.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn1.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export our model to HDF5 file
# Define the file path for saving the model
filepath = "LungCancer.h5"

# Save the model to HDF5 format
nn1.save(filepath)

In [None]:
# K neighbors 
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))