In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

# Load the data set from 'data.csv' (path relative to the working directory) into a DataFrame
data = pd.read_csv('data.csv')


In [2]:
import tensorflow as tf

# Every physical device (CPU, GPU, ...) that TensorFlow can see
print("Available devices:", tf.config.list_physical_devices())

# Only the GPU devices; an empty list means no GPU is visible
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPUs available:", gpu_devices)

# tf.test.is_gpu_available() is deprecated and its warning points to
# tf.config.list_physical_devices('GPU'), so derive the flag from that list
print("Is GPU available:", bool(gpu_devices))


Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPUs available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available: True


In [3]:
# Check the TensorFlow version
print(tf.__version__)

2.10.0


In [4]:
# Turn on logging of the device each TensorFlow op is placed on (verbose output)
tf.debugging.set_log_device_placement(True)


import pandas as pd
import matplotlib.pyplot as plt

# Plot one 50-bin histogram per column of `data` to eyeball each distribution.
# Each figure is shown and then closed before the next one is created, so only
# one figure is open at a time (matplotlib warns once more than 20 figures are
# open, and memory grows with every unclosed figure).
for column in data.columns:
    fig, ax = plt.subplots()
    data[column].hist(bins=50, ax=ax)
    ax.set_title(column)
    plt.show()
    plt.close(fig)


In [None]:
# List the column names of the loaded DataFrame
print(data.columns)

Index(['ClmAdmitDiagnosisCode', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'PotentialFraud', 'Age', 'WeekendAdmission', 'IsDead',
       'ClaimSettlementDelay_Cat', 'TreatmentDuration_Cat',
       'Log_TotalClaimAmount', 'Log_IPTotalAmount', 'Log_OPTotalAmount',
       'UniquePhysCount', 'IsSamePhysMultiRole1', 'IsSamePhysMultiRole2',
       'PHY412132', 'PHY337425', 'PHY330576'],
      dtype='object')


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Separate the features from the 'PotentialFraud' target column
X = data.drop('PotentialFraud', axis=1)
y = data['PotentialFraud']

# Fit the VarianceThreshold transformer to identify low-variance features
threshold = 0.01  # This is an example threshold, adjust it according to your needs
sel = VarianceThreshold(threshold=threshold)
sel.fit(X)

# Boolean mask aligned with X.columns: True for features that pass the threshold
keep_mask = sel.get_support()
features_to_keep = X.columns[keep_mask]

# Features that did not pass, as a list in original column order so the
# report below is identical from run to run (a set has no stable order)
features_to_drop = X.columns[~keep_mask].tolist()
if features_to_drop:
    print(f"Features to drop due to low variance: {features_to_drop}")


Features to drop due to low variance: {'ClmDiagnosisCode_10', 'PHY412132', 'PHY330576', 'PHY337425', 'IsDead'}


In [None]:
# Drop the low-variance columns. Rebinding `data` (rather than inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# finds the columns already gone and is a no-op instead of raising KeyError.
data = data.drop(columns=list(features_to_drop), errors='ignore')

Dropping features with low variance is a technique used in feature selection to improve the performance of a model. The rationale behind this approach is based on the assumption that features with low variance carry less informative power to differentiate between the classes or outputs in your data. Here's a breakdown of why features with low variance might be considered for removal:

1. **Lack of Discriminative Power**: A feature with low variance means that it has very similar values for most of the samples. If a feature is almost constant, it provides little information to the model because it does not vary much between different instances. For instance, if a feature takes the same value in 99% of the samples, it's unlikely to help the model distinguish between different classes.

2. **Noise Reduction**: Features with very low variance might be mostly noise, and by removing them, you can reduce the noise level in your dataset, allowing the model to focus on more significant features.

3. **Computational Efficiency**: Removing features with low variance can reduce the dimensionality of the dataset, leading to faster training times and potentially less complex models. This is especially beneficial when dealing with large datasets.

4. **Overfitting Prevention**: High-dimensional data with many features relative to the number of samples can lead to overfitting, where the model learns the noise in the training data instead of the actual patterns. Reducing the number of features can help mitigate this risk.

5. **Model Interpretability**: Models with fewer features are generally easier to interpret and understand. By eliminating low-variance features, you can simplify the model, making it easier to analyze and explain.

However, it's important to apply this technique judiciously:

- **Context Matters**: Always consider the specific context and domain knowledge. Some low-variance features might still be important for prediction if they have a significant impact on the output variable in specific cases.
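- **Threshold Selection**: The choice of the variance threshold is subjective and should be informed by domain knowledge and exploratory data analysis. A threshold that's too high might eliminate important features, while a threshold that's too low might retain too much noise. Because variance depends on a feature's scale, a single threshold is only meaningful when the features are on comparable scales; for a 0/1 indicator the variance is $p(1-p)$, so a threshold of 0.01 removes indicators whose rarer value occurs in roughly 1% of the rows or fewer.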
- **Complementary Techniques**: Feature selection based on variance should be complemented with other feature selection techniques, such as univariate statistical tests, recursive feature elimination, or feature importance from models.

In summary, while dropping low-variance features can be beneficial in many scenarios, it's not a one-size-fits-all solution and should be considered as part of a broader strategy of feature engineering and selection.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=123,
)

# For each partition, pull the 'PotentialFraud' label apart from the features
X_train, y_train = train_data.drop(columns='PotentialFraud'), train_data['PotentialFraud']
X_test, y_test = test_data.drop(columns='PotentialFraud'), test_data['PotentialFraud']


In [None]:
# Show the distinct label values present in the training target
print(y_train.unique())


[1 0]


Define the Hypermodel

In [None]:
from keras_tuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

class MyHyperModel(HyperModel):
    """Keras Tuner hypermodel for a fully connected binary classifier.

    The searched space covers the width and dropout of the input block, the
    number of hidden blocks (1-3) with their width, activation, L2 strength
    and dropout, and the Adam learning rate.
    """

    def __init__(self, input_shape, name=None, tunable=True):
        """Store the input shape and initialise the HyperModel base class.

        Args:
            input_shape: Shape tuple passed as ``input_shape`` to the first
                Dense layer.
            name: Optional hypermodel name, forwarded to ``HyperModel``.
            tunable: Forwarded to ``HyperModel``; defaults to ``True``.
        """
        # The base initialiser must run so the attributes HyperModel itself
        # defines (e.g. name/tunable) exist on this instance.
        super().__init__(name=name, tunable=tunable)
        self.input_shape = input_shape

    def build(self, hp):
        """Build and compile one candidate model from the hyperparameters in ``hp``.

        Args:
            hp: Hyperparameter container supplied by the tuner.

        Returns:
            A compiled ``Sequential`` model with a single sigmoid output,
            binary cross-entropy loss and accuracy as the metric.
        """
        model = Sequential()

        # Input block: tunable width (64-256, step 32) and dropout rate (0-0.5)
        model.add(Dense(units=hp.Int('units_input', 64, 256, step=32),
                        activation='relu',
                        input_shape=self.input_shape))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_input', 0.0, 0.5)))

        # 1-3 hidden blocks; every block gets its own set of hyperparameters,
        # distinguished by the index suffix in the hyperparameter name
        for i in range(hp.Int('n_layers', 1, 3)):
            model.add(Dense(units=hp.Int(f'units_{i}', 32, 128, step=32),
                            activation=hp.Choice(f'activation_{i}', ['relu', 'elu', 'selu']),
                            kernel_regularizer=regularizers.l2(hp.Float(f'regularizer_{i}', 1e-5, 1e-3, sampling='log'))))
            model.add(BatchNormalization())
            model.add(Dropout(hp.Float(f'dropout_{i}', 0.0, 0.5)))

        # Single sigmoid unit for the binary target
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

# Instantiate the hypermodel; the input shape is a 1-tuple holding the number of columns in X_train
hypermodel = MyHyperModel(input_shape=(X_train.shape[1],))


Hyperparameter Tuning with Keras Tuner

In [None]:
# `keras_tuner` is the current package name (the hypermodel cell already
# imports from it); `kerastuner` is the legacy alias.
from keras_tuner import RandomSearch

# Initialize the Random Search tuner
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    seed=123,  # fixed seed so the sampled hyperparameter combinations are repeatable
    directory='my_dir',
    project_name='keras_tuner_demo',
    # Explicit default: results already stored under my_dir/keras_tuner_demo are
    # reloaded instead of re-running the search. Set to True (or delete the
    # directory) to start a fresh search.
    overwrite=False,
)

# Perform hyperparameter tuning; 20% of the training data is held out for validation
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)


Reloading Tuner from my_dir\keras_tuner_demo\tuner0.json


Trial 10 Complete [00h 14m 38s]

val_accuracy: 0.6420661807060242

Best val_accuracy So Far: 0.6638377010822296

Total elapsed time: 01h 52m 42s

Compile and Fit the Model with the Best Hyperparameters

In [None]:
def fit_and_evaluate(hps, optimizer, *, epochs, validation_split, patience, batch_size=None):
    """Build the tuned architecture, train it with early stopping and score it on the test set.

    Args:
        hps: Hyperparameter set passed to ``tuner.hypermodel.build``.
        optimizer: Optimizer (name or instance) the model is re-compiled with,
            replacing the one configured inside ``build``.
        epochs: Maximum number of training epochs.
        validation_split: Fraction of the training data ``fit`` holds out for validation.
        patience: Epochs without ``val_loss`` improvement before training stops.
        batch_size: Batch size forwarded to ``fit``; ``None`` uses the Keras default.

    Returns:
        Tuple ``(model, eval_result, early_stopping)`` where ``eval_result`` is
        the value returned by ``model.evaluate(X_test, y_test)``.
    """
    model = tuner.hypermodel.build(hps)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop once val_loss stalls and roll back to the best weights seen
    stopper = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=[stopper]
    )

    result = model.evaluate(X_test, y_test)
    print("[test loss, test accuracy]:", result)
    return model, result, stopper


# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Baseline run: tuned architecture re-compiled with plain SGD, short training.
# Stored under its own names so the final run below does not overwrite it.
sgd_model, sgd_eval_result, sgd_early_stopping = fit_and_evaluate(
    best_hps,
    'sgd',
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    patience=5,
)

# Final run: tuned architecture with Adam at the tuned learning rate
best_model, eval_result, early_stopping = fit_and_evaluate(
    best_hps,
    Adam(learning_rate=best_hps.get('learning_rate')),
    epochs=50,
    validation_split=0.2,
    patience=3,
)
