### **Lab Title:** Hyperparameter Tuning and Data Splitting in Deep Neural Networks

#### **Objective:**
1. Learn how to build and train a Deep Neural Network (DNN) using Keras.
2. Practice adjusting key hyperparameters, including learning rate, batch size, and epochs.
3. Compare model performance between using a simple train-test split and x-fold cross-validation.

#### **Prerequisites:**
- Basic knowledge of Python and neural networks.
- Understanding of Keras syntax and usage.
- Familiarity with key concepts: hyperparameters, train-test split, and cross-validation.

#### **Dataset:**
Use the provided Telco customer dataset (`Simulated_Telco_Data_with_StreamingMovies_Label.csv`). This dataset includes customer information and a target variable related to streaming movie subscriptions, which students will use to predict customer behavior.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Load and preprocess data
data = pd.read_csv('https://raw.githubusercontent.com/9meo/bas240/refs/heads/main/MTS484/Simulated_Telco_Data_with_StreamingMovies_Label.csv')

# Convert 'TotalCharges' to numeric, coercing errors for any invalid entries
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

# Drop irrelevant columns (e.g., 'customerID')
data = data.drop(columns=['customerID'])

# Encode categorical columns
label_encoders = {}
for column in data.select_dtypes(include='object').columns:
    if column != 'Label':
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

# Encode target variable
data['Label'] = data['Label'].map({'Yes': 1, 'No': 0})

# Separate features and target
X = data.drop(columns=['Label'])
y = data['Label']

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.4067 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 2/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3929 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 3/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3944 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 4/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3942 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 5/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4041 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 6/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4094 - loss: nan - val_accuracy: 0.3656 - val_loss: nan
Epoch 7/20
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Accuracy

# Load and preprocess data
data =  pd.read_csv('https://raw.githubusercontent.com/9meo/bas240/refs/heads/main/MTS484/Simulated_Telco_Data_with_StreamingMovies_Label.csv')

# Convert 'TotalCharges' to numeric, handling errors
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

# Drop irrelevant columns
data = data.drop(columns=['customerID'])

# Encode categorical columns
label_encoders = {}
for column in data.select_dtypes(include='object').columns:
    if column != 'Label':
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

# Encode target variable
data['Label'] = data['Label'].map({'Yes': 1, 'No': 0})

# Separate features and target
X = data.drop(columns=['Label'])
y = data['Label']

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define the K-fold Cross Validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize variables to store results
fold_no = 1
accuracies = []

for train_index, test_index in kf.split(X):
    print(f'Processing fold #{fold_no}')

    # Split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Build the model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer=Adam(), loss=BinaryCrossentropy(), metrics=[Accuracy()])

    # Train the model
    model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0)

    # Evaluate the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    accuracy = scores[1] * 100
    print(f'Fold #{fold_no} - Test Accuracy: {accuracy:.2f}%')
    accuracies.append(accuracy)

    fold_no += 1

# Calculate average accuracy
average_accuracy = np.mean(accuracies)
print(f'Average Test Accuracy: {average_accuracy:.2f}%')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Processing fold #1
Fold #1 - Test Accuracy: 0.00%
Processing fold #2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold #2 - Test Accuracy: 0.00%
Processing fold #3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold #3 - Test Accuracy: 0.00%
Processing fold #4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold #4 - Test Accuracy: 0.00%
Processing fold #5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold #5 - Test Accuracy: 0.00%
Average Test Accuracy: 0.00%
