In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier
from imblearn.over_sampling import SMOTE


In [40]:
# Load the dataset
data = pd.read_csv("D:\MSIS\Customer-Churn-Prediction---Using-TensorFlow\Data\WA_Fn-UseC_-Telco-Customer-Churn.csv")


In [41]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [42]:
# Data preprocessing
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.drop(labels=data[data['tenure'] == 0].index, axis=0, inplace=True)
data.fillna(data["TotalCharges"].mean(), inplace=True)
data = data.drop(['customerID'], axis=1)


In [43]:

# Transforming object to int using Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
def object_to_int(dataframe_series):
    if dataframe_series.dtype == 'object':
        dataframe_series = encoder.fit_transform(dataframe_series)
    return dataframe_series

data = data.apply(lambda x: object_to_int(x))


In [44]:
# Data splitting
X = data.drop('Churn', axis=1)
y = data['Churn'].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [45]:
# Handle imbalance using SMOTE (oversampling the minority class)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)


In [46]:
%pip install imbalanced-learn

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:

# Define the models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier( random_state=42),
    'Support Vector Machine': SVC(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'Balanced Bagging': BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)
}

# Initialize lists to store results
results = []

# Evaluate each model
for model_name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Confusion Matrix': cm
    })

# Create a DataFrame to compare the models
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

                    Model  Accuracy  F1 Score           Confusion Matrix
0     Logistic Regression  0.745498  0.602517  [[1166, 383], [154, 407]]
1           Decision Tree  0.713270  0.480687  [[1225, 324], [281, 280]]
2           Random Forest  0.763981  0.557726  [[1298, 251], [247, 314]]
3  Support Vector Machine  0.763507  0.592653  [[1248, 301], [198, 363]]
4       Gradient Boosting  0.767773  0.612342  [[1233, 316], [174, 387]]
5                 XGBoost  0.759716  0.551724  [[1291, 258], [249, 312]]
6  Balanced Random Forest  0.767773  0.567138  [[1299, 250], [240, 321]]
7           Easy Ensemble  0.737441  0.599711  [[1141, 408], [146, 415]]
8        Balanced Bagging  0.748815  0.524237  [[1288, 261], [269, 292]]


In [48]:

# Define the models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Support Vector Machine': SVC(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), random_state=42),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'Balanced Bagging': BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)
}

# Initialize lists to store results
results = []

# Evaluate each model
for model_name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Confusion Matrix': cm
    })

# Create a DataFrame to compare the models
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

                    Model  Accuracy  F1 Score           Confusion Matrix
0     Logistic Regression  0.745498  0.602517  [[1166, 383], [154, 407]]
1           Decision Tree  0.713270  0.480687  [[1225, 324], [281, 280]]
2           Random Forest  0.763981  0.557726  [[1298, 251], [247, 314]]
3  Support Vector Machine  0.763507  0.592653  [[1248, 301], [198, 363]]
4       Gradient Boosting  0.767773  0.612342  [[1233, 316], [174, 387]]
5                 XGBoost  0.759716  0.551724  [[1291, 258], [249, 312]]
6  Balanced Random Forest  0.767773  0.567138  [[1299, 250], [240, 321]]
7           Easy Ensemble  0.737441  0.599711  [[1141, 408], [146, 415]]
8        Balanced Bagging  0.748815  0.524237  [[1288, 261], [269, 292]]


In [49]:
import tensorflow as tf
from tensorflow import keras


model = keras.Sequential([
    keras.layers.Dense(19, input_shape=(19,), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6553 - loss: 4.0261
Epoch 2/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7096 - loss: 0.8748
Epoch 3/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 922us/step - accuracy: 0.7296 - loss: 0.7445
Epoch 4/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7371 - loss: 0.9592
Epoch 5/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7518 - loss: 0.6510
Epoch 6/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7638 - loss: 0.5696
Epoch 7/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step - accuracy: 0.7684 - loss: 0.5276
Epoch 8/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7506 - loss: 0.6022
Epoch 9/100
[1m226/226[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x2b7b0d4cb10>

In [50]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Generate predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred_classes)
print(f"F1 Score: {f1:.2f}")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_classes))

[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7568 - loss: 0.5033  
Test Accuracy: 75.50%
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
F1 Score: 0.59
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82      1549
           1       0.53      0.68      0.59       561

    accuracy                           0.75      2110
   macro avg       0.70      0.73      0.71      2110
weighted avg       0.78      0.75      0.76      2110



In [51]:

# Hyperparameter tuning with Keras Tuner
def build_model(hp):
    model = keras.Sequential()
    hp_units1 = hp.Int('units1', min_value=10, max_value=50, step=10)
    hp_units2 = hp.Int('units2', min_value=10, max_value=50, step=10)
    model.add(layers.Dense(units=hp_units1, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(layers.Dense(units=hp_units2, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Tune the learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Initialize the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='tuning_example'
)

# Run the hyperparameter search
tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model
best_model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), verbose=1)

# Predict on the test set
y_pred = best_model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Evaluate the best model
train_results = best_model.evaluate(X_train, y_train)
val_results = best_model.evaluate(X_test, y_test)
print(f"Training Accuracy: {train_results[1]*100:.2f}%")
print(f"Validation Accuracy: {val_results[1]*100:.2f}%")



Reloading Tuner from my_dir\tuning_example\tuner0.json
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7346 - loss: 0.8956 - val_accuracy: 0.7701 - val_loss: 0.4603
Epoch 2/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7357 - loss: 0.5544 - val_accuracy: 0.7569 - val_loss: 0.5415
Epoch 3/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7605 - loss: 0.5051 - val_accuracy: 0.7313 - val_loss: 0.4988
Epoch 4/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7564 - loss: 0.5101 - val_accuracy: 0.7517 - val_loss: 0.5214
Epoch 5/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7690 - loss: 0.4717 - val_accuracy: 0.7479 - val_loss: 0.5485
Epoch 6/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7791 - loss: 0.4695 - val_accuracy: 0.7744 - val_loss: 0.4574
Epoch 7/20
[1m226/226[0m [32m━━━━━━━

In [52]:
# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred_classes)
cr = classification_report(y_test, y_pred_classes)

print("Confusion Matrix")
print(pd.DataFrame(cm, columns=['No Churn', 'Churn'], index=['No Churn', 'Churn']))

print("Classification Report")
print(cr)

Confusion Matrix
          No Churn  Churn
No Churn      1333    216
Churn          258    303
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1549
           1       0.58      0.54      0.56       561

    accuracy                           0.78      2110
   macro avg       0.71      0.70      0.71      2110
weighted avg       0.77      0.78      0.77      2110



In [None]:
import keras_tuner as kt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

def build_model(hp):
    model = keras.Sequential()
    
    # First hidden layer with tunable units, L2 regularization
    model.add(layers.Dense(
        units=hp.Int('units1', min_value=16, max_value=128, step=16),
        activation='relu',
        kernel_regularizer=regularizers.l2(hp.Choice('l2_reg1', values=[0.001, 0.01, 0.1]))
    ))
    
    # Dropout layer for regularization
    model.add(layers.Dropout(hp.Choice('dropout1', values=[0.1, 0.2, 0.3])))
    
    # Second hidden layer with tunable units, L2 regularization
    model.add(layers.Dense(
        units=hp.Int('units2', min_value=16, max_value=128, step=16),
        activation='relu',
        kernel_regularizer=regularizers.l2(hp.Choice('l2_reg2', values=[0.001, 0.01, 0.1]))
    ))
    
    model.add(layers.Dropout(hp.Choice('dropout2', values=[0.1, 0.2, 0.3])))
    
    # Output layer for binary classification
    model.add(layers.Dense(1, activation='sigmoid'))
    
    # Tune learning rate for Adam optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 5e-3, 1e-3, 5e-4, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Initialize the tuner with Bayesian Optimization
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Increased search space
    executions_per_trial=3,
    directory='my_dir',
    project_name='tuning_optimized'
)

tuner.search_space_summary()
