In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier
from imblearn.over_sampling import SMOTE


In [24]:
# Load the dataset
data = pd.read_csv("D:\MSIS\Customer-Churn-Prediction---Using-TensorFlow\Data\WA_Fn-UseC_-Telco-Customer-Churn.csv")


In [25]:
# Data preprocessing
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.drop(labels=data[data['tenure'] == 0].index, axis=0, inplace=True)
data.fillna(data["TotalCharges"].mean(), inplace=True)
data = data.drop(['customerID'], axis=1)


In [26]:

# Transforming object to int using Label Encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
def object_to_int(dataframe_series):
    if dataframe_series.dtype == 'object':
        dataframe_series = encoder.fit_transform(dataframe_series)
    return dataframe_series

data = data.apply(lambda x: object_to_int(x))


In [27]:
# Data splitting
X = data.drop('Churn', axis=1)
y = data['Churn'].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [28]:
# Handle imbalance using SMOTE (oversampling the minority class)
'''smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)'''


'smote = SMOTE(random_state=42)\nX_train, y_train = smote.fit_resample(X_train, y_train)'

In [29]:
'''from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)'''

'from imblearn.under_sampling import RandomUnderSampler\nrus = RandomUnderSampler(random_state=42)\nX_train, y_train = rus.fit_resample(X_train, y_train)'

In [30]:

# Define the models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Support Vector Machine': SVC(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), random_state=42),
    'Balanced Random Forest': BalancedRandomForestClassifier(random_state=42),
    'Easy Ensemble': EasyEnsembleClassifier(random_state=42),
    'Balanced Bagging': BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)
}

# Initialize lists to store results
results = []

# Evaluate each model
for model_name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Confusion Matrix': cm
    })

# Create a DataFrame to compare the models
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

                    Model  Accuracy  F1 Score           Confusion Matrix
0     Logistic Regression  0.735545  0.614108  [[1108, 441], [117, 444]]
1           Decision Tree  0.739336  0.514134  [[1269, 280], [270, 291]]
2           Random Forest  0.781043  0.529532  [[1388, 161], [301, 260]]
3  Support Vector Machine  0.727014  0.601108  [[1100, 449], [127, 434]]
4       Gradient Boosting  0.794787  0.566567  [[1394, 155], [278, 283]]
5                 XGBoost  0.759242  0.589661  [[1237, 312], [196, 365]]
6  Balanced Random Forest  0.755450  0.600000  [[1207, 342], [174, 387]]
7           Easy Ensemble  0.731280  0.615071  [[1090, 459], [108, 453]]
8        Balanced Bagging  0.757346  0.576860  [[1249, 300], [212, 349]]


In [None]:
import tensorflow as tf
from tensorflow import keras


model = keras.Sequential([
    keras.layers.Dense(19, input_shape=(19,), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100)

In [31]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score
import keras_tuner
from keras_tuner.tuners import RandomSearch
import matplotlib.pyplot as plt

# Page Configuration
st.set_page_config(
    page_title="Customer Churn Prediction",
    page_icon="📊",
    layout="wide",
)

# Load and preprocess data
@st.cache_data
def load_data():
    data_path = "Data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
    data = pd.read_csv(data_path)
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
    data.dropna(subset=['TotalCharges'], inplace=True)
    data["SeniorCitizen"] = data["SeniorCitizen"].map({0: "No", 1: "Yes"})
    data = data.drop(['customerID'], axis=1)

    # Encode categorical variables
    categorical_cols = data.select_dtypes(include=['object']).columns
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    return data

data = load_data()

# Split data
X = data.drop('Churn_Yes', axis=1)
y = data['Churn_Yes']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning with Keras Tuner
def build_model(hp):
    model = keras.Sequential()
    hp_units1 = hp.Int('units1', min_value=10, max_value=50, step=10)
    hp_units2 = hp.Int('units2', min_value=10, max_value=50, step=10)
    model.add(layers.Dense(units=hp_units1, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(layers.Dense(units=hp_units2, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Tune the learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Initialize the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='tuning_example'
)

# Run the hyperparameter search
tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model
best_model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), verbose=1)

# Streamlit app
st.title("Customer Churn Prediction")

# User input form
st.sidebar.header("Input Customer Details")
def user_input_features():
    features = {}
    for col in X.columns:
        if data[col].max() <= 1:  # Binary or normalized column
            features[col] = st.sidebar.slider(col, 0.0, 1.0, 0.5)
        else:
            features[col] = st.sidebar.number_input(col, float(data[col].min()), float(data[col].max()), float(data[col].mean()))
    return pd.DataFrame(features, index=[0])

input_df = user_input_features()

# Preprocess user input
input_scaled = scaler.transform(input_df)

# Make prediction
prediction_proba = best_model.predict(input_scaled)[0][0]
prediction = "Yes" if prediction_proba > 0.5 else "No"

# Display prediction
st.subheader("Prediction")
st.write(f"The customer will churn: **{prediction}**")

st.subheader("Prediction Probability")
st.write(f"Probability of churn: **{prediction_proba:.2f}**")

# Developer details on the right
st.markdown("""
    <style>
    .developer-info {
        position: fixed;
        top: 10px;
        right: 10px;
        font-size: 18px;
        font-weight: bold;
        color: #333;
    }
    </style>
    <div class="developer-info">
        **Developer:** Akshat Maurya<br>
        [GitHub](https://github.com/akshatm13) | [LinkedIn](https://www.linkedin.com/in/makshat13)
    </div>
""", unsafe_allow_html=True)

# Display model performance
st.subheader("Model Performance on Test Set")

# Predict on the test set
y_pred = best_model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Evaluate the best model
train_results = best_model.evaluate(X_train, y_train)
val_results = best_model.evaluate(X_test, y_test)
st.write(f"Training Accuracy: {train_results[1]*100:.2f}%")
st.write(f"Validation Accuracy: {val_results[1]*100:.2f}%")


Trial 5 Complete [00h 00m 24s]
val_accuracy: 0.7842021981875101

Best val_accuracy So Far: 0.790837307771047
Total elapsed time: 00h 01m 42s


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7913 - loss: 0.4363
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7927 - loss: 0.4581
Model performance for Training set
- Accuracy: 0.7992686033248901
- Loss: 0.4305054247379303
----------------------------------
Model performance for Validation set
- Accuracy: 0.7962085604667664
- Loss: 0.4447570741176605
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Confusion Matrix:
 [[1364  185]
 [ 245  316]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      1549
           1       0.63      0.56      0.60       561

    accuracy                           0.80      2110
   macro avg       0.74      0.72      0.73      2110
weighted avg       0.79      0.80      0.79      2110

