# Customer Churn Prediction

## Objective

The objective is to develop a machine learning model to predict customer churn based on historical customer data. You 
will follow a typical machine learning project pipeline, from data preprocessing to model deployment

### Data Preprocessing

In [4]:
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [5]:
df  = pd.read_csv("customer_churn_large_dataset.csv")
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [6]:
df.dtypes

CustomerID                      int64
Name                           object
Age                             int64
Gender                         object
Location                       object
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
dtype: object

In [7]:
df.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [8]:
df.isnull().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

It seems there are no missing data in the dataset

In [9]:
# Handling outliers
z_sub = stats.zscore(df['Subscription_Length_Months'])
df = df[(z_sub < 3) & (z_sub > -3)]

z_bill = stats.zscore(df['Monthly_Bill'])
df = df[(z_bill < 3) & (z_bill > -3)]

z_usage = stats.zscore(df['Total_Usage_GB'])
df = df[(z_usage < 3) & (z_usage > -3)]

z_churn = stats.zscore(df['Churn'])
df = df[(z_churn < 3) & (z_churn > -3)]


In [10]:
df = pd.get_dummies(df, columns=['Gender', 'Location'], drop_first=True)

In [11]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Engineering

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV


In [14]:
data = pd.read_csv("customer_churn_large_dataset.csv")

In [15]:
# Handle missing data (if needed)
data.dropna(inplace=True)

# Encode categorical variables (One-Hot Encoding)
encoder = OneHotEncoder(sparse=False, drop='first')
categorical_cols = ['Gender', 'Location']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data_encoded['CustomerID'] = data_encoded['CustomerID'].astype('int64')
data_encoded = data_encoded.drop(['CustomerID', 'Name'], axis=1)


In [16]:
data.dtypes

CustomerID                      int64
Name                           object
Age                             int64
Gender                         object
Location                       object
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
dtype: object

In [17]:
data_encoded.dtypes

Age                             int64
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
Gender_Male                      bool
Location_Houston                 bool
Location_Los Angeles             bool
Location_Miami                   bool
Location_New York                bool
dtype: object

In [18]:
X = data_encoded.drop("Churn", axis=1)
y = data_encoded["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Model Building
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [21]:
rf_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}


In [None]:
rs_model = RandomizedSearchCV(RandomForestClassifier(),
                              param_distributions=rf_grid,
                              n_iter=100,
                              cv=10,
                              verbose=True)
rs_model.fit(X_train_scaled, y_train)

In [None]:
best_params = rs_model.best_params_
best_model = rs_model.best_estimator_

In [None]:
y_pred = best_model.predict(X_test_scaled)


In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:

# Display performance metrics
print("Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Model Evaluation Metrics:
Accuracy: 0.49555
Precision: 0.4912023460410557
Recall: 0.4727346033665961
F1 Score: 0.4817915660794083


## Neural Network Approach

In [23]:
import tensorflow as tf
from tensorflow import keras

In [24]:
# Define the model architecture
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_dim=X_train_scaled.shape[1]),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])


In [25]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [26]:
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f6ad40fce80>

In [27]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print("Test accuracy:", accuracy)

Test accuracy: 0.5055000185966492
