The Dataset contains the following 12 features:

CustomerID: A unique identifier

Age: The age of the customer

Gender: The gender of the customer

Tenure: The number of months the customer has stayed with the company

Usage Frequency: The number of times the customer has used the service in the past month

Support calls: The number of support calls the customer has made in the past month

Payment Delay: The number of days the customer has delayed payment in the past month

Subscription Type: The type of subscription the customer has

Contract Length: Duration of the contract

Total Spend: The total amount the customer has spent

Last Interaction: Number of days since the last interaction the customer has had with the company

Churn: Whether the customer has churned or not

# Import packages

In [29]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

# Cleanup

In [None]:
# Load the raw testing and training exports from disk.
test_set_dirty = pd.read_csv(
    "Datasets/In/customer_churn_dataset-testing-master.csv",
    sep=",",
)
training_set_dirty = pd.read_csv(
    "Datasets/In/customer_churn_dataset-training-master.csv",
    sep=",",
)

# Stack training rows on top of testing rows under a fresh 0..n-1 index.
combined_set_dirty = pd.concat(
    [training_set_dirty, test_set_dirty],
    ignore_index=True,
)

# Drop the left-most column.
# NOTE(review): presumably this is the CustomerID identifier listed first in
# the feature description -- confirm against the CSV header.
leading_column = combined_set_dirty.columns[0]
combined_set_dirty = combined_set_dirty.drop(columns=leading_column)


In [None]:
# Number of missing entries in each column (shown as the cell output).
missing_values = combined_set_dirty.isna().sum(axis=0)
missing_values

In [None]:
# Display every row that contains at least one missing value.
combined_set_dirty.loc[combined_set_dirty.isna().any(axis=1)]

In [None]:
# Discard, in place, every row that has at least one missing value.
combined_set_dirty.dropna(axis=0, how="any", inplace=True)

# Normalise the column labels: spaces become underscores, letters become lower case.
combined_set_dirty.columns = [
    label.replace(" ", "_").lower() for label in combined_set_dirty.columns
]

# Print the resulting schema (dtypes and non-null counts).
combined_set_dirty.info()

In [None]:
# Re-check for rows with missing values; shows whatever rows still contain a NaN.
combined_set_dirty[combined_set_dirty.isnull().any(axis="columns")]

In [None]:
# Columns that are converted to integer dtype.
numerals = [
    "age",
    "tenure",
    "usage_frequency",
    "support_calls",
    "payment_delay",
    "last_interaction",
    "churn",
]

# Cast all listed columns in one call; the remaining columns keep their dtype.
combined_set_dirty = combined_set_dirty.astype(dict.fromkeys(numerals, int))
    

In [None]:
# # Create a label encoder object
# le = LabelEncoder()
# 
# # List of columns you want to convert
# columns_to_convert = ['gender', 'subscription_type', 'contract_length']
# 
# # Apply the label encoder to each column and print the mapping
# for column in columns_to_convert:
#     combined_set_dirty[column] = le.fit_transform(combined_set_dirty[column])
#     print(f"Mapping for {column}:")
#     for class_, label in zip(le.classes_, range(len(le.classes_))):
#         print(f"{class_} -> {label}")
#     print("\n")

In [None]:
# Take an independent (deep) copy so later steps do not modify combined_set_dirty.
cleaned_set = combined_set_dirty.copy(deep=True)

# Descriptive Analytics

In [None]:
# Summary statistics, reported separately for rows with churn == 1 and churn == 0
print("Summary Statistics for Churned Customers:")
print(cleaned_set[cleaned_set['churn'] == 1].describe())
print("\nSummary Statistics for Non-Churned Customers:")
print(cleaned_set[cleaned_set['churn'] == 0].describe())

# Distribution of categorical variables: one count plot per column, bars split by churn
categorical_columns = ['gender', 'subscription_type', 'contract_length']
for column in categorical_columns:
    plt.figure(figsize=(10,5))
    sns.countplot(data=cleaned_set, x=column, hue='churn')
    plt.title(f'Distribution of {column} for Churned and Non-Churned Customers')
    plt.show()

# Correlation analysis
# Only numeric columns are correlated. The categorical columns above are still
# present in cleaned_set (presumably as strings), and since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently -- it raises
# instead. Selecting the numeric columns explicitly works on every pandas version.
correlation = cleaned_set.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Churn rate: the mean of the 0/1 churn column expressed as a percentage
churn_rate = cleaned_set['churn'].mean() * 100
print(f"Churn Rate: {churn_rate}%")

In [None]:
# Numeric columns that get discretised into equal-width bins.
numeric_cols = [
    "age",
    "tenure",
    "usage_frequency",
    "support_calls",
    "payment_delay",
    "last_interaction",
    "total_spend",
]

# Number of equal-width bins requested from pd.cut for every column.
num_bins = 3

# Work on a copy so cleaned_set keeps the original numeric values.
excourse_set = cleaned_set.copy()

for col in numeric_cols:
    # Replace the numeric column by its interval category.
    excourse_set[col] = pd.cut(cleaned_set[col], bins=num_bins, duplicates='drop')

    # Print the column name followed by each interval that was created.
    print(col)
    bin_edges = excourse_set[col].cat.categories
    for interval in bin_edges:
        print(interval)

In [None]:
# Features for which the churn split is computed.
columns = [
    "age",
    "gender",
    "tenure",
    "usage_frequency",
    "support_calls",
    "payment_delay",
    "subscription_type",
    "contract_length",
    "total_spend",
    "last_interaction",
]

# Maps feature name -> table of churn percentages (rows: categories, columns: churn value).
stacked_data_percent = {}

for col in columns:
    # Rows per (category, churn) pair, pivoted so the churn values become columns.
    category_counts = (
        excourse_set
        .groupby([col, "churn"])
        .size()
        .unstack(fill_value=0)
    )

    # Normalise each row so that its entries sum to 100.
    row_totals = category_counts.sum(axis=1)
    category_percent = category_counts.div(row_totals, axis=0) * 100

    print(category_percent)
    stacked_data_percent[col] = category_percent
    

In [None]:
# Overall churn rate in percent, used as a reference bar in every chart.
overall_churn_rate = excourse_set['churn'].mean() * 100

# Append an 'Overall' row (no-churn share, churn share) to every percentage table.
for col, df in stacked_data_percent.items():
    df.loc['Overall'] = [100 - overall_churn_rate, overall_churn_rate]

# Bar colour for each churn value.
colors = {0: 'green', 1: 'red'}

for col, df in stacked_data_percent.items():
    bar_colors = [colors[churn] for churn in df.columns]
    ax = df.plot(
        kind='barh',
        stacked=True,
        color=bar_colors,
        title=f'Percentage Chart of Churned Customers in {col}',
    )
    ax.set_ylabel(col)
    ax.set_xlabel('Percentage')
    ax.legend(["No Churn", "Churn"], loc='best')

    # Write the percentage in the middle of every bar segment.
    for p in ax.patches:
        x, y = p.get_xy()
        width = p.get_width()
        height = p.get_height()
        ax.text(
            x + width / 2,
            y + height / 2,
            f'{width:.1f} %',
            ha='center',
            va='center',
        )
    plt.show()

In [None]:
# All unordered pairs of features.
combinations = list(itertools.combinations(columns, 2))

# For each pair, print the churn split (in percent) of every value combination.
for combination in combinations:
    group_keys = [*combination, 'churn']

    # Index by the pair plus churn, count the rows per group, then pivot churn into columns.
    cross_tab = (
        excourse_set
        .set_index(group_keys)
        .groupby(group_keys)
        .size()
        .unstack(fill_value=0)
    )

    # Turn absolute counts into row-wise percentages.
    row_totals = cross_tab.sum(axis=1)
    cross_tab_percent = cross_tab.div(row_totals, axis=0) * 100

    print(f"Cross-tabulation for {combination}:")
    print(cross_tab_percent)
    print("\n")

In [None]:
# All unordered triples of features.
combinations = list(itertools.combinations(columns, 3))

# For each triple, print the churn split (in percent) of every value combination.
for combination in combinations:
    index_levels = list(combination)
    index_levels.append('churn')

    # Move the three features and churn into the index and count rows per group.
    indexed_rows = excourse_set.set_index(index_levels)
    group_sizes = indexed_rows.groupby(index_levels).size()

    # Pivot the churn level into columns; groups without rows count as 0.
    cross_tab = group_sizes.unstack(fill_value=0)

    # Express each row as percentages of its own total.
    cross_tab_percent = cross_tab.div(cross_tab.sum(axis=1), axis=0) * 100

    print(f"Cross-tabulation for {combination}:")
    print(cross_tab_percent)
    print("\n")

# Predictive Analytics
## Primitive Approach

In [None]:
prepared_set = cleaned_set.copy()

# Columns that are replaced by one indicator column per category.
categorical_cols = ['gender', 'subscription_type', 'contract_length']

# Dense output so the result can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
encoded_values = encoder.fit_transform(prepared_set[categorical_cols])

# Label the indicator columns with the names generated by the encoder.
prepared_set_encoded = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Remove the original categorical columns and renumber the rows from 0 so both
# frames line up row by row (the encoded frame already has a fresh 0..n-1 index).
prepared_set = prepared_set.drop(columns=categorical_cols).reset_index(drop=True)

# Place the indicator columns to the right of the remaining columns.
prepared_set = pd.concat([prepared_set, prepared_set_encoded], axis=1)

prepared_set.info()

In [None]:
primitive_set = prepared_set.copy()

# Target is the churn flag; every other column is a feature.
y = primitive_set['churn']
X = primitive_set.drop(columns='churn')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline: logistic regression with default settings.
model = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for the test split.
y_pred = model.predict(X_test)


In [None]:
# Evaluate the baseline model on the test split.

# Probability of the positive class (second column of predict_proba), used for ROC AUC.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Error-style metrics on the hard predictions.
primitive_mse = mean_squared_error(y_test, y_pred)
primitive_mae = mean_absolute_error(y_test, y_pred)
primitive_r2 = r2_score(y_test, y_pred)

# Classification metrics on the hard predictions.
primitive_accuracy = accuracy_score(y_test, y_pred)
primitive_precision = precision_score(y_test, y_pred)
primitive_recall = recall_score(y_test, y_pred)
primitive_f1 = f1_score(y_test, y_pred)

# Ranking metric on the predicted probabilities.
primitive_roc_auc = roc_auc_score(y_test, y_pred_proba)

print(
    f"MSE: {primitive_mse}\n"
    f"MAE: {primitive_mae}\n"
    f"R2: {primitive_r2}\n"
    f"Accuracy: {primitive_accuracy}\n"
    f"Precision: {primitive_precision}\n"
    f"Recall: {primitive_recall}\n"
    f"F1 Score: {primitive_f1}\n"
    f"ROC AUC: {primitive_roc_auc}"
)

# My Model

### Define Target and Feature Variables & Split and Scale Set
Split the data into training, validation, and test sets, then standardise the features.

In [None]:
better_model = prepared_set.copy()

# Target as a NumPy array; features are all remaining columns.
y = better_model['churn'].values
X = better_model.drop(columns=['churn'])

# Keep the column names, since scaling below turns the frames into plain arrays.
feature_names = list(X.columns)

# First split: 30% of the rows become the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X[feature_names], y, test_size=0.3, random_state=42
)
# Second split: 28.5% of the remaining 70% (about 20% of all rows) become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.285, random_state=42
)

# Fit the scaler on the training rows only, then apply it to every split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

### Train and Evaluate Models
Perform hyperparameter tuning for the LightGBM model using grid search. Print the best hyperparameters and the corresponding cross-validated R-squared score (computed on held-out folds of the training set).

In [None]:
# Candidate hyperparameter values; the grid search tries every combination.
param_grid = {
    'max_depth': [10, 20, 30],
    'learning_rate': [0.01, 0.1, 1],
    'num_leaves': [31, 62, 93],
    'n_estimators': [100, 200, 300],
}

# LightGBM regressor with default settings as the base estimator.
lgbm_model = lgb.LGBMRegressor()

# Exhaustive search with 5-fold cross-validation, scored by R-squared.
lgbm_grid = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)
lgbm_grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(lgbm_grid.best_params_)
print(lgbm_grid.best_score_)

Perform hyperparameter tuning for the Decision Tree model using grid search. Print the best hyperparameters and the corresponding cross-validated R-squared score (computed on held-out folds of the training set).

In [None]:
# Candidate hyperparameter values; the grid search tries every combination.
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2', None],
}

# Decision tree regressor with default settings as the base estimator.
decTree_model = DecisionTreeRegressor()

# Exhaustive search with 5-fold cross-validation, scored by R-squared.
tree_grid = GridSearchCV(
    estimator=decTree_model,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
)
tree_grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(tree_grid.best_params_)
print(tree_grid.best_score_)

In [28]:
# Candidate hyperparameter values for XGBoost; the grid search tries every
# combination (3**5 = 243 candidates, each fitted on 5 folds).
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 6, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.4, 0.7, 1.0]
}
# The "check N" prints are progress markers showing how far this long-running cell got.
print("check 1")
# Create model
xgb_model = xgb.XGBRegressor()
print("check 2")

# Create grid search object: 5-fold cross-validation, scored by R-squared
xgb_grid = GridSearchCV(xgb_model, param_grid, cv=5, scoring='r2')
print("check 3")

# Fit the xgb_grid search object to the data
xgb_grid.fit(X_train, y_train)
print("check 4")
# Print the best parameters and the corresponding mean cross-validated score
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)
print("check 5")

# GridSearchCV itself has no feature_importances_ attribute; the importances
# belong to the refitted best model, which is exposed as best_estimator_.
importance_scores = xgb_grid.best_estimator_.feature_importances_
print("check 6")

# Pair every feature name with its importance score
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_scores
})
print("check 7")

# Most important features first
importance_df = importance_df.sort_values('Importance', ascending=False)
print("check 8")

# Bar chart of the importance scores
importance_df.plot(kind='bar', x='Feature', y='Importance', title='Feature Importance', figsize=(15, 6))
plt.ylabel('Importance Score')
plt.show()

KeyboardInterrupt: 

In [None]:
# Candidate hyperparameter values that the randomised search samples from.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 6, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.4, 0.7, 1.0],
}

# The "check N" prints are progress markers for this long-running cell.
print('Check 1')

# XGBoost regressor with default settings as the base estimator.
xgb_model = xgb.XGBRegressor()
print('check 2')

# Evaluate 50 sampled combinations with 5-fold cross-validation, scored by
# R-squared; the seed makes the sampled combinations repeatable.
xgb_random = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
)
print('check 3')

xgb_random.fit(X_train, y_train)
print('check 4')

# Report the winning combination and its mean cross-validated score.
print(xgb_random.best_params_)
print(xgb_random.best_score_)
print("check 5")

# The refitted best model carries the feature importances.
xgb_random_best_model = xgb_random.best_estimator_
importance_scores = xgb_random_best_model.feature_importances_
print("check 6")

# Pair every feature name with its importance score.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importance_scores}
)
print("check 7")

# Most important features first.
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("check 8")

# Bar chart of the importance scores.
importance_axes = importance_df.plot.bar(
    x='Feature',
    y='Importance',
    title='Feature Importance',
    figsize=(15, 6),
)
importance_axes.set_ylabel('Importance Score')
plt.show()

Check 1
check 2
check 3


In [None]:
# Metrics computed for every tuned model, in this order: MSE, MAE, R2.
regression_metrics = (mean_squared_error, mean_absolute_error, r2_score)

# Predictions of each tuned model on the validation split.
y_pred_tree = tree_grid.best_estimator_.predict(X_val)
y_pred_lgbm = lgbm_grid.best_estimator_.predict(X_val)
y_pred_xgb = xgb_grid.best_estimator_.predict(X_val)

# Score each model's predictions against the validation targets.
mse_tree, mae_tree, r2_tree = (
    metric(y_val, y_pred_tree) for metric in regression_metrics
)
mse_lgbm, mae_lgbm, r2_lgbm = (
    metric(y_val, y_pred_lgbm) for metric in regression_metrics
)
mse_xgb, mae_xgb, r2_xgb = (
    metric(y_val, y_pred_xgb) for metric in regression_metrics
)

# Parallel lists: position i of every *_values list belongs to models[i].
models = ['Decision Tree', 'LightGBM', 'XGBoost']
mse_values = [mse_tree, mse_lgbm, mse_xgb]
mae_values = [mae_tree, mae_lgbm, mae_xgb]
r2_values = [r2_tree, r2_lgbm, r2_xgb]

# Function to print the values
def print_values(values, models, title):
    """Print ``title`` followed by one ``"<model>: <value>"`` line per pair.

    ``models`` and ``values`` are paired positionally; if their lengths
    differ, the surplus entries of the longer one are ignored.
    """
    report_lines = [title]
    report_lines.extend(f"{name}: {score}" for name, score in zip(models, values))
    print("\n".join(report_lines))

# Function to plot the values
def plot_values(values, title, models, ylabel):
    """Show a bar chart with one bar per model.

    ``models`` supplies the x-axis categories and ``values`` the bar heights;
    ``title`` and ``ylabel`` label the chart and its y-axis.
    """
    plt.figure(figsize=(15, 6))
    # barplot draws on the figure created above and returns its Axes.
    chart_axes = sns.barplot(x=models, y=values)
    chart_axes.set_title(title)
    chart_axes.set_ylabel(ylabel)
    plt.show()

# Print all three metric tables first (MSE, R2, MAE) ...
for metric_values, heading in (
    (mse_values, 'Comparison of Mean Squared Error'),
    (r2_values, 'Comparison of R2 Score'),
    (mae_values, 'Comparison of Mean Absolute Error'),
):
    print_values(metric_values, models, heading)

# ... then draw one bar chart per metric (MSE, MAE, R2).
for metric_values, heading, axis_label in (
    (mse_values, 'Comparison of Mean Squared Error', 'MSE'),
    (mae_values, 'Comparison of Mean Absolute Error', 'MAE'),
    (r2_values, 'Comparison of R2 Score', 'R2 Score'),
):
    plot_values(metric_values, heading, models, axis_label)