In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier


In [2]:
# Location of the churn workbook and the worksheet holding the records
path = r'Data & Resources/Data/Prod_Data.xlsx'
sheet_name = 'churndata'

# Load that worksheet into a DataFrame
data = pd.read_excel(io=path, sheet_name=sheet_name)

In [3]:
# Preview the first five rows of the loaded sheet
data.head(n=5)

Unnamed: 0,Customer_ID,Gender,Age,Married,State,Number_of_Referrals,Tenure_in_Months,Value_Deal,Phone_Service,Multiple_Lines,...,Payment_Method,Monthly_Charge,Total_Charges,Total_Refunds,Total_Extra_Data_Charges,Total_Long_Distance_Charges,Total_Revenue,Customer_Status,Churn_Category,Churn_Reason
0,19877-DEL,Male,35,No,Delhi,7,27,,Yes,No,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,Others,Others
1,58353-MAH,Female,45,Yes,Maharashtra,14,13,,Yes,Yes,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,Others,Others
2,25063-WES,Male,51,No,West Bengal,4,35,Deal 5,Yes,No,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,59787-KAR,Male,79,No,Karnataka,3,21,Deal 4,Yes,No,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,28544-TAM,Female,80,No,Tamil Nadu,3,8,,Yes,No,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [4]:
# List every column label of the loaded sheet (used to decide which columns
# to drop and which to encode in the preprocessing step below)
data.columns

Index(['Customer_ID', 'Gender', 'Age', 'Married', 'State',
       'Number_of_Referrals', 'Tenure_in_Months', 'Value_Deal',
       'Phone_Service', 'Multiple_Lines', 'Internet_Service', 'Internet_Type',
       'Online_Security', 'Online_Backup', 'Device_Protection_Plan',
       'Premium_Support', 'Streaming_TV', 'Streaming_Movies',
       'Streaming_Music', 'Unlimited_Data', 'Contract', 'Paperless_Billing',
       'Payment_Method', 'Monthly_Charge', 'Total_Charges', 'Total_Refunds',
       'Total_Extra_Data_Charges', 'Total_Long_Distance_Charges',
       'Total_Revenue', 'Customer_Status', 'Churn_Category', 'Churn_Reason'],
      dtype='object')

In [5]:
# Data Processing

# Drop unnecessary columns for prediction: the customer identifier and the
# two churn-description columns are not used as model inputs.
data = data.drop(['Customer_ID', 'Churn_Category','Churn_Reason'], axis=1)

# List of columns to be label encoded
columns_to_encode = [
    'Gender', 'Married', 'State', 'Value_Deal', 'Phone_Service', 'Multiple_Lines', 
    'Internet_Service', 'Internet_Type', 'Online_Security', 'Online_Backup', 
    'Device_Protection_Plan','Premium_Support', 'Streaming_TV', 'Streaming_Movies',
    'Streaming_Music', 'Unlimited_Data', 'Contract', 'Paperless_Billing', 'Payment_Method'
]

# Encode categorical variables.
# A separate encoder is fitted for each column and kept in `label_encoders`
# (column name -> fitted LabelEncoder). Re-fitting one shared encoder would
# overwrite the learned category -> integer mapping on every iteration, so it
# could not be reused on new data or inverted later.
label_encoders = {}
for col in columns_to_encode:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# Manually encode the target variable (0 = Stayed, 1 = Churned).
# Series.map turns every value missing from the dict into NaN; fail here with
# a clear message instead of handing a NaN target to the model later.
status_codes = {'Stayed': 0, 'Churned': 1}
encoded_status = data['Customer_Status'].map(status_codes)
unmapped_rows = encoded_status.isna()
if unmapped_rows.any():
    unexpected_values = data.loc[unmapped_rows, 'Customer_Status'].unique().tolist()
    raise ValueError(
        f"Customer_Status contains values that cannot be encoded: {unexpected_values}"
    )
data['Customer_Status'] = encoded_status

In [6]:
# Separate the label column from the predictor columns
y = data['Customer_Status']
X = data.drop(columns='Customer_Status')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Random forest with 100 trees; the fixed seed makes the fit repeatable
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training split
rf_model.fit(X=X_train, y=y_train)

In [8]:
# Model Evaluation: predict the held-out split, then compute both summaries
y_pred = rf_model.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Confusion Matrix (header line, then the matrix)
print('Confusion Matrix:', confusion, sep='\n')

# Classification Report (blank line, header line, then the report text)
print('\nClassification Report:', report, sep='\n')

Confusion Matrix:
[[1190   73]
 [ 206  334]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.90      1263
           1       0.82      0.62      0.71       540

    accuracy                           0.85      1803
   macro avg       0.84      0.78      0.80      1803
weighted avg       0.84      0.85      0.84      1803



In [9]:
# Feature Importance: pair each predictor name with the fitted model's
# importance value and list them from highest to lowest
feature_names = X.columns
feature_importances = rf_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df)


                        Feature  Importance
27                Total_Revenue    0.147689
19                     Contract    0.131122
23                Total_Charges    0.124078
22               Monthly_Charge    0.091859
26  Total_Long_Distance_Charges    0.077840
1                           Age    0.064720
5              Tenure_in_Months    0.045550
3                         State    0.040167
4           Number_of_Referrals    0.037536
10                Internet_Type    0.032790
6                    Value_Deal    0.023935
21               Payment_Method    0.018914
11              Online_Security    0.016705
14              Premium_Support    0.016483
9              Internet_Service    0.013871
25     Total_Extra_Data_Charges    0.011523
20            Paperless_Billing    0.011205
2                       Married    0.009819
17              Streaming_Music    0.009665
0                        Gender    0.009568
16             Streaming_Movies    0.009547
8                Multiple_Lines 

In [13]:
# Predict on New Data

# Source of the records to score. Dedicated names are used so that `path` and
# `sheet_name` keep referring to the sheet the model was trained on; that
# sheet is re-read below to rebuild the training-time encodings.
# NOTE(review): this points at the same workbook/sheet the model was trained
# on, so the scored rows include rows the model has already seen — confirm
# whether a separate sheet of new customers was intended.
new_data_path = r'Data & Resources/Data/Prod_Data.xlsx'
new_data_sheet = 'churndata'

# Read the data
new_data = pd.read_excel(new_data_path, sheet_name=new_data_sheet)

# Retain original data
original_data = new_data.copy()

# Retain customer_id
customer_ids = new_data['Customer_ID']

# Keep exactly the columns the model was fitted on, in the same order.
# Selecting by X.columns (instead of dropping label columns) also works when
# the new data has no Customer_Status / Churn_* columns, and raises a KeyError
# if a training feature is missing.
new_data = new_data[list(X.columns)].copy()

# Encode categorical variables with the SAME category -> integer mapping the
# model was trained with. Fitting fresh encoders on the data being scored
# would assign different integers whenever its set of categories differs from
# the training set, silently corrupting the features. The mapping is rebuilt
# from the training sheet and only `transform` is applied to the new data, so
# a category never seen in training raises a ValueError instead.
training_categories = pd.read_excel(path, sheet_name=sheet_name)[columns_to_encode]
for col in columns_to_encode:
    encoder = LabelEncoder().fit(training_categories[col])
    new_data[col] = encoder.transform(new_data[col])

# Make predictions
new_predictions = rf_model.predict(new_data)

# Add predictions to original data
original_data['Customer_Status_Predicted'] = new_predictions

# Keep only the customers predicted to churn (target code 1 == 'Churned')
original_data = original_data[original_data['Customer_Status_Predicted'] == 1]

# Export the predicted churners without the DataFrame index
original_data.to_csv(r'Data & Resources/Data/predictions.csv', index=False)