In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Specify the file path
file_path = r'C:\Users\AKSHITHA\Downloads\customer_churn_dataset-testing-master.csv\customer_churn_dataset-testing-master.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())


   CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  \
0           1   22  Female      25               14              4   
1           2   41  Female      28               28              7   
2           3   47    Male      27               10              2   
3           4   35    Male       9               12              5   
4           5   53  Female      58               24              9   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0             27             Basic         Monthly          598   
1             13          Standard         Monthly          584   
2             29           Premium          Annual          757   
3             17           Premium       Quarterly          232   
4              2          Standard          Annual          533   

   Last Interaction  Churn  
0                 9      1  
1                20      0  
2                21      0  
3                18      0  
4                18      0  


In [6]:
# Basic data exploration
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB
None


In [11]:
# Handle missing values (Example: Fill with mean for numerical features)
def preprocess_data(df):
    """Preprocess the data: handle missing values and encode categorical variables."""
    # Fill missing values with the mean
    df.fillna(df.mean(), inplace=True)
    
    # One-hot encoding for categorical variables
    df = pd.get_dummies(df, columns=['Gender', 'Subscription Type', 'Contract Length'], drop_first=True)
    return df

In [13]:
print(df.head())

   CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  \
0           1   22  Female      25               14              4   
1           2   41  Female      28               28              7   
2           3   47    Male      27               10              2   
3           4   35    Male       9               12              5   
4           5   53  Female      58               24              9   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0             27             Basic         Monthly          598   
1             13          Standard         Monthly          584   
2             29           Premium          Annual          757   
3             17           Premium       Quarterly          232   
4              2          Standard          Annual          533   

   Last Interaction  Churn  
0                 9      1  
1                20      0  
2                21      0  
3                18      0  
4                18      0  


In [20]:
print("Column Names:", df.columns)

Column Names: Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn'],
      dtype='object')


In [22]:
#stripping Whitespace from Column Names
df.columns = df.columns.str.strip()

In [26]:
#Checking for Missing Values
print("Missing values:\n", df.isnull().sum())

Missing values:
 CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64


In [27]:
#Filling Missing Values
df.fillna({
    'Total Spend': df['Total Spend'].mean(),
    'Support Calls': df['Support Calls'].mean(),
    'Payment Delay': df['Payment Delay'].mean(),
}, inplace=True)


In [29]:
print(df.head())

   CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  \
0           1   22  Female      25               14              4   
1           2   41  Female      28               28              7   
2           3   47    Male      27               10              2   
3           4   35    Male       9               12              5   
4           5   53  Female      58               24              9   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0             27             Basic         Monthly          598   
1             13          Standard         Monthly          584   
2             29           Premium          Annual          757   
3             17           Premium       Quarterly          232   
4              2          Standard          Annual          533   

   Last Interaction  Churn  
0                 9      1  
1                20      0  
2                21      0  
3                18      0  
4                18      0  


In [33]:
#Date Conversion
df['Last Interaction'] = pd.to_datetime(df['Last Interaction'], errors='coerce')

# Feature Engineering


In [37]:
df['Recency'] = (pd.to_datetime('now') - df['Last Interaction']).dt.days
print(df[['Last Interaction', 'Recency']].head())

               Last Interaction  Recency
0 1970-01-01 00:00:00.000000009    20023
1 1970-01-01 00:00:00.000000020    20023
2 1970-01-01 00:00:00.000000021    20023
3 1970-01-01 00:00:00.000000018    20023
4 1970-01-01 00:00:00.000000018    20023


In [38]:
df['AverageSpend'] = df['Total Spend'] / (df['Tenure'] + 1)  # Avoid division by zero

In [44]:
df['AverageSpend'].replace([np.inf, -np.inf], 0, inplace=True)
df['AverageSpend'].fillna(0, inplace=True)  # Fill NaN values with 0 if any

In [45]:
#Selecting Features and Target Variable
features = ['Age', 'Gender', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Recency', 'AverageSpend']
X = df[features]
y = df['Churn']  # Assuming Churn is the target variable


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(45061, 8) (19313, 8) (45061,) (19313,)


In [49]:
#Preprocessing Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Recency', 'AverageSpend']
categorical_features = ['Gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)


In [53]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
# Logistic Regression
#(citation - reference from analyticsvidhya,greeksfor greeks)
logistic_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Hyperparameter tuning for Logistic Regression
param_grid_logistic = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__penalty': ['l1', 'l2']
}

logistic_grid = GridSearchCV(logistic_model, param_grid_logistic, cv=5)
logistic_grid.fit(X_train, y_train)

# After fitting, you can access best parameters
print("Best parameters for Logistic Regression:", logistic_grid.best_params_)


Best parameters for Logistic Regression: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


In [54]:
y_pred_logistic = logistic_grid.predict(X_test)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_logistic))


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.84      0.83      0.83     10134
           1       0.81      0.83      0.82      9179

    accuracy                           0.83     19313
   macro avg       0.83      0.83      0.83     19313
weighted avg       0.83      0.83      0.83     19313



In [55]:
#Random Forest Classifier
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])


In [56]:
#Hyperparameter Tuning for Random Forest 
#(citation : Reference from- chatgpt, geeksforgeeks)
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

rf_grid = GridSearchCV(rf_model, param_grid_rf, cv=5)
rf_grid.fit(X_train, y_train)
print("Best parameters for Random Forest:", rf_grid.best_params_)


Best parameters for Random Forest: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}


In [57]:
#Predictions and Evaluation for Random Forest
    y_pred_rf = rf_grid.predict(X_test)
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))


Random Forest Results:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     10134
           1       0.94      0.99      0.96      9179

    accuracy                           0.97     19313
   macro avg       0.97      0.97      0.97     19313
weighted avg       0.97      0.97      0.97     19313



In [58]:
#Best Parameters Output
print("Best parameters for Logistic Regression:", logistic_grid.best_params_)
print("Best parameters for Random Forest:", rf_grid.best_params_)


Best parameters for Logistic Regression: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best parameters for Random Forest: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
