In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report



In [3]:
# Load Training & Testing Datasets
df_train = pd.read_csv(r"C:\Users\cex\Desktop\Data sets\customerchurn-training.csv")
df_test = pd.read_csv(r"C:\Users\cex\Desktop\Data sets\customerchurn-testing.csv")

# Encode Categorical Variables
label_encoders = {}
for col in df_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le  # Store for potential deployment use

# Define Features & Target
X_train = df_train.drop(columns=['Churn'])  # Features
y_train = df_train['Churn']  # Target

X_test = df_test.drop(columns=['Churn'])  # Features for final evaluation
y_test = df_test['Churn']  # Target for final evaluation

# Standardize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Debugging: Check if data is properly loaded
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("First 5 rows of X_train:\n", X_train.head())



Shape of X_train: (440833, 11)
Shape of X_test: (64374, 11)
First 5 rows of X_train:
    CustomerID   Age  Gender  Tenure  Usage Frequency  Support Calls  \
0         2.0  30.0       0    39.0             14.0            5.0   
1         3.0  65.0       0    49.0              1.0           10.0   
2         4.0  55.0       0    14.0              4.0            6.0   
3         5.0  58.0       1    38.0             21.0            7.0   
4         6.0  23.0       1    32.0             20.0            5.0   

   Payment Delay  Subscription Type  Contract Length  Total Spend  \
0           18.0                  2                0        932.0   
1            8.0                  0                1        557.0   
2           18.0                  0                2        185.0   
3            7.0                  2                1        396.0   
4            8.0                  0                1        617.0   

   Last Interaction  
0              17.0  
1               6.0  
2     

In [6]:
# Check for NaN values in target variable
print("Missing values in y_train:", y_train.isnull().sum())

# Drop rows where target variable is NaN
df_train = df_train.dropna(subset=['Churn'])

# Define Features & Target again after dropping NaNs
X_train = df_train.drop(columns=['Churn'])  
y_train = df_train['Churn']

# Standardize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)  # Should work now
y_pred_rf = rf_model.predict(X_test_scaled)

# Print accuracy to confirm
print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))



Missing values in y_train: 1

Random Forest Accuracy: 0.49467176189144685


In [7]:
# Check for NaN values in target variable
print("Missing values in y_train:", y_train.isnull().sum())
print("Missing values in y_test:", y_test.isnull().sum())

# Drop NaN values in y_train
df_train = df_train.dropna(subset=['Churn'])  

# Reassign X_train and y_train after cleaning
X_train = df_train.drop(columns=['Churn'])  
y_train = df_train['Churn']

# Standardize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train to an array
y_train = y_train.values  # Ensures proper format for training

# Define Hyperparameter Grid
param_grid = {
    'n_estimators': [50, 100],  
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.2]
}

# Initialize and Train GridSearchCV
grid_search = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,  
    n_jobs=-1  
)
grid_search.fit(X_train_scaled, y_train)  # Should work now

# Best XGBoost Model After Tuning
best_xgb = grid_search.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test_scaled)

# Final Evaluation
print("\nTuned XGBoost Accuracy:", accuracy_score(y_test, y_pred_best_xgb))
print("\nTuned XGBoost Classification Report:\n", classification_report(y_test, y_pred_best_xgb))


Missing values in y_train: 0
Missing values in y_test: 0
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.




Tuned XGBoost Accuracy: 0.5026408177214403

Tuned XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.06      0.10     33881
           1       0.49      1.00      0.66     30493

    accuracy                           0.50     64374
   macro avg       0.74      0.53      0.38     64374
weighted avg       0.75      0.50      0.37     64374



In [9]:
# Hyperparameter Tuning for XGBoost
param_grid = {
    'n_estimators': [50, 100],  # Reduced for speed
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.2]
}

# Initialize and Train GridSearchCV
grid_search = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,  # Show progress
    n_jobs=-1   # Use all CPU cores
)
grid_search.fit(X_train_scaled, y_train)

# Best XGBoost Model After Tuning
best_xgb = grid_search.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test_scaled)

# Debugging: Check Predictions Exist
print("Checking y_pred_best_xgb:", y_pred_best_xgb[:10])  # First 10 predictions
print("Checking y_test:", y_test[:10])  # First 10 actual labels

# Final Evaluation
print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nTuned XGBoost Accuracy:", accuracy_score(y_test, y_pred_best_xgb))

print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\nTuned XGBoost Classification Report:\n", classification_report(y_test, y_pred_best_xgb))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.



Checking y_pred_best_xgb: [1 1 1 1 1 1 1 1 1 1]
Checking y_test: 0    1
1    0
2    0
3    0
4    0
5    0
6    1
7    0
8    0
9    0
Name: Churn, dtype: int64

Random Forest Accuracy: 0.49467176189144685

Tuned XGBoost Accuracy: 0.5026408177214403

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.04      0.08     33881
           1       0.48      1.00      0.65     30493

    accuracy                           0.49     64374
   macro avg       0.74      0.52      0.36     64374
weighted avg       0.75      0.49      0.35     64374


Tuned XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.06      0.10     33881
           1       0.49      1.00      0.66     30493

    accuracy                           0.50     64374
   macro avg       0.74      0.53      0.38     64374
weighted avg       0.75      0.50      0.37     64374

