<a href="https://colab.research.google.com/github/ArezooNajafi/Customer-Churn-in-Online-Retail/blob/main/Churn_rate_Predictive_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas

In [None]:
import kagglehub

# Kaggle handle of the dataset; the latest version is downloaded.
dataset_handle = "hassaneskikri/online-retail-customer-churn-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import os

# Enumerate the entries of the downloaded dataset folder so the CSV's exact
# file name can be checked before it is loaded.
files = os.listdir(path)
print(files)


In [None]:
import pandas as pd

# Location of the CSV inside the downloaded dataset folder (`path`).
csv_file_path = os.path.join(path, "online_retail_customer_churn.csv")

# Load the raw data into a DataFrame.
df = pd.read_csv(csv_file_path)

# Preview the first five rows as the cell output.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Boolean mask marking rows that repeat an earlier row in full.
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Drop repeated rows unconditionally; the data is unchanged when none were found.
df = df.drop_duplicates()


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all') # include='all' extends the summary beyond numeric columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Every int64/float64 column gets its own histogram panel.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Grid layout: three panels per row, as many rows as the columns require.
n_cols = 3
n_rows = math.ceil(len(numeric_cols) / n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Draw one histogram with a KDE overlay per numeric column.
for idx, col in enumerate(numeric_cols):
    ax = fig.add_subplot(n_rows, n_cols, idx + 1)
    sns.histplot(df[col], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'{col}', fontsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')

fig.tight_layout()
plt.show()


In [None]:
# Same numeric columns as the histograms, now shown as box plots.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Three panels per row.
n_cols = 3
n_rows = math.ceil(len(numeric_cols) / n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# Subplot positions are 1-based, hence start=1.
for position, col in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(y=df[col], color='lightcoral', ax=ax)
    ax.set_title(f'{col}', fontsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')

fig.tight_layout()
plt.show()

In [None]:
def detect_outliers_iqr(data, column):
    """Return the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.
    """
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low, high = q1 - 1.5 * spread, q3 + 1.5 * spread

    # Strict comparisons: values exactly on a fence are not outliers.
    is_outlier = values.lt(low) | values.gt(high)
    return data[is_outlier]

# Screen every int64/float64 column of the frame.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Map each numeric column to the number of rows flagged by the IQR rule.
outlier_summary = {
    col: len(detect_outliers_iqr(df, col)) for col in numeric_cols
}

# Report one line per column.
for col, n_outliers in outlier_summary.items():
    print(f"{col}: {n_outliers} outliers")


In [None]:
# Modelling frame: a copy of df without the Customer_ID column.
data = df.drop(columns=['Customer_ID'])


In [None]:
# Step 2: One-hot encode Gender and Promotion_Response, keeping every level
# (drop_first=False). `dtype=int` makes only the new dummy columns 0/1
# integers; casting the whole frame with `.astype(int)` would also truncate
# the fractional part of every float feature.
data = pd.get_dummies(
    data,
    columns=['Gender', 'Promotion_Response'],
    drop_first=False,
    dtype=int,
)

# Step 3: Convert boolean columns (True/False -> 1/0) without touching the
# numeric features.
bool_cols = data.select_dtypes(include='bool').columns
data = data.astype({bool_col: int for bool_col in bool_cols})

# Email_Opt_In is converted explicitly as well, in case it was not stored as bool.
data['Email_Opt_In'] = data['Email_Opt_In'].astype(int)

In [None]:
# Preview the encoded modelling frame.
data.head()

In [None]:
# Separate the churn label (y) from the predictor columns (x).
target_col = 'Target_Churn'
y = data[target_col]
x = data.drop(columns=[target_col])

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Rescale every feature to [0, 1]. `fit_transform` returns a bare NumPy array,
# so the result is wrapped back into a DataFrame: later cells read `x.columns`
# and `x_train.columns`, which an array does not have.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the scaling.
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
def training(model, name, axes):
    """Fit ``model``, score it on the held-out split and draw its confusion matrix.

    Reads the notebook-level ``x_train``/``y_train`` for fitting and
    ``x_test``/``y_test`` for evaluation.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    name : str
        Label shown in the subplot title next to the accuracy.
    axes : matplotlib Axes
        Axes on which the confusion-matrix heatmap is drawn.

    Returns
    -------
    str
        Text produced by ``classification_report`` for the test predictions.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    score = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    # fmt="d": the cells are integer counts; the default ".2g" annotation
    # format would render counts of 100 or more as e.g. "1e+02".
    sns.heatmap(cm, annot=True, fmt="d", ax=axes)
    # confusion_matrix puts true labels on rows and predictions on columns.
    axes.set_xlabel("Predicted")
    axes.set_ylabel("Actual")
    axes.set_title("{}: {}%".format(name, round(score*100, 2)))
    return report

In [None]:
# Baseline classifiers compared side by side. The randomised learners get a
# fixed seed so that a re-run reproduces the same scores.
log_reg = LogisticRegression()
rfc = RandomForestClassifier(random_state=42)
svc = SVC(C=0.9)
xgb = XGBClassifier(learning_rate=0.9, random_state=42)

# Model list (kept in the same order as `names`)
models = [log_reg, rfc, svc, xgb]
names = ["Logistic Regression", "Random Forest", "Support Vector Machine", "XGBoost"]

In [None]:
# Fit and score each model, drawing its confusion matrix on its own panel of
# a 2x2 grid and collecting the classification reports.
reports = []
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))

# axes.flat walks the grid row by row, matching the order of `models`.
for estimator, label, panel in zip(models, names, axes.flat):
    reports.append(training(estimator, label, panel))

In [None]:
# Depth-limited random forest with a fixed seed.
rf = RandomForestClassifier(max_depth=5, random_state=42)

# fit() returns the estimator itself, so training and test-set prediction chain.
y_pred_rf = rf.fit(x_train, y_train).predict(x_test)

# Held-out performance.
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


In [None]:
# Candidate values explored for each random-forest hyperparameter.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive 5-fold search over the grid, scored by accuracy, on all cores.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Winning combination and its mean cross-validated accuracy.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


In [None]:
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Untuned, seeded random forest used as the starting point.
rf = RandomForestClassifier(random_state=42)

# Step 1: 5-fold cross-validation on the training data (no tuning yet).
# n_jobs=-1 runs the folds in parallel; the scores are unaffected.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(rf, x_train, y_train, cv=5, scoring=scoring, n_jobs=-1)

# Mean and spread of each metric across the folds.
print("=== Cross-Validation Scores (Train Set) ===")
for metric in scoring:
    mean = np.mean(cv_results[f'test_{metric}'])
    std = np.std(cv_results[f'test_{metric}'])
    print(f"{metric.capitalize()}: {mean:.2f} (+/- {std:.2f})")

# Step 2: Hyperparameter tuning with an exhaustive grid search, parallelised
# like the other grid searches in this notebook.
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'n_estimators': [50, 100, 200],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print("\nBest Hyperparameters from Grid Search:", grid_search.best_params_)

# Step 3: Take the estimator GridSearchCV has already refitted on the whole
# training set with the best parameters (refit=True is the default).
best_rf = grid_search.best_estimator_

# Step 4: Predict on test set
y_pred = best_rf.predict(x_test)

# Step 5: Evaluate on test set
print("\n=== Final Evaluation on Test Set ===")
print("Random Forest Accuracy: %.2f" % accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# cross_validate and GridSearchCV fit clones of the estimator, so `rf` itself
# has to be fitted here before its importances can be read.
rf.fit(x_train, y_train)

# Get feature importance
feature_importances = rf.feature_importances_

# Resolve feature names without assuming `x_train` is a DataFrame: after
# scaling it may be a plain NumPy array, which has no `.columns`.
feature_names = getattr(rf, "feature_names_in_", None)
if feature_names is None:
    candidate_names = data.drop(columns=['Target_Churn']).columns
    if len(candidate_names) == len(feature_importances):
        feature_names = candidate_names
    else:
        # Fall back to positional labels rather than attach wrong names.
        feature_names = [f"feature_{i}" for i in range(len(feature_importances))]

# Create a DataFrame for feature importances
features_df = pd.DataFrame({
    'Feature': list(feature_names),
    'Importance': feature_importances
})

# Sort features by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(features_df['Feature'], features_df['Importance'])
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Random Forest Feature Importance')
plt.show()

# Print the features with their importance
print(features_df)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 10 features with the highest ANOVA F-score (or all of them when
# the matrix has fewer than 10 columns, which would otherwise raise).
selector = SelectKBest(score_func=f_classif, k=min(10, x.shape[1]))
x_new = selector.fit_transform(x, y)

# `x` may be a NumPy array after scaling (no `.columns`); in that case take
# the names from the encoded frame it was built from.
feature_names = x.columns if hasattr(x, "columns") else data.drop(columns=['Target_Churn']).columns
selected_features = feature_names[selector.get_support()]
print("Selected features:", selected_features.tolist())


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 1: Select features using Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to determine feature importance
# NOTE(review): importances are learned from all rows, including those that
# end up in the test split below.
model.fit(x, y)

# `model` is already fitted, so the selector is told so with prefit=True;
# without it the selector itself would have to be fitted before transform().
selector = SelectFromModel(model, threshold="mean", max_features=10, prefit=True)
x_selected = selector.transform(x)

# Get the selected feature names. `x` may be a NumPy array after scaling
# (no `.columns`); in that case take the names from the encoded frame.
feature_names = x.columns if hasattr(x, "columns") else data.drop(columns=['Target_Churn']).columns
selected_features = feature_names[selector.get_support()]
print("Selected features:", selected_features.tolist())

# Step 2: Split the data into training and testing sets (70% / 30%)
x_train, x_test, y_train, y_test = train_test_split(x_selected, y, test_size=0.3, random_state=42)

# Step 3: Train the Random Forest model using the selected features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Step 4: Make predictions on the test set
y_pred = rf_model.predict(x_test)

# Step 5: Evaluate the model using various metrics

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision, Recall, and F1-Score, macro-averaged over the classes
precision = precision_score(y_test, y_pred, average='macro')  # or 'micro' or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')  # or 'micro' or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')  # or 'micro' or 'weighted'

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Detailed classification report
class_report = classification_report(y_test, y_pred)

# Step 6: Print evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1-Score (macro): {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("\nClassification Report:\n", class_report)


In [None]:
# Predictions of rf_model on both splits.
y_train_pred = rf_model.predict(x_train)
y_test_pred = rf_model.predict(x_test)

# Accuracy on data the model was fitted on versus held-out data.
training_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {training_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:
# More constrained forest settings than the defaults used previously.
rf_params = {
    'n_estimators': 50,         # fewer trees
    'max_depth': 5,             # shallower trees
    'min_samples_split': 10,    # more general splits
    'min_samples_leaf': 5,      # larger leaf size
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(x_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of rf_model's configuration on the full x / y.
# No `scoring` is passed, so the estimator's default score is used.
cv_scores = cross_val_score(rf_model, x, y, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values explored for each hyperparameter of rf_model.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold exhaustive search on all cores, logging progress (verbose=2).
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix `x` ahead of PCA.
# Note: this rebinds `scaler`, replacing the scaler stored under that name earlier.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)


In [None]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA for the fewest components whose
# cumulative explained variance reaches that fraction.
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_pca = pca.fit_transform(X_scaled)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Default-parameter, unseeded forest fitted on every row of the PCA-reduced
# features; no data is held out in this cell.
model = RandomForestClassifier()
model.fit(X_pca, y)  # y is the churn target separated from the features earlier


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the features, then keep the principal components that explain
# 95% of the variance.
# NOTE(review): scaler and PCA are fitted on all rows before the split, so the
# test rows influence the transformation.
X_scaled = StandardScaler().fit_transform(x)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets (80% training, 20% testing).
# The labels get their own names so the notebook-level y_train / y_test, which
# other cells pair with x_train / x_test, are not overwritten by this split.
X_train, X_test, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Initialize the RandomForest model
model = RandomForestClassifier(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train_pca)

# Make predictions on both training and test sets
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Calculate performance metrics for training data
train_accuracy = accuracy_score(y_train_pca, train_preds)
train_precision = precision_score(y_train_pca, train_preds)
train_recall = recall_score(y_train_pca, train_preds)
train_f1 = f1_score(y_train_pca, train_preds)
train_roc_auc = roc_auc_score(y_train_pca, model.predict_proba(X_train)[:, 1])

# Calculate performance metrics for test data
test_accuracy = accuracy_score(y_test_pca, test_preds)
test_precision = precision_score(y_test_pca, test_preds)
test_recall = recall_score(y_test_pca, test_preds)
test_f1 = f1_score(y_test_pca, test_preds)
test_roc_auc = roc_auc_score(y_test_pca, model.predict_proba(X_test)[:, 1])

# Print the performance metrics for training and test sets
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")
print(f"ROC AUC: {train_roc_auc:.4f}")

print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"ROC AUC: {test_roc_auc:.4f}")

# 5-fold cross-validation on the training data. One cross_validate call scores
# all metrics on the same fitted folds, instead of refitting every fold once
# per metric; splitter and model are both seeded, so the numbers are the same.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_fold_scores = cross_validate(model, X_train, y_train_pca, cv=kf, scoring=cv_metric_names)

cross_val_accuracies = cv_fold_scores['test_accuracy']
cross_val_precisions = cv_fold_scores['test_precision']
cross_val_recalls = cv_fold_scores['test_recall']
cross_val_f1s = cv_fold_scores['test_f1']
cross_val_roc_aucs = cv_fold_scores['test_roc_auc']

# Print the cross-validation results
print("\nCross-Validation Metrics (5-fold):")
print(f"Accuracy: {np.mean(cross_val_accuracies):.4f}")
print(f"Precision: {np.mean(cross_val_precisions):.4f}")
print(f"Recall: {np.mean(cross_val_recalls):.4f}")
print(f"F1 Score: {np.mean(cross_val_f1s):.4f}")
print(f"ROC AUC: {np.mean(cross_val_roc_aucs):.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn import tree

# Split dataset into training and testing sets (80% / 20%).
# The labels get their own names so the notebook-level y_train / y_test, which
# other cells pair with x_train / x_test, are not overwritten by this split.
X_train, X_test, y_train_dt, y_test_dt = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize Decision Tree model with depth and leaf-size limits
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42)

# Train the model
dt_model.fit(X_train, y_train_dt)

# Predictions on test set
y_pred = dt_model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test_dt, y_pred))
print("Classification Report:\n", classification_report(y_test_dt, y_pred))


In [None]:
# Constrained random-forest configuration. The estimator is only constructed
# in this cell; it is not fitted here.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,              # control tree depth
    min_samples_leaf=10,      # ensure each leaf has enough data
    max_features='sqrt',      # limit features per split
    random_state=42
)


## Gradient Boosting


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_validate
import numpy as np


In [None]:
# Earlier cells rebind x_train / x_test and y_train / y_test to other splits
# and feature subsets. Re-create the 80/20 split of the full feature matrix
# here so the features and labels used below are guaranteed to line up.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)

# Train the model
gb.fit(x_train, y_train)

# Predict on the test set
y_pred_gb = gb.predict(x_test)

# Evaluate the model
print("Gradient Boosting Accuracy: %.2f" % accuracy_score(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 5-fold exhaustive search scored by accuracy. n_jobs=-1 evaluates the
# candidates in parallel, as the other grid searches in this notebook do;
# the estimator is seeded, so the selected parameters are unaffected.
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best parameters and score
print("Best Hyperparameters from Grid Search:", grid_search.best_params_)
print("Best Accuracy from Grid Search: %.2f" % grid_search.best_score_)


In [None]:
# Take the estimator that the grid search refitted with the best parameters
# (no additional training happens in this cell).
best_gb = grid_search.best_estimator_

# Predict on the test set
y_pred_best_gb = best_gb.predict(x_test)

# Evaluate the tuned model on the held-out data
print("\n=== Final Evaluation on Test Set ===")
print("Gradient Boosting Accuracy: %.2f" % accuracy_score(y_test, y_pred_best_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_best_gb))


In [None]:
# Feature importance, ordered from most to least important
importances = best_gb.feature_importances_
indices = np.argsort(importances)[::-1]

# Resolve feature names without assuming `x_train` is a DataFrame: it may be
# a plain NumPy array, which has no `.columns`.
feature_names = getattr(best_gb, "feature_names_in_", None)
if feature_names is None:
    candidate_names = data.drop(columns=['Target_Churn']).columns
    if len(candidate_names) == len(importances):
        feature_names = candidate_names
    else:
        # Fall back to positional labels rather than attach wrong names.
        feature_names = [f"x{i}" for i in range(len(importances))]

print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. feature {feature_names[feature_idx]} ({importances[feature_idx]})")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Seeded logistic regression with a raised iteration cap.
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Shuffled, stratified 5-fold splitter: each fold keeps the class proportions.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics collected for every fold.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Step 1: Cross-validate on the training split.
cv_results = cross_validate(logreg, x_train, y_train, cv=cv, scoring=scoring)

print("=== Cross-Validation Scores (Train Set) ===")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"{metric.capitalize()}: {mean:.2f} (+/- {std:.2f})")

# Steps 2-3: Fit on the full training split and predict the test split
# (fit() returns the estimator, so the two calls chain).
y_pred = logreg.fit(x_train, y_train).predict(x_test)

# Step 4: Report held-out performance.
print("\n=== Final Evaluation on Test Set ===")
print("Logistic Regression Accuracy: %.2f" % accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
