In [None]:
# Install necessary libraries if not already installed
!pip install python-docx pandas matplotlib

import pandas as pd
import matplotlib.pyplot as plt
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os # To manage image files

# --- 1. Generate Sample Data ---
# Build a fully deterministic customer table: every column is a function of
# the row position, so each run produces exactly the same frame.
num_customers = 1000
data = {
    'CustomerID': range(1, num_customers + 1),
    # Positions whose last digit is 0, 1 or 2 start out churned (30% baseline).
    'ChurnStatus': [int(pos % 10 < 3) for pos in range(num_customers)],
    'Gender': ['Female' if pos % 2 else 'Male' for pos in range(num_customers)],
    'IncomeLevel': [('Low', 'Medium', 'High')[pos % 3] for pos in range(num_customers)],
}
df = pd.DataFrame(data)

# Flip some retained customers to churned inside two segments so the
# per-segment charts are not flat: every 5th row for 'Female', then every
# 4th row for 'Low' income. The "still retained" test is re-evaluated on each
# pass, so the second pass sees the flips made by the first.
for segment_mask, step in (
    (df['Gender'] == 'Female', 5),
    (df['IncomeLevel'] == 'Low', 4),
):
    to_flip = segment_mask & (df['ChurnStatus'] == 0) & (df.index % step == 0)
    df.loc[to_flip, 'ChurnStatus'] = 1


# --- 2. Create Visualizations and Save as Images ---

# Plot 1: Churn Distribution (Bar Plot)
# One bar per churn status, drawn on an explicitly created axes.
churn_counts = df['ChurnStatus'].value_counts().sort_index()
churn_fig, churn_ax = plt.subplots(figsize=(7, 5))
churn_counts.plot(kind='bar', color=['skyblue', 'salmon'], rot=0, ax=churn_ax)
churn_ax.set_title('Distribution of Churn Status')
churn_ax.set_xlabel('Churn Status (0: Retained, 1: Churned)')
churn_ax.set_ylabel('Number of Customers')
churn_ax.grid(axis='y', linestyle='--', alpha=0.7)

# The image is written to disk so it can be embedded in the Word report later.
churn_dist_path = 'churn_distribution.png'
churn_fig.savefig(churn_dist_path, bbox_inches='tight')
plt.close(churn_fig)

# Plot 2: Churn by Gender (Stacked Bar Plot)
# Rows are genders, columns are churn statuses (0, 1), cells are customer counts.
gender_churn = df.groupby(['Gender', 'ChurnStatus']).size().unstack(fill_value=0)

# DataFrame.plot() opens a figure of its own unless it is handed an axes.
# Passing `ax` makes the requested figsize take effect and avoids leaving an
# empty, never-closed figure behind.
gender_fig, gender_ax = plt.subplots(figsize=(8, 6))
gender_churn.plot(
    kind='bar',
    stacked=True,
    color=['lightgreen', 'lightcoral'],
    rot=0,
    ax=gender_ax,
)
gender_ax.set_title('Churn Status by Gender')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Number of Customers')
# Legend entries follow the column order of gender_churn (0 then 1).
gender_ax.legend(title='Churn Status', labels=['Retained', 'Churned'])
gender_ax.grid(axis='y', linestyle='--', alpha=0.7)

gender_churn_path = 'gender_churn.png'
gender_fig.savefig(gender_churn_path, bbox_inches='tight')
plt.close(gender_fig)

# Plot 3: Churn by Income Level (Stacked Bar Plot)
# Rows are income levels, columns are churn statuses (0, 1), cells are customer counts.
income_churn = df.groupby(['IncomeLevel', 'ChurnStatus']).size().unstack(fill_value=0)

# DataFrame.plot() opens a figure of its own unless it is handed an axes.
# Passing `ax` makes the requested figsize take effect and avoids leaving an
# empty, never-closed figure behind.
income_fig, income_ax = plt.subplots(figsize=(9, 6))
income_churn.plot(
    kind='bar',
    stacked=True,
    color=['lightblue', 'orange'],
    rot=0,
    ax=income_ax,
)
income_ax.set_title('Churn Status by Income Level')
income_ax.set_xlabel('Income Level')
income_ax.set_ylabel('Number of Customers')
# Legend entries follow the column order of income_churn (0 then 1).
income_ax.legend(title='Churn Status', labels=['Retained', 'Churned'])
income_ax.grid(axis='y', linestyle='--', alpha=0.7)

income_churn_path = 'income_churn.png'
income_fig.savefig(income_churn_path, bbox_inches='tight')
plt.close(income_fig)

# --- 3. Create Word Document and Insert Content ---

# Start from an empty document.
doc = Document()

# --- Title Page (Simple) ---
# Title, subtitle and date, separated by spacer paragraphs, all centred,
# followed by a page break so the report body starts on a fresh page.
title = doc.add_heading("Churn Status Report", 0)
doc.add_paragraph("\n")
subtitle = doc.add_paragraph("Analysis of Customer Churn Data with Visualizations")
doc.add_paragraph("\n\n\n")
date_para = doc.add_paragraph("Date: June 1, 2025")

for front_matter in (title, subtitle, date_para):
    front_matter.alignment = WD_ALIGN_PARAGRAPH.CENTER

doc.add_page_break()

# --- Report Content ---

# Section 1: describes the columns of the churn sheet and how many rows it has.
doc.add_heading("1. Data Summary", level=1)
doc.add_paragraph(
    "The `Churn_Status` sheet contains information about whether a customer has churned or not. "
    "Each record includes:"
)
# The list styles draw the bullet glyph themselves, so the text carries no
# leading "- " (it would show up as a stray dash after the bullet).
doc.add_paragraph("CustomerID: A unique identifier for each customer.", style='List Bullet')
doc.add_paragraph("ChurnStatus: A binary indicator of churn, where:", style='List Bullet')
doc.add_paragraph("1 represents a churned customer.", style='List Bullet 2')
doc.add_paragraph("0 represents a retained customer.", style='List Bullet 2')
# Report the real row count instead of a hard-coded figure.
doc.add_paragraph(
    f"This sheet contains {len(df):,} entries corresponding to the customer base."
)

# Section 2: churn counts as a table, the churn share, and the bar chart.
doc.add_heading("2. Churn Distribution", level=1)
doc.add_paragraph(
    "The churn status is distributed as follows:"
)

# Take the figures from the data so the text cannot drift away from the
# chart. reindex guarantees both statuses are present even if one is absent.
status_counts = df['ChurnStatus'].value_counts().reindex([0, 1], fill_value=0)
total_customers = int(status_counts.sum())
churn_share = status_counts.loc[1] / total_customers if total_customers else 0.0

# A real Word table instead of pipe-delimited text inside a paragraph.
dist_table = doc.add_table(rows=1, cols=3)
dist_table.style = 'Table Grid'
for header_cell, header_text in zip(
    dist_table.rows[0].cells, ("Churn Status", "Count", "Description")
):
    header_cell.text = header_text
for status, description in ((0, "Not Churned"), (1, "Churned Customers")):
    row_cells = dist_table.add_row().cells
    row_cells[0].text = str(status)
    row_cells[1].text = f"{int(status_counts.loc[status]):,}"
    row_cells[2].text = description

doc.add_paragraph(f"This shows that {churn_share:.0%} of customers have churned.")
doc.add_paragraph("A visual representation of the churn distribution is provided below:")
doc.add_picture(churn_dist_path, width=Inches(6))
# add_picture appends the image in a new paragraph at the end; centre it.
last_paragraph = doc.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER


# Section 3: which other sheets the churn data is joined with, and why.
doc.add_heading("3. Link to Other Data", level=1)
doc.add_paragraph(
    "To analyze why customers churn, this data is joined with other sheets using the CustomerID key:"
)
# 'List Bullet' draws the bullet glyph itself, so the text carries no leading
# "- " (it would show up as a stray dash after the bullet).
for sheet_name in (
    "Customer_Demographics",
    "Transaction_History",
    "Customer_Service",
    "Online_Activity",
):
    doc.add_paragraph(sheet_name, style='List Bullet')

doc.add_paragraph("\nExample Python code to merge churn data with demographics:")
doc.add_paragraph("`merged_df = pd.merge(demographics_df, churn_df, on='CustomerID')`")
doc.add_paragraph(
    "This merged data is essential for:"
)
for purpose in (
    "Exploratory Data Analysis (EDA)",
    "Feature engineering",
    "Predictive modeling",
):
    doc.add_paragraph(purpose, style='List Bullet')


# Section 4: one caption, one description and one centred chart per breakdown.
doc.add_heading("4. Visual Insights", level=1)
doc.add_paragraph(
    "To gain deeper insights into the factors influencing churn, we have generated several visualizations:"
)

chart_sections = (
    (
        "\n4.1 Churn Status by Gender",
        "This stacked bar chart illustrates the breakdown of churned and retained customers across different genders. "
        "It helps in identifying if gender plays a significant role in customer churn.",
        gender_churn_path,
    ),
    (
        "\n4.2 Churn Status by Income Level",
        "This stacked bar chart visualizes the churn status categorized by income levels. "
        "It provides insights into whether specific income groups are more prone to churn.",
        income_churn_path,
    ),
)
for caption, description, image_path in chart_sections:
    doc.add_paragraph(caption)
    doc.add_paragraph(description)
    doc.add_picture(image_path, width=Inches(6))
    # The picture sits in the paragraph that was just appended; centre it.
    last_paragraph = doc.paragraphs[-1]
    last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

# --- Conclusion (Simple) ---
# A spacer paragraph, the heading, then a single closing paragraph.
conclusion_text = (
    "This report provides an overview of the customer churn status and highlights the importance of "
    "integrating this data with other customer datasets for a comprehensive analysis. The included "
    "visualizations offer initial insights into churn patterns related to gender and income level. "
    "Further exploratory data analysis and predictive modeling will leverage these insights to "
    "understand the root causes of churn and develop strategies for customer retention."
)
doc.add_paragraph("\n")
doc.add_heading("Conclusion", level=1)
doc.add_paragraph(conclusion_text)

# Save the document
report_filename = "Churn_Status_Report_with_Visualizations.docx"
doc.save(report_filename)

print(f"The '{report_filename}' file has been created with a report-like structure including visualizations.")

# Clean up generated image files. They are already embedded in the saved
# document. The existence check keeps a re-run (or an already deleted file)
# from raising FileNotFoundError.
for image_path in (churn_dist_path, gender_churn_path, income_churn_path):
    if os.path.exists(image_path):
        os.remove(image_path)


The 'Churn_Status_Report_with_Visualizations.docx' file has been created with a report-like structure including visualizations.


<Figure size 800x600 with 0 Axes>

<Figure size 900x600 with 0 Axes>

In [None]:
# Install necessary libraries if not already installed
!pip install pandas scikit-learn matplotlib python-docx

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os # To manage image files
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
from scipy.stats import randint

# --- 1. Data Loading and Merging ---
print("1. Loading and merging data...")

# Read the five CSV exports. If any one of them is missing, ALL five frames
# are replaced by synthetic stand-ins so the rest of the script can still run.
try:
    df_demographics = pd.read_csv("Customer_Churn_Data_Large.xlsx - Customer_Demographics.csv")
    df_transactions = pd.read_csv("Customer_Churn_Data_Large.xlsx - Transaction_History.csv")
    df_service = pd.read_csv("Customer_Churn_Data_Large.xlsx - Customer_Service.csv")
    df_online = pd.read_csv("Customer_Churn_Data_Large.xlsx - Online_Activity.csv")
    df_churn = pd.read_csv("Customer_Churn_Data_Large.xlsx - Churn_Status.csv")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure all CSV files are in the same directory.")
    # Create dummy dataframes for demonstration if files are not found
    print("Creating dummy data for demonstration purposes.")
    num_customers = 1000
    # A seeded generator makes the demo data (and every metric derived from
    # it) reproducible, in line with the random_state=42 used further down.
    # Generator.integers also returns int64 regardless of platform.
    rng = np.random.default_rng(42)
    n_transactions = num_customers * 5
    n_service_calls = num_customers * 2
    n_online_sessions = num_customers * 3

    # One row per customer.
    df_demographics = pd.DataFrame({
        'CustomerID': range(1, num_customers + 1),
        'Age': rng.integers(18, 70, num_customers),
        'Gender': rng.choice(['Male', 'Female'], num_customers),
        'IncomeLevel': rng.choice(['Low', 'Medium', 'High'], num_customers),
        'Education': rng.choice(['High School', 'Bachelors', 'Masters', 'PhD'], num_customers),
    })
    # Event tables: customer ids are drawn at random, so a customer may have
    # several rows or none at all.
    df_transactions = pd.DataFrame({
        'CustomerID': rng.integers(1, num_customers + 1, n_transactions),
        'TransactionAmount': rng.random(n_transactions) * 1000,
        'TransactionCount': rng.integers(1, 10, n_transactions),
    })
    df_service = pd.DataFrame({
        'CustomerID': rng.integers(1, num_customers + 1, n_service_calls),
        'CallDuration': rng.random(n_service_calls) * 30,
        'ResolutionTime': rng.random(n_service_calls) * 10,
    })
    df_online = pd.DataFrame({
        'CustomerID': rng.integers(1, num_customers + 1, n_online_sessions),
        'PageViews': rng.integers(1, 50, n_online_sessions),
        'LoginFrequency': rng.integers(1, 15, n_online_sessions),
    })
    # Labels are drawn independently of every feature (about 25% churn), so
    # no model can be expected to beat chance on this fallback data.
    df_churn = pd.DataFrame({
        'CustomerID': range(1, num_customers + 1),
        'ChurnStatus': rng.choice([0, 1], num_customers, p=[0.75, 0.25]),
    })


# Collapse each event-level table to one row per CustomerID.
# Transactions and online activity are summed; service calls are averaged
# and additionally counted.
transactions_by_customer = df_transactions.groupby('CustomerID')
df_transactions_agg = transactions_by_customer.agg(
    TotalTransactionAmount=pd.NamedAgg(column='TransactionAmount', aggfunc='sum'),
    TotalTransactionCount=pd.NamedAgg(column='TransactionCount', aggfunc='sum'),
).reset_index()

service_by_customer = df_service.groupby('CustomerID')
df_service_agg = service_by_customer.agg(
    AvgCallDuration=pd.NamedAgg(column='CallDuration', aggfunc='mean'),
    AvgResolutionTime=pd.NamedAgg(column='ResolutionTime', aggfunc='mean'),
    # Number of service-call rows (non-null CallDuration values) per customer.
    ServiceCallCount=pd.NamedAgg(column='CallDuration', aggfunc='count'),
).reset_index()

online_by_customer = df_online.groupby('CustomerID')
df_online_agg = online_by_customer.agg(
    TotalPageViews=pd.NamedAgg(column='PageViews', aggfunc='sum'),
    TotalLoginFrequency=pd.NamedAgg(column='LoginFrequency', aggfunc='sum'),
).reset_index()

# Merge all dataframes
# Demographics is the base table; labels and the per-customer aggregates are
# left-joined onto it, so every demographics row is kept.
df_merged = df_demographics
for right_table in (df_churn, df_transactions_agg, df_service_agg, df_online_agg):
    df_merged = df_merged.merge(right_table, on='CustomerID', how='left')

# Customers with no rows in an activity table come out of the left joins as
# NaN. All of these columns are filled with 0 ("no activity"), including the
# two averages; ServiceCallCount == 0 marks the rows where the averages are
# placeholders rather than measurements.
activity_columns = [
    'TotalTransactionAmount',
    'TotalTransactionCount',
    'AvgCallDuration',
    'AvgResolutionTime',
    'ServiceCallCount',
    'TotalPageViews',
    'TotalLoginFrequency',
]
df_merged[activity_columns] = df_merged[activity_columns].fillna(0)

# A customer without a churn label cannot be used for training or evaluation.
# Filling the label with 0 would invent "retained" outcomes and bias the
# model, so such rows are dropped (and reported) instead.
missing_labels = df_merged['ChurnStatus'].isna()
if missing_labels.any():
    print(f"Dropping {int(missing_labels.sum())} customers without a ChurnStatus label.")
    df_merged = df_merged.loc[~missing_labels].reset_index(drop=True)
# NaNs introduced by the join would have turned the column into floats.
df_merged['ChurnStatus'] = df_merged['ChurnStatus'].astype('int64')

print("Data merging complete. Shape:", df_merged.shape)
print("Columns:", df_merged.columns.tolist())

# --- 2. Data Preprocessing ---
print("\n2. Preprocessing data...")

# Define features (X) and target (y)
X = df_merged.drop(columns=['CustomerID', 'ChurnStatus'])
y = df_merged['ChurnStatus']

# Identify categorical and numerical features.
# The split is by dtype family rather than by exact dtype names: the
# ColumnTransformer below discards every column that is not listed, so a
# column typed int32, category or string would otherwise silently vanish
# from the model. Every non-numeric column is treated as categorical.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Create preprocessing pipelines for numerical and categorical features.
# They stay wrapped in Pipeline objects because later code looks the encoder
# up by its step name ('onehot').
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    # Categories unseen during fit are ignored at transform time instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("Data preprocessing setup complete.")

# --- 3. Model Building and Training (Random Forest) ---
print("\n3. Building and training the model...")

# Preprocessing and the classifier are chained into a single estimator, so
# the search below tunes and fits them together.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hold out 20% of the rows for the final evaluation, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Data split: Train set size = {X_train.shape[0]}, Test set size = {X_test.shape[0]}")

# Search space for the forest. The 'classifier__' prefix routes each entry to
# the pipeline step of that name.
param_distributions = dict(
    classifier__n_estimators=randint(50, 200),
    classifier__max_depth=randint(5, 20),
    classifier__min_samples_split=randint(2, 10),
    classifier__min_samples_leaf=randint(1, 5),
    classifier__max_features=['sqrt', 'log2', None],
)

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 10 sampled candidates, scored by ROC-AUC, evaluated on all available cores.
search_settings = dict(
    param_distributions=param_distributions,
    n_iter=10,
    cv=cv,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search = RandomizedSearchCV(model_pipeline, **search_settings)

print("Starting Randomized Search for hyperparameter tuning...")
random_search.fit(X_train, y_train)

# The pipeline configured with the best-scoring candidate.
best_model = random_search.best_estimator_
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best ROC-AUC score on validation sets: {random_search.best_score_:.4f}")

print("Model training complete.")

# --- 4. Model Evaluation ---
print("\n4. Evaluating model performance...")

y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1] # Probability of churn

# Calculate metrics.
# zero_division=0 keeps the reported value (0.0) for the case where the model
# predicts no churners at all, without the UndefinedMetricWarning that case
# otherwise triggers.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_proba)
# Fixing the labels guarantees a 2x2 matrix (rows = actual, columns =
# predicted, order retained/churned) even if one class is absent from the
# test split; later code indexes it as [0..1, 0..1].
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("\nConfusion Matrix:\n", conf_matrix)

# Plot Confusion Matrix
# ConfusionMatrixDisplay.plot() opens a figure of its own unless it is given
# an axes. Passing `ax` makes the requested figsize take effect and avoids
# leaving an empty, never-closed figure behind.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Retained', 'Churned'])
disp.plot(cmap=plt.cm.Blues, ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
conf_matrix_path = 'confusion_matrix.png'
cm_fig.savefig(conf_matrix_path, bbox_inches='tight')
plt.close(cm_fig)

# Plot ROC Curve
# Model curve in orange against the dashed diagonal reference line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")

roc_curve_path = 'roc_curve.png'
roc_fig.savefig(roc_curve_path, bbox_inches='tight')
plt.close(roc_fig)

print("Model evaluation complete. Plots saved.")

# --- 5. Generate Report (Word Document) ---
print("\n5. Generating Word report...")

doc = Document()

# Title Page: title, subtitle and date separated by spacer paragraphs, all
# centred, then a page break before the body.
title = doc.add_heading("Customer Churn Prediction Model Report", 0)
doc.add_paragraph("\n")
subtitle = doc.add_paragraph("A Machine Learning Approach for Lloyds Banking Group")
doc.add_paragraph("\n\n\n")
date_para = doc.add_paragraph("Date: June 1, 2025")
for front_matter in (title, subtitle, date_para):
    front_matter.alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_page_break()

# Introduction
introduction_text = (
    "This report details the development of a robust machine learning model designed to predict "
    "customer churn for Lloyds Banking Group. The objective is to provide actionable insights that "
    "can inform business strategies, enabling proactive identification of at-risk customers and "
    "the implementation of effective retention initiatives. This document covers the selection of "
    "an appropriate algorithm, the methodology for training and validating the model, proposed "
    "evaluation metrics, and recommendations for its utilization and future improvements."
)
doc.add_heading("Introduction", level=1)
doc.add_paragraph(introduction_text)

# Data Loading and Merging
doc.add_heading("1. Data Loading and Merging", level=1)
doc.add_paragraph(
    "The analysis began by loading five distinct datasets related to customer information: "
    "`Customer_Demographics.csv`, `Transaction_History.csv`, `Customer_Service.csv`, "
    "`Online_Activity.csv`, and `Churn_Status.csv`. These datasets were then merged into a "
    "unified dataframe using `CustomerID` as the primary key. Transactional, service, and online "
    "activity data were aggregated to a per-customer basis to create a comprehensive feature set."
)
doc.add_paragraph("\nSample of merged data (first 5 rows):")

# Preview table: a header row plus the first five records, limited to at
# most the first five columns so the table fits on the page.
preview_columns = list(df_merged.columns[:5])
table = doc.add_table(rows=1, cols=len(preview_columns))
hdr_cells = table.rows[0].cells
for header_cell, column_name in zip(hdr_cells, preview_columns):
    header_cell.text = column_name

for _, record in df_merged.head(5).iterrows():
    row_cells = table.add_row().cells
    for body_cell, column_name in zip(row_cells, preview_columns):
        # Table cells only accept text.
        body_cell.text = str(record[column_name])

doc.add_paragraph("\nCode snippet for data loading and merging:")
# The snippet is inserted as an ordinary paragraph (no dedicated code style).
code_snippet_data_load = """
import pandas as pd
# ... (file loading and aggregation code as above) ...
df_merged = df_demographics.merge(df_churn, on='CustomerID', how='left')
df_merged = df_merged.merge(df_transactions_agg, on='CustomerID', how='left')
df_merged = df_merged.merge(df_service_agg, on='CustomerID', how='left')
df_merged = df_merged.merge(df_online_agg, on='CustomerID', how='left')
# ... (fillna for merged columns) ...
"""
doc.add_paragraph(code_snippet_data_load)


# Data Preprocessing
# Section text first, then the matching code excerpt as a plain paragraph.
preprocessing_summary = (
    "Before training the model, the merged dataset underwent essential preprocessing steps. "
    "This involved separating features (X) from the target variable (`ChurnStatus`, y). "
    "Categorical features (`Gender`, `IncomeLevel`, `Education`) were transformed using One-Hot Encoding, "
    "while numerical features were scaled using StandardScaler to ensure uniform contribution to the model. "
    "Missing values in aggregated columns were handled by filling with zeros, assuming no activity if no records were present."
)
code_snippet_preprocessing = """
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df_merged.drop(columns=['CustomerID', 'ChurnStatus'])
y = df_merged['ChurnStatus']

categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
"""

doc.add_heading("2. Data Preprocessing", level=1)
doc.add_paragraph(preprocessing_summary)
doc.add_paragraph("\nCode snippet for data preprocessing setup:")
# No dedicated code style is applied to the snippet paragraph.
doc.add_paragraph(code_snippet_preprocessing)


# Machine Learning Algorithm Selection
doc.add_heading("3. Machine Learning Algorithm Selection", level=1)
# Word does not interpret Markdown, so "**Random Forest**" would be printed
# with its asterisks. The paragraph is assembled from runs instead, with the
# algorithm name as a bold run.
algorithm_para = doc.add_paragraph(
    "The churn prediction problem is a binary classification task. Considering the need for a "
    "balance between high predictive accuracy and interpretability, the "
)
algorithm_para.add_run("Random Forest").bold = True
algorithm_para.add_run(
    " algorithm "
    "was selected. Random Forests are ensemble methods known for their strong performance, "
    "robustness to overfitting, and ability to provide feature importance, which is crucial for "
    "understanding the drivers of churn. While Gradient Boosting Machines offer higher accuracy, "
    "Random Forest provides a better balance with interpretability for business context."
)

# Model Building and Training
# Narrative, the tuned parameters and score from the search, then the code excerpt.
training_summary = (
    "The preprocessed data was split into training (80%) and test (20%) sets, with stratification "
    "to maintain the original churn class distribution. A Random Forest Classifier was integrated "
    "into a pipeline with the preprocessor. Hyperparameter tuning was performed using "
    "`RandomizedSearchCV` with 5-fold stratified cross-validation, optimizing for ROC-AUC score "
    "due to the potential class imbalance in churn data. This ensures the model generalizes well "
    "to unseen data and provides reliable performance estimates."
)
code_snippet_model_train = """
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# ... (preprocessor definition) ...

model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

param_distributions = {
    'classifier__n_estimators': randint(50, 200),
    'classifier__max_depth': randint(5, 20),
    'classifier__min_samples_split': randint(2, 10),
    'classifier__min_samples_leaf': randint(1, 5),
    'classifier__max_features': ['sqrt', 'log2', None]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=cv,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=1
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_
"""

doc.add_heading("4. Model Building and Training", level=1)
doc.add_paragraph(training_summary)
for search_result_line in (
    f"Best parameters found: {random_search.best_params_}",
    f"Best ROC-AUC score on validation sets: {random_search.best_score_:.4f}",
):
    doc.add_paragraph(search_result_line)

doc.add_paragraph("\nCode snippet for model training and tuning:")
# No dedicated code style is applied to the snippet paragraph.
doc.add_paragraph(code_snippet_model_train)


# Model Performance Evaluation
doc.add_heading("5. Model Performance Evaluation", level=1)
doc.add_paragraph(
    "The model's performance was rigorously evaluated on the held-out test set using a suite of metrics "
    "appropriate for imbalanced classification problems. The results are as follows:"
)
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision (Churn)", precision),
    ("Recall (Churn)", recall),
    ("F1-Score (Churn)", f1),
    ("ROC-AUC", roc_auc),
):
    doc.add_paragraph(f"{metric_label}: {metric_value:.4f}")

# Word does not interpret Markdown, so sub-headings are written as bold runs
# rather than as "**...**" text.
doc.add_paragraph("\n").add_run("Confusion Matrix:").bold = True
doc.add_paragraph(
    "The confusion matrix provides a detailed breakdown of correct and incorrect predictions:"
)
doc.add_picture(conf_matrix_path, width=Inches(5))
# The picture sits in the paragraph that was just appended; centre it.
last_paragraph = doc.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
# conf_matrix is indexed [actual, predicted] with 0 = retained, 1 = churned.
doc.add_paragraph(f"True Positives (TP): {conf_matrix[1, 1]} (Correctly predicted churn)")
doc.add_paragraph(f"True Negatives (TN): {conf_matrix[0, 0]} (Correctly predicted retained)")
doc.add_paragraph(f"False Positives (FP): {conf_matrix[0, 1]} (Incorrectly predicted churn - Type I error)")
doc.add_paragraph(f"False Negatives (FN): {conf_matrix[1, 0]} (Incorrectly predicted retained - Type II error)")


doc.add_paragraph("\n").add_run("Receiver Operating Characteristic (ROC) Curve:").bold = True
# Describe the AUC with wording that matches its value. A fixed "good" would
# be wrong for a model at chance level. The bands are a common rule of thumb,
# not a formal standard.
if roc_auc >= 0.9:
    auc_verdict = "outstanding discriminative power"
elif roc_auc >= 0.8:
    auc_verdict = "excellent discriminative power"
elif roc_auc >= 0.7:
    auc_verdict = "acceptable discriminative power"
elif roc_auc >= 0.6:
    auc_verdict = "poor discriminative power"
else:
    auc_verdict = "little to no discriminative power (0.5 corresponds to random guessing)"
doc.add_paragraph(
    "The ROC curve illustrates the trade-off between the True Positive Rate and False Positive Rate "
    "at various classification thresholds. The Area Under the Curve (AUC) indicates the model's "
    "overall ability to distinguish between churned and retained customers. An AUC of "
    f"{roc_auc:.4f} indicates {auc_verdict}."
)
doc.add_picture(roc_curve_path, width=Inches(5))
last_paragraph = doc.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

doc.add_paragraph("\nCode snippet for model evaluation and plotting:")
code_snippet_evaluation = """
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
# ... (other metrics calculation) ...

# Plot Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Retained', 'Churned'])
disp.plot(cmap=plt.cm.Blues)
plt.savefig('confusion_matrix.png', bbox_inches='tight')

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.savefig('roc_curve.png', bbox_inches='tight')
"""
# The snippet is inserted as an ordinary paragraph (no dedicated code style).
doc.add_paragraph(code_snippet_evaluation)


# Feature Importance
# The heading is written before anything that can fail, so section 6 always
# exists in the report and a failure note lands under the right heading.
doc.add_heading("6. Feature Importance", level=1)
try:
    # Only the extraction sits inside the try; document writes happen afterwards.
    fitted_classifier = best_model.named_steps['classifier']
    fitted_preprocessor = best_model.named_steps['preprocessor']
    feature_importances = fitted_classifier.feature_importances_

    # Feature names in the transformer's output order: the numerical columns
    # first, then the one-hot columns produced by the fitted encoder.
    ohe_features = (
        fitted_preprocessor.named_transformers_['cat']['onehot']
        .get_feature_names_out(categorical_features)
    )
    all_features = np.concatenate([numerical_features, ohe_features])

    importance_df = pd.DataFrame({
        'Feature': all_features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
except Exception as e:
    # Best effort by design: the report is still produced without this
    # section's content. The reason is also printed so it is not hidden
    # inside the document.
    print(f"Could not determine feature importance: {e}")
    doc.add_paragraph(f"\nCould not determine feature importance: {e}. This might happen if dummy data is used or specific model structure isn't as expected.")
else:
    doc.add_paragraph(
        "Understanding which features contribute most to the churn prediction is vital for "
        "developing targeted business strategies. The Random Forest model provides a measure of "
        "feature importance:"
    )
    # Add top N features to the report
    top_n = 10
    doc.add_paragraph(f"\nTop {top_n} Most Important Features:")
    # 'List Bullet' draws the bullet glyph itself, so no "- " prefix is added.
    for feature_name, importance in importance_df.head(top_n).itertuples(index=False, name=None):
        doc.add_paragraph(f"{feature_name}: {importance:.4f}", style='List Bullet')

# Ways to Improve and Utilize the Model
doc.add_heading("7. Ways to Improve and Utilize the Model", level=1)
doc.add_paragraph(
    "The churn prediction model is a valuable asset for Lloyds Banking Group, offering several "
    "avenues for utilization and further improvement."
)

# Each recommendation is a (label, explanation) pair. Word does not interpret
# Markdown and 'List Bullet' already draws the bullet, so "- **Label:** text"
# would be printed with its dash and asterisks. The label is written as a
# bold run instead, followed by the explanation as a normal run.
utilization_points = (
    ("Proactive Interventions",
     "Identify customers with high predicted churn probability. These customers can be targeted with personalized offers, loyalty programs, or direct customer service outreach to address their concerns before they churn."),
    ("Resource Optimization",
     "Allocate marketing and customer service resources more efficiently by focusing retention efforts on the most 'at-risk' and potentially valuable customers."),
    ("Product & Service Enhancement",
     "Analyze the most influential features contributing to churn (from feature importance) to pinpoint specific pain points in existing products, services, or customer experiences. This can guide product development and service improvements."),
    ("Customer Segmentation",
     "Use the churn probability scores as a new dimension for customer segmentation, allowing for more granular and effective communication strategies for different customer groups."),
)
improvement_points = (
    ("Advanced Feature Engineering",
     "Explore creating more complex features, such as customer lifetime value (CLTV), recency, frequency, monetary (RFM) analysis from transaction data, or sentiment analysis from customer service interactions."),
    ("Handling Class Imbalance",
     "Implement more sophisticated techniques like SMOTE (Synthetic Minority Over-sampling Technique) or ADASYN to create synthetic samples of the minority class (churners), providing a more balanced training set."),
    ("Alternative Algorithms & Ensembling",
     "Experiment with other powerful algorithms like Gradient Boosting Machines (XGBoost, LightGBM) for potentially higher accuracy. Also, consider ensemble methods (e.g., stacking or blending) that combine predictions from multiple models."),
    ("Threshold Optimization",
     "The default classification threshold of 0.5 might not be optimal for business. Adjust the probability threshold based on the relative costs of False Positives vs. False Negatives to maximize business value (e.g., prioritize Recall if missing churners is more costly)."),
    ("Model Monitoring and Retraining",
     "Implement a robust MLOps pipeline for continuous monitoring of model performance (e.g., drift detection) and automated retraining with fresh data to ensure its long-term effectiveness and adapt to changing customer behaviors."),
    ("Explainable AI (XAI)",
     "While Random Forests provide global feature importance, integrating local XAI techniques (e.g., SHAP values, LIME) can offer explanations for individual customer churn predictions, increasing trust and enabling more precise interventions."),
)

doc.add_heading("7.1 Utilizing the Model for Business Decision-Making", level=2)
doc.add_paragraph(
    "The model can be a powerful tool for proactive customer retention and strategic planning:"
)
for point_label, point_text in utilization_points:
    bullet = doc.add_paragraph(style='List Bullet')
    bullet.add_run(f"{point_label}:").bold = True
    bullet.add_run(f" {point_text}")

doc.add_heading("7.2 Potential Improvements and Adjustments", level=2)
doc.add_paragraph(
    "To further enhance the model's accuracy, robustness, and applicability, consider the following:"
)
for point_label, point_text in improvement_points:
    bullet = doc.add_paragraph(style='List Bullet')
    bullet.add_run(f"{point_label}:").bold = True
    bullet.add_run(f" {point_text}")

# Conclusion
# Closing section: a heading followed by a single summary paragraph.
closing_text = (
    "This report has detailed the end-to-end process of developing a machine learning model for "
    "customer churn prediction for Lloyds Banking Group. By selecting a Random Forest classifier, "
    "implementing rigorous preprocessing and training with hyperparameter tuning, and evaluating "
    "performance with appropriate metrics, a robust predictive tool has been established. The "
    "insights and recommendations provided aim to empower the business to proactively identify "
    "and retain valuable customers, ultimately contributing to sustained growth and profitability."
)
doc.add_heading("Conclusion", level=1)
doc.add_paragraph(closing_text)

# Write the finished report to disk.
report_filename = "Customer_Churn_Prediction_Report.docx"
doc.save(report_filename)

print(f"\nReport '{report_filename}' created successfully.")

# Remove the temporary chart images, skipping any that no longer exist.
for temp_image_path in (conf_matrix_path, roc_curve_path):
    if os.path.exists(temp_image_path):
        os.remove(temp_image_path)


1. Loading and merging data...
Error loading file: [Errno 2] No such file or directory: 'Customer_Churn_Data_Large.xlsx - Customer_Demographics.csv'. Please ensure all CSV files are in the same directory.
Creating dummy data for demonstration purposes.
Data merging complete. Shape: (1000, 13)
Columns: ['CustomerID', 'Age', 'Gender', 'IncomeLevel', 'Education', 'ChurnStatus', 'TotalTransactionAmount', 'TotalTransactionCount', 'AvgCallDuration', 'AvgResolutionTime', 'ServiceCallCount', 'TotalPageViews', 'TotalLoginFrequency']

2. Preprocessing data...
Data preprocessing setup complete.

3. Building and training the model...
Data split: Train set size = 800, Test set size = 200
Starting Randomized Search for hyperparameter tuning...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found: {'classifier__max_depth': 6, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 107}
Best RO

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.7650
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
ROC-AUC: 0.4980

Confusion Matrix:
 [[153   0]
 [ 47   0]]
Model evaluation complete. Plots saved.

5. Generating Word report...

Report 'Customer_Churn_Prediction_Report.docx' created successfully.


<Figure size 800x600 with 0 Axes>