# 1. Dataset Construction

In [None]:
import pandas as pd
# Read the raw Telco churn export into a DataFrame
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Columns that are excluded from the working dataset
columns_to_remove = [
    'MonthlyCharges',
    'OnlineSecurity',
    'StreamingTV',
    'InternetService',
    'Partner',
]
df = df.drop(columns_to_remove, axis=1)

# Report the (rows, columns) that remain after the drop
print(f"Dataset shape: {df.shape}")

# Persist the reduced dataset, without the index column, for the later sections
df.to_csv('modified_Telco_Customer_Churn.csv', index=False)

# 2. Model Development

## 2.1 Problem Statement
### Context 
Big Retail, an online retail company in Adelaide, Australia, is encountering declining visitor numbers and low conversion rates. To address these challenges, the company is pivoting towards a data-driven strategy, starting with an analysis of customer churn.
### The Problem 
Customer churn, or customer attrition, is a critical issue for the retail industry. High churn rates can significantly impact the revenue of a company, especially Big Retail.
The company is experiencing a decline in visitors to its website and conversion rates are low, indicating issues with customer retention.
### Solution 
- Develop a predictive model to identify customers at high risk of churning 
- Uncover key factors contributing to customer churn
- Provide actionable insights to improve customer retention strategies 
### Approach 
- Analyze historical customer data to identify patterns and predictors of churn
- Develop and evaluate machine learning models (a decision tree and a random forest) to predict customer churn
- Interpret the model results to provide actionable business insights



## 2.2 Exploratory Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the reduced dataset written by the dataset-construction step
df = pd.read_csv('modified_Telco_Customer_Churn.csv')

# Summary statistics produced by DataFrame.describe()
print(df.describe())

# Number of missing entries in each column
print(df.isna().sum())

In [None]:
# TotalCharges summary, Churn class balance (as proportions) and column dtypes,
# printed one after another.
print(
    df['TotalCharges'].describe(),
    df['Churn'].value_counts(normalize=True),
    df.dtypes,
    sep='\n',
)

In [None]:
# Partition the column labels by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numeric.
categorical_columns = df.select_dtypes(include='object').columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("Numeric columns:", numeric_columns)
print("Categorical columns:", categorical_columns)

In [None]:
# Pairwise correlations between the numeric columns as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    df[numeric_columns].corr(),
    annot=True,
    cmap='coolwarm',
).set_title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Distribution of categorical variables: one count plot per object column.
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so a
# single categorical column needs no special-casing; the array is flattened
# to pair one Axes with each column.
# NOTE(review): an object column with very many distinct values (for example
# an identifier, or a numeric column still stored as text) produces one bar
# per value and an unreadable plot - consider excluding such columns.
fig, axs = plt.subplots(
    len(categorical_columns),
    1,
    figsize=(15, 5 * len(categorical_columns)),
    squeeze=False,
)
axs = axs.ravel()
for ax, col in zip(axs, categorical_columns):
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Rotate the existing tick labels in place instead of re-assigning them
    # with set_xticklabels(get_xticklabels()).
    ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for every numeric column, one subplot per column.
# squeeze=False keeps the Axes in an array even when there is only one column.
fig, axs = plt.subplots(
    nrows=len(numeric_columns),
    ncols=1,
    figsize=(15, 5 * len(numeric_columns)),
    squeeze=False,
)
axs = axs.ravel()
for i, col in enumerate(numeric_columns):
    sns.histplot(df[col], kde=True, ax=axs[i])
    axs[i].set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Share of customers in each Churn class, drawn as a bar chart.
# value_counts(normalize=True) yields proportions, which is what the
# y-axis shows.
churn_rate = df['Churn'].value_counts(normalize=True)
plt.figure(figsize=(8, 6))
churn_rate.plot.bar()
plt.title('Churn Rate')
plt.ylabel('Percentage')
plt.show()

In [None]:
# Compare the tenure distribution across the Churn classes with a box plot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Churn', y='tenure').set_title('Customer Churn by Tenure')
plt.show()

## 2.3 Data Cleaning & Feature Selection


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Handle missing values in TotalCharges: entries that cannot be parsed as
# numbers are coerced to NaN so they can be imputed.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print("Missing values before imputation:", df['TotalCharges'].isna().sum())

# Replace the NaNs with the column mean. fit_transform returns a 2-D
# (n_rows, 1) result, so flatten it before assigning it to a single column.
imputer = SimpleImputer(strategy='mean')
df['TotalCharges'] = np.ravel(imputer.fit_transform(df[['TotalCharges']]))
print("Missing values after imputation:", df['TotalCharges'].isna().sum())

# Save original (unscaled) TotalCharges for the dollar-value analysis later on
df['TotalCharges_Original'] = df['TotalCharges'].copy()

# Identify numerical and categorical columns; the unscaled copy is excluded
# from the columns that get standardised.
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(['TotalCharges_Original'])

# Scale numerical features
# NOTE(review): the imputer above and this scaler are fitted on the full
# dataset before the train/test split, so test-set statistics influence the
# training features. Consider fitting both on the training split only.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Create binary target variable: 1 where Churn is 'Yes', 0 otherwise
df['Churn_Binary'] = (df['Churn'] == 'Yes').astype('int64')

# Encode categorical variables (each object column gets its own integer codes)
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Prepare features and target. The scaled TotalCharges is left out; the
# unscaled TotalCharges_Original column stays in the feature set.
X = df.drop(['Churn', 'Churn_Binary', 'TotalCharges'], axis=1)
# A per-row identifier carries no generalisable signal once label-encoded and
# would only let the trees memorise rows, so drop it if the dataset has one.
# presumably the identifier column is named 'customerID' - verify against the CSV
X = X.drop(columns=['customerID'], errors='ignore')
y = df['Churn_Binary']

# Split the data, keeping the churn proportion equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

## 2.4 Model Building 

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix, 
                           classification_report, precision_recall_curve, 
                           average_precision_score, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns

def train_and_evaluate_model(model, params, X_train, X_test, y_train, y_test):
    """
    Tune a classifier with a grid search, then score the winner on the test split.

    The search runs GridSearchCV with 5-fold cross-validation, ROC AUC as the
    selection metric and n_jobs=-1. The best estimator found is then used to
    predict the test data, and every metric is printed as well as returned.

    Parameters:
    - model: Estimator to tune (e.g., DecisionTreeClassifier, RandomForestClassifier)
    - params: Hyperparameter grid handed to GridSearchCV
    - X_train, X_test: Training and testing features; X_train must expose
      `.columns` when the tuned model reports feature importances
    - y_train, y_test: Training and testing labels

    Returns:
    - best_model: The best estimator selected by the grid search
    - feature_importance: DataFrame with 'Feature' and 'Importance' columns,
      sorted by descending importance, or None when the model has no
      `feature_importances_` attribute
    - results: Dictionary with the keys 'Accuracy', 'AUC-ROC',
      'Average Precision', 'Confusion Matrix' and 'Classification Report'
    """
    # Cross-validated search over the supplied grid
    search = GridSearchCV(model, params, cv=5, scoring='roc_auc', n_jobs=-1)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest Parameters:", search.best_params_)
    print("Best Cross-Validation Score:", search.best_score_)

    # Build the importance table only for estimators that expose importances
    feature_importance = None
    if hasattr(best_model, 'feature_importances_'):
        importance_table = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': best_model.feature_importances_
        })
        feature_importance = importance_table.sort_values('Importance', ascending=False)

    # Hard labels for the threshold-based metrics; column 1 of predict_proba
    # for the ranking-based metrics
    predicted_labels = best_model.predict(X_test)
    class1_scores = best_model.predict_proba(X_test)[:, 1]

    # Assemble the metrics in a fixed order
    results = {}
    results['Accuracy'] = accuracy_score(y_test, predicted_labels)
    results['AUC-ROC'] = roc_auc_score(y_test, class1_scores)
    results['Average Precision'] = average_precision_score(y_test, class1_scores)
    results['Confusion Matrix'] = confusion_matrix(y_test, predicted_labels)
    results['Classification Report'] = classification_report(y_test, predicted_labels)

    # Report: scalar metrics to three decimals, then the two multi-line ones
    print("\nModel Evaluation Results:")
    for metric_name in ('Accuracy', 'AUC-ROC', 'Average Precision'):
        print(f"{metric_name}: {results[metric_name]:.3f}")
    print("\nConfusion Matrix:\n", results['Confusion Matrix'])
    print("\nClassification Report:\n", results['Classification Report'])

    return best_model, feature_importance, results

In [None]:

# Decision Tree: base estimator and the hyperparameter grid to search
dt_model = DecisionTreeClassifier(random_state=42)
dt_params = dict(
    max_depth=[3, 5, 7, 9],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Random Forest: base estimator and the hyperparameter grid to search
rf_model = RandomForestClassifier(random_state=42)
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# One (display name, estimator, grid) entry per candidate model
models = [
    ("Decision Tree", dt_model, dt_params),
    ("Random Forest", rf_model, rf_params),
]

# Tune and evaluate every candidate, collecting
# (name, best model, feature importance, metrics) for the later sections
training_results = []
for name, model, params in models:
    print(f"\nTraining and Evaluating {name} Model:")
    best_model, feature_importance, results = train_and_evaluate_model(
        model, params, X_train, X_test, y_train, y_test
    )
    training_results.append((name, best_model, feature_importance, results))


## 2.5 Feature Importance and Model Interpretation

In [None]:

for name, best_model, feature_importance_, results in training_results:
    # Models without an importance table have nothing to print or plot
    if feature_importance_ is None:
        continue

    print(f"\n{name} - Top 10 Feature Importance:")
    print(feature_importance_.head(10))

    # Horizontal bars, one per feature, in the table's (descending) order
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance_, x='Importance', y='Feature')
    plt.title(f'Feature Importance ({name})')
    plt.show()



## 2.6 Business Insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _summarize_churn(df, group_keys):
    """Return churn rate, customer count and churned count per group of `group_keys`."""
    summary = df.groupby(group_keys).agg({
        'Churn_Binary': ['mean', 'count', 'sum']
    }).round(3)
    summary.columns = ['Churn_Rate', 'Total_Customers', 'Churned_Customers']
    return summary


def analyze_churn_patterns(df, feature_importance, n_top_features=5):
    """
    Analyze churn patterns using top features from the model

    For each of the top features this prints and plots the churn rate per
    feature value, then reports the highest-churn combinations of the top
    three features and the overall churn statistics.

    Parameters:
    df: DataFrame containing a 0/1 'Churn_Binary' column and a column for
        every feature named in `feature_importance`
    feature_importance: DataFrame with a 'Feature' column, ordered from most
        to least important
    n_top_features: Number of top features to analyze

    Returns:
    churn_analysis: dict mapping each analyzed feature to a DataFrame with
        'Churn_Rate', 'Total_Customers' and 'Churned_Customers' per feature
        value, sorted by descending churn rate
    high_risk_segments: DataFrame with the same columns for combinations of
        the top three features that have at least 20 customers, sorted by
        descending churn rate

    Raises:
    ValueError: if no features are available to analyze
    """
    # Select top features
    top_features = feature_importance.head(n_top_features)['Feature'].tolist()
    if not top_features:
        raise ValueError("No features to analyze: feature_importance is empty or n_top_features is 0")

    # Size the grid from the number of features (three plots per row) so any
    # n_top_features fits; squeeze=False keeps a 2-D array even for one row.
    n_cols = 3
    n_rows = (len(top_features) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    # Analyze each feature
    print("Churn Rate Analysis by Top Features:")
    churn_analysis = {}

    for ax, feature in zip(axes, top_features):
        # Churn rate and counts for each value of this feature
        analysis = _summarize_churn(df, feature)
        analysis = analysis.sort_values('Churn_Rate', ascending=False)
        churn_analysis[feature] = analysis

        # Print analysis
        print(f"\n{feature}:")
        print(analysis)

        # Bar height is the mean of Churn_Binary, i.e. the churn rate.
        # NOTE(review): newer seaborn releases deprecate `ci` in favour of
        # `errorbar=None`; kept as-is because the installed version is not
        # visible from this code.
        sns.barplot(
            data=df,
            x=feature,
            y='Churn_Binary',
            ax=ax,
            ci=None
        )
        ax.set_title(f'Churn Rate by {feature}')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)

    # Hide grid cells that have no feature to show
    for unused_ax in axes[len(top_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Analyze high-risk combinations
    print("\nAnalyzing High-Risk Customer Segments...")

    # Use top 3 features for combinations to keep it manageable
    top_3_features = top_features[:3]

    high_risk_segments = _summarize_churn(df, top_3_features)
    # Filter for segments with enough customers before ranking them
    high_risk_segments = high_risk_segments[high_risk_segments['Total_Customers'] >= 20]
    high_risk_segments = high_risk_segments.sort_values('Churn_Rate', ascending=False)

    print("\nTop 10 High-Risk Customer Segments (minimum 20 customers):")
    print(high_risk_segments.head(10))

    # Calculate overall statistics
    total_customers = len(df)
    total_churned = df['Churn_Binary'].sum()
    overall_churn_rate = total_churned / total_customers

    print("\nOverall Statistics:")
    print(f"Total Customers: {total_customers}")
    print(f"Total Churned Customers: {int(total_churned)}")
    print(f"Overall Churn Rate: {overall_churn_rate:.2%}")

    return churn_analysis, high_risk_segments

In [None]:
# Run the churn-pattern analysis once per trained model
for name, best_model, feature_importance_, results in training_results:
    # A model without feature importances yields None; there are no top
    # features to analyze for it, so skip it.
    if feature_importance_ is None:
        continue
    churn_analysis, high_risk_segments = analyze_churn_patterns(df, feature_importance_)
    


In [None]:
# Calculate potential revenue saved by reducing churn. These figures depend
# only on the data, not on the model, so they are computed once up front.
avg_customer_value = df['TotalCharges_Original'].mean()
current_churn_rate = df['Churn_Binary'].mean()
potential_churn_reduction = 0.1  # Assume we can reduce churn by 10%
# (customers * churn rate) churned customers, valued at the average customer
# value, of which the assumed fraction is retained.
# NOTE(review): this is built from TotalCharges_Original; whether that
# amount is a yearly figure is not established here - confirm the "Annual"
# wording in the message below.
potential_savings = avg_customer_value * len(df) * current_churn_rate * potential_churn_reduction

for name, best_model, feature_importance_, results in training_results:
    # A model without feature importances yields None; it has no top
    # features to segment on, so skip it.
    if feature_importance_ is None:
        continue

    # Identify top churning customer segments
    print(f"\n***{name} Model:")
    top_churn_features = feature_importance_.head(5)['Feature'].tolist()
    # Churn_Binary is the explicit 0/1 target, so its mean is the churn rate
    # regardless of how the 'Churn' column itself is encoded.
    churn_segments = df.groupby(top_churn_features)['Churn_Binary'].mean().sort_values(ascending=False).head(10)

    print("Top Churning Customer Segments:")
    print(churn_segments)

    print(f"\nAverage Customer Value: ${avg_customer_value:.2f}")
    print(f"Current Churn Rate: {current_churn_rate:.2f}")
    print(f"\nPotential Annual Revenue Saved: ${potential_savings:.2f}")
