<a href="https://colab.research.google.com/github/ArezooNajafi/Customer-Churn-in-Online-Retail/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas

In [None]:
import kagglehub

# Fetch the latest version of the churn dataset and keep the local path
# that kagglehub reports for the downloaded files.
DATASET_HANDLE = "hassaneskikri/online-retail-customer-churn-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

In [None]:
import os

# Inspect the download folder so the CSV file name can be confirmed before loading
files = list(os.listdir(path))
print(files)


In [None]:
import pandas as pd

# Build the CSV location from the download folder and the known file name
DATASET_FILENAME = "online_retail_customer_churn.csv"
csv_file_path = os.path.join(path, DATASET_FILENAME)

# Load the raw customer table
df = pd.read_csv(csv_file_path)

# Preview the first five rows (rendered as the cell output)
df.head()

In [None]:
# Print a schema overview of the loaded table (columns, dtypes, non-null counts)
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric
# columns that describe() leaves out by default.
df.describe(include='all')

In [None]:
# Derived value features.
# pandas turns division by zero into +/-inf, and a single inf poisons every later
# mean(). Zero denominators are therefore mapped to NaN, which aggregations skip.
purchases_nonzero = df['Num_of_Purchases'].replace(0, float('nan'))
years_nonzero = df['Years_as_Customer'].replace(0, float('nan'))

# Average purchase value: spend per purchase
df['APV'] = df['Total_Spend'] / purchases_nonzero
# Purchases per year of tenure
df['Purchase_Frequency'] = df['Num_of_Purchases'] / years_nonzero
# NOTE: APV * frequency * tenure cancels algebraically back to Total_Spend, so for
# rows with non-zero denominators CLV equals historical spend, not a forecast.
df['CLV'] = df['APV'] * df['Purchase_Frequency'] * df['Years_as_Customer']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math



In [None]:
# Normalise the churn flag to booleans (assumes the column holds 0/1 or True/False).
# astype(bool) avoids the dtype-downcasting path that replace() relies on.
df['Target_Churn'] = df['Target_Churn'].astype(bool)

# value_counts() orders classes by frequency, so the fixed labels and colors below
# would land on the wrong wedge whenever the classes are not in that order.
# Pin the order explicitly: churned first, retained second.
churn_counts = df['Target_Churn'].value_counts().reindex([True, False], fill_value=0)

# Colors follow the same fixed order: red for churn, green for not churn
colors = ['red', 'green']

# Plot the pie chart
plt.pie(
    churn_counts,
    labels=['Churn', 'Not Churn'],
    autopct='%1.1f%%',
    colors=colors
)
plt.title('Churn Distribution')
plt.show()


In [None]:
# Take the labels from the same Series as the wedge sizes. value_counts() is ordered
# by frequency while unique() is ordered by first appearance, so pairing the two
# can attach a label to the wrong wedge.
gender_counts = df['Gender'].value_counts()
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%')
plt.title('Gender Distribution')
plt.show()

In [None]:
# Take the labels from the same Series as the wedge sizes. value_counts() is ordered
# by frequency while unique() is ordered by first appearance, so pairing the two
# can attach a label to the wrong wedge.
promotion_counts = df['Promotion_Response'].value_counts()
plt.pie(promotion_counts, labels=promotion_counts.index, autopct='%1.1f%%')
plt.title('Promotion Response Distribution')
plt.show()

In [None]:
# One set of 5-year edges (10, 15, ..., 80) drives both the bins and the x ticks
age_bin_edges = range(10, 85, 5)

plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")

# Histogram with a KDE overlay, drawn on the axes seaborn creates in the figure above
age_ax = sns.histplot(
    df['Age'],
    bins=age_bin_edges,
    kde=True,
    color=sns.color_palette("coolwarm", 8)[3],
    edgecolor='black',
    alpha=0.8,
)

age_ax.set_title('Customer Age Distribution', fontsize=16, fontweight='bold')
age_ax.set_xlabel('Age', fontsize=12)
age_ax.set_ylabel('Customer Count', fontsize=12)
age_ax.set_xticks(age_bin_edges)
age_ax.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="whitegrid")

# Left-closed 5-year age bands: [10, 15), [15, 20), ..., [75, 80).
# bins_5 / labels_5 and the Age_Group_5 column are reused by later cells.
bins_5 = range(10, 81, 5)
labels_5 = [f"{lower}-{lower + 4}" for lower in bins_5[:-1]]
df['Age_Group_5'] = pd.cut(df['Age'], bins=bins_5, labels=labels_5, right=False)

# Total spend per age band
age_group_data = (
    df.groupby('Age_Group_5', observed=False)['Total_Spend']
    .sum()
    .reset_index()
)

# Re-declare the band column as an ordered categorical so sorting follows
# labels_5 rather than the spend values
age_group_data['Age_Group_5'] = pd.Categorical(
    age_group_data['Age_Group_5'], categories=labels_5, ordered=True
)
age_group_data = age_group_data.sort_values('Age_Group_5')

plt.figure(figsize=(12, 6))
barplot = sns.barplot(
    x='Age_Group_5',
    y='Total_Spend',
    data=age_group_data,
    palette='mako',
)

# Dollar label above each bar, using the row index as the x position
for position, spend in age_group_data['Total_Spend'].items():
    barplot.text(
        position,
        spend + 100,
        f"${spend:,.0f}",
        ha='center',
        fontsize=10,
        weight='bold',
    )

barplot.set_title("Total Spend by Age Groups", fontsize=16, weight='bold')
barplot.set_xlabel("Age Group", fontsize=12)
barplot.set_ylabel("Total Spend ($)", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:


# Total spend for every (age band, gender) combination
age_gender_group_data = (
    df.groupby(['Age_Group_5', 'Gender'], observed=False)['Total_Spend']
    .sum()
    .reset_index()
)

# Ordered categorical keeps the bands in labels_5 order when sorting
age_gender_group_data['Age_Group_5'] = pd.Categorical(
    age_gender_group_data['Age_Group_5'], categories=labels_5, ordered=True
)
age_gender_group_data = age_gender_group_data.sort_values('Age_Group_5')

# Grouped bars: one cluster per age band, one bar per gender
plt.figure(figsize=(12, 6))
barplot = sns.barplot(
    x='Age_Group_5',
    y='Total_Spend',
    hue='Gender',
    data=age_gender_group_data,
    palette='mako',
)

barplot.set_title("Total Spend by Age Groups with Gender Labels", fontsize=16, weight='bold')
barplot.set_xlabel("Age Group", fontsize=12)
barplot.set_ylabel("Total Spend ($)", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Set better visual theme
sns.set_theme(style="whitegrid")

# Total spend per gender, largest first.
# reset_index(drop=True) renumbers the rows 0..n-1 in sorted order. Without it the
# rows keep their pre-sort index, and the value labels below (placed at x=index)
# end up above the wrong bars, because the bars are drawn in row order.
gender_spend = (
    df.groupby('Gender', observed=False)['Total_Spend']
    .sum()
    .reset_index()
    .sort_values('Total_Spend', ascending=False)
    .reset_index(drop=True)
)

# Purple-blue gradient, one color per bar
custom_colors = ["#5E60CE", "#5390D9", "#4EA8DE"]

# Create the plot
plt.figure(figsize=(8, 6))
barplot = sns.barplot(
    data=gender_spend,
    x='Gender',
    y='Total_Spend',
    palette=custom_colors
)

# Add labels on top of bars; index now equals the bar position
for index, row in gender_spend.iterrows():
    barplot.text(index, row.Total_Spend + 50000, f"${row.Total_Spend:,.0f}",
                 ha='center', fontsize=10, weight='bold')

# Formatting
plt.title("Total Spend by Gender", fontsize=16, weight='bold')
plt.xlabel("Gender", fontsize=12)
plt.ylabel("Total Spend ($)", fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Returns per purchase for each customer. The small epsilon keeps the ratio finite
# when a customer has zero purchases.
df['Return_Rate'] = df['Num_of_Returns'] / (df['Num_of_Purchases'] + 1e-5)

plt.figure(figsize=(7, 5))
# errorbar=None is the supported replacement for the deprecated ci=None
sns.barplot(x='Gender', y='Return_Rate', data=df, errorbar=None, palette='pastel')
plt.title('Average Return Rate by Gender')
plt.ylabel('Return Rate')
plt.show()

In [None]:
# One set of edges (20, 40, ..., 200) drives both the bins and the x ticks
income_bin_edges = range(20, 220, 20)

plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")

# Histogram with a KDE overlay; the bar color is sampled from the viridis colormap at 0.6
income_ax = sns.histplot(
    df['Annual_Income'],
    bins=income_bin_edges,
    kde=True,
    color=sns.color_palette("viridis", as_cmap=True)(0.6),
    edgecolor='black',
)

income_ax.set_title('Annual Income Distribution', fontsize=16)
income_ax.set_xlabel('Annual Income ($K)', fontsize=12)
income_ax.set_ylabel('Customer Count', fontsize=12)
income_ax.set_xticks(income_bin_edges)
income_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:


# Mean purchases per year for every (age band, gender) combination
age_gender_group_data = df.groupby(['Age_Group_5', 'Gender'], observed=False)['Purchase_Frequency'].mean().reset_index()

# Create ordered categorical type to preserve label order
age_gender_group_data['Age_Group_5'] = pd.Categorical(
    age_gender_group_data['Age_Group_5'], categories=labels_5, ordered=True
)

sns.set_theme(style="white")

# Sort by age group order (not value)
age_gender_group_data = age_gender_group_data.sort_values('Age_Group_5')

# Plot
plt.figure(figsize=(12, 6))
barplot = sns.barplot(
    data=age_gender_group_data,
    x='Age_Group_5',
    y='Purchase_Frequency',
    hue='Gender',  # Differentiate by gender
    palette='mako'
)


# Formatting
plt.title("Avg. Purchases per Year (Loyalty) Age Groups with Gender Labels", fontsize=16, weight='bold')
plt.xlabel("Age Group", fontsize=12)
# The y axis shows Purchase_Frequency (purchases per year), not spend; the previous
# "Total Spend ($)" label was left over from the spend chart.
plt.ylabel("Avg. Purchases per Year", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Average tenure (Years_as_Customer) side by side: by age band and by gender.
# Requires the Age_Group_5 column created in the age-band cell.

# Apply the style BEFORE creating the axes: style settings only affect axes created
# afterwards, so setting it after plt.subplots() leaves these panels on the old style.
sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))  # 1 row, 2 columns

# Plot 1: Average Years as Customer by Age Group
# errorbar=None is the supported replacement for the deprecated ci=None
ax1 = sns.barplot(x='Age_Group_5', y='Years_as_Customer', data=df, estimator='mean',
                  errorbar=None, palette='Blues', ax=axes[0])
axes[0].set_title('Average Years as Customer by Age Group', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Age Group', fontsize=12)
axes[0].set_ylabel('Average Years as Customer', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)  # Rotate x-axis labels for readability

# Plot 2: Average Years as Customer by Gender
ax2 = sns.barplot(x='Gender', y='Years_as_Customer', data=df, estimator='mean',
                  errorbar=None, palette='coolwarm', ax=axes[1])
axes[1].set_title('Average Years as Customer by Gender', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Gender', fontsize=12)
axes[1].set_ylabel('Average Years as Customer', fontsize=12)

# Add the bar height as a label on every bar of both panels
for panel in (ax1, ax2):
    for p in panel.patches:
        panel.annotate(f'{p.get_height():.2f}',
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       fontsize=12, color='black',
                       xytext=(0, 5), textcoords='offset points')

# Adjust layout for better spacing
plt.tight_layout()

# Show plot
plt.show()

In [None]:
# Average return rate side by side: by age band and by gender.
# Return_Rate is recomputed here so the cell does not depend on the earlier
# return-rate cell; Age_Group_5 must already exist.
df['Return_Rate'] = df['Num_of_Returns'] / (df['Num_of_Purchases'] + 1e-5)

# Apply the style BEFORE creating the axes: style settings only affect axes created
# afterwards.
sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Return Rate by Age Group
# errorbar=None is the supported replacement for the deprecated ci=None
ax1 = sns.barplot(x='Age_Group_5', y='Return_Rate', data=df,
                  estimator='mean', errorbar=None,
                  palette=sns.color_palette("crest", as_cmap=False), ax=axes[0])
axes[0].set_title('Average Return Rate by Age Group', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Age Group', fontsize=12)
axes[0].set_ylabel('Avg Return Rate', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)

# Plot 2: Return Rate by Gender
ax2 = sns.barplot(x='Gender', y='Return_Rate', data=df,
                  estimator='mean', errorbar=None,
                  palette=sns.color_palette("flare", as_cmap=False), ax=axes[1])
axes[1].set_title('Average Return Rate by Gender', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Gender', fontsize=12)
axes[1].set_ylabel('Avg Return Rate', fontsize=12)

# Add the bar height as a label above every bar of both panels
for panel in (ax1, ax2):
    for p in panel.patches:
        panel.annotate(f'{p.get_height():.2f}',
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='bottom',
                       fontsize=11, color='black',
                       xytext=(0, 5), textcoords='offset points')

# Final layout tweaks
plt.tight_layout()
plt.show()


In [None]:
# Pool returns and purchases within each (age band, gender) cell
grouped = (
    df.groupby(['Age_Group_5', 'Gender'], observed=False)[['Num_of_Returns', 'Num_of_Purchases']]
    .sum()
    .reset_index()
)

# Pooled return rate; the epsilon keeps empty cells (zero purchases) finite
grouped['Return_Rate'] = grouped['Num_of_Returns'].div(grouped['Num_of_Purchases'] + 1e-5)

# Apply the labels_5 ordering when that list is available in this namespace
if 'labels_5' in locals():
    grouped['Age_Group_5'] = pd.Categorical(grouped['Age_Group_5'], categories=labels_5, ordered=True)

grouped = grouped.sort_values('Age_Group_5')

# Grouped bars: one cluster per age band, one bar per gender
plt.figure(figsize=(12, 6))
sns.set_theme(style="whitegrid")
ax = sns.barplot(
    x='Age_Group_5',
    y='Return_Rate',
    hue='Gender',
    data=grouped,
    palette='viridis',
)

ax.set_title("Return Rate by Age Group and Gender", fontsize=16, weight='bold')
ax.set_xlabel("Age Group", fontsize=12)
ax.set_ylabel("Return Rate", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Customer counts per age band, split by promotion response
plt.figure(figsize=(10, 6))
sns.set_theme(style="whitegrid")

ax = sns.countplot(x='Age_Group_5', hue='Promotion_Response', data=df, palette='pastel')

ax.set_title('Promotion Response Counts by Age Group', fontsize=16, fontweight='bold')
ax.set_xlabel('Age Group', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Customer counts per gender, split by promotion response
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

ax2 = sns.countplot(data=df, x='Gender', hue='Promotion_Response', palette='muted')

plt.title('Promotion Response Counts by Gender', fontsize=16, fontweight='bold')
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Label the bars through their containers rather than ax.patches. Containers hold
# only the drawn bars, whereas ax.patches may also include non-bar rectangles
# (e.g. legend helper artists). fmt='%d' prints counts as integers ("250") instead
# of the float repr ("250.0").
for container in ax2.containers:
    ax2.bar_label(container, fmt='%d', fontsize=11, padding=3)

plt.tight_layout()
plt.show()

In [None]:
# Churn rate by number of returns, bucketed into 0 / 1-2 / 3+.

# Step 1: Create bins for number of returns.
# The top edge is open-ended. Using the observed maximum instead would make the
# edges non-increasing (and pd.cut fail) whenever no customer has more than 2 returns.
bins = [-1, 0, 2, float('inf')]  # -1 so 0 is included in first bin
labels = ['0 Returns', '1-2 Returns', '3+ Returns']

df['Returns_Bin'] = pd.cut(df['Num_of_Returns'], bins=bins, labels=labels)

# Step 2: Calculate churn rate by returns bin.
# observed=False is passed explicitly, matching the other categorical groupbys in
# this notebook and keeping empty bins on the axis.
churn_by_returns_bin = df.groupby('Returns_Bin', observed=False)['Target_Churn'].mean().reset_index()
churn_by_returns_bin['Churn_Rate'] = churn_by_returns_bin['Target_Churn'] * 100  # to percent

# Step 3: Plot churn rate by returns bin
plt.figure(figsize=(10,6))
sns.set_theme(style="whitegrid")

ax = sns.barplot(data=churn_by_returns_bin, x='Returns_Bin', y='Churn_Rate', palette='viridis')

plt.title('Customer Churn Rate by Number of Returns (Binned)', fontsize=16, fontweight='bold')
plt.xlabel('Number of Returns', fontsize=12)
plt.ylabel('Churn Rate (%)', fontsize=12)

# Add data labels on top of bars
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.1f}%', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=11, xytext=(0, 3), textcoords='offset points')

plt.tight_layout()
plt.show()

In [None]:
# Mean tenure (Years_as_Customer) for every (age band, gender) combination
age_gender_group_data = (
    df.groupby(['Age_Group_5', 'Gender'], observed=False)['Years_as_Customer']
    .mean()
    .reset_index()
)

# Ordered categorical keeps the bands in labels_5 order when sorting
age_gender_group_data['Age_Group_5'] = pd.Categorical(
    age_gender_group_data['Age_Group_5'], categories=labels_5, ordered=True
)

sns.set_theme(style="white")

age_gender_group_data = age_gender_group_data.sort_values('Age_Group_5')

# Grouped bars: one cluster per age band, one bar per gender
plt.figure(figsize=(12, 6))
barplot = sns.barplot(
    x='Age_Group_5',
    y='Years_as_Customer',
    hue='Gender',
    data=age_gender_group_data,
    palette='mako',
)

barplot.set_title("Average Years as Customer by Age Groups with Gender Labels", fontsize=16, weight='bold')
barplot.set_xlabel("Age Group", fontsize=12)
barplot.set_ylabel("Years_as_Customer", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:


# Mean annual income per gender
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")

# Bar plot of mean income per gender with no error bars.
# errorbar=None is the supported replacement for the deprecated ci=None.
ax = sns.barplot(x='Gender', y='Annual_Income', data=df, estimator='mean', errorbar=None, palette='coolwarm')

# Add data labels on top of the bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                fontsize=12, color='black',
                xytext=(0, 5), textcoords='offset points')

# Enhancements
plt.title('Average Annual Income by Gender', fontsize=14, fontweight='bold')
plt.xlabel('Gender', fontsize=12)
# Unit aligned with the income histogram ("Annual Income ($K)") and the "30-60k"
# income bands used elsewhere in this notebook -- TODO confirm the dataset's unit.
plt.ylabel('Average Income ($K)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
# Mean satisfaction score per age band and per gender
satisfaction_age = (
    df.groupby('Age_Group_5', observed=False)['Satisfaction_Score']
    .mean()
    .reset_index()
)
satisfaction_gender = df.groupby('Gender')['Satisfaction_Score'].mean().reset_index()

# Two panels side by side
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
age_panel, gender_panel = axes

# Left panel: by age band
sns.barplot(x='Age_Group_5', y='Satisfaction_Score', data=satisfaction_age, palette='viridis', ax=age_panel)
age_panel.set_title("Avg. Satisfaction Score by Age Group", fontsize=14, weight='bold')
age_panel.set_xlabel("Age Group")
age_panel.set_ylabel("Satisfaction Score")
age_panel.tick_params(axis='x', rotation=30)

# Right panel: by gender
sns.barplot(x='Gender', y='Satisfaction_Score', data=satisfaction_gender, palette='pastel', ax=gender_panel)
gender_panel.set_title("Avg. Satisfaction Score by Gender", fontsize=14, weight='bold')
gender_panel.set_xlabel("Gender")
gender_panel.set_ylabel("Satisfaction Score")

# Write each bar's height just above it, on both panels
for panel in axes:
    for bar in panel.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        panel.text(bar_center, bar_top + 0.1, f"{bar_top:.1f}", ha='center', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Purchases per year of tenure (same ratio as the Purchase_Frequency column)
df['Purchases_per_Year'] = df['Num_of_Purchases'].div(df['Years_as_Customer'])

In [None]:
# Cast the churn flag to 0/1 so group means read directly as churn fractions
df['Target_Churn'] = df['Target_Churn'].astype(int)

# Churn fraction per age band, plus the same figure as a percentage
age_churn = (
    df.groupby('Age_Group_5', observed=False)['Target_Churn']
    .mean()
    .reset_index()
)
age_churn['Churn_Rate'] = age_churn['Target_Churn'].mul(100)

# Churn fraction per gender, plus the same figure as a percentage
gender_churn = df.groupby('Gender')['Target_Churn'].mean().reset_index()
gender_churn['Churn_Rate'] = gender_churn['Target_Churn'].mul(100)

# Two panels side by side
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
age_panel, gender_panel = axes

# Left panel: by age band
sns.barplot(x='Age_Group_5', y='Churn_Rate', data=age_churn, palette='coolwarm', ax=age_panel)
age_panel.set_title("Churn Rate by Age Group", fontsize=14, weight='bold')
age_panel.set_xlabel("Age Group")
age_panel.set_ylabel("Churn Rate (%)")
age_panel.tick_params(axis='x', rotation=30)

# Right panel: by gender
sns.barplot(x='Gender', y='Churn_Rate', data=gender_churn, palette='Set2', ax=gender_panel)
gender_panel.set_title("Churn Rate by Gender", fontsize=14, weight='bold')
gender_panel.set_xlabel("Gender")
gender_panel.set_ylabel("Churn Rate (%)")

# Write each bar's percentage just above it, on both panels
for panel in axes:
    for bar in panel.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        panel.text(bar_center, bar_top + 1, f"{bar_top:.1f}%", ha='center', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Churn percentage for every (age band, gender) combination
age_gender_churn = (
    df.groupby(['Age_Group_5', 'Gender'], observed=False)['Target_Churn']
    .mean()
    .reset_index()
)
age_gender_churn['Churn_Rate'] = age_gender_churn['Target_Churn'].mul(100)

# Fixed color per gender value (greenish / orangish / purplish)
gender_palette = dict(Female='#86b5a9', Male='#da9167', Other='#949dc1')

sns.set_theme(style="whitegrid")
plt.figure(figsize=(14, 6))

# Grouped bars: one cluster per age band, one bar per gender
churn_ax = sns.barplot(
    x='Age_Group_5',
    y='Churn_Rate',
    hue='Gender',
    data=age_gender_churn,
    palette=gender_palette,
)

churn_ax.set_title("Churn Rate by Age Group and Gender", fontsize=14, weight='bold')
churn_ax.set_xlabel("Age Group")
churn_ax.set_ylabel("Churn Rate (%)")
plt.xticks(rotation=30)

# Percentage label at the edge of every bar
for bar_group in churn_ax.containers:
    churn_ax.bar_label(bar_group, fmt='%.1f%%', label_type='edge', fontsize=9)

churn_ax.legend(title="Gender")
plt.tight_layout()
plt.show()

In [None]:
# --- Step 1: Mean satisfaction score per churn status ---
satisfaction_churn = df.groupby('Target_Churn')['Satisfaction_Score'].mean().reset_index()
# Cast to int before mapping so the labels resolve whether Target_Churn currently
# holds booleans or 0/1 integers (i.e. regardless of which earlier cells have run).
satisfaction_churn['Churn_Label'] = (
    satisfaction_churn['Target_Churn'].astype(int).map({0: 'No Churn', 1: 'Churn'})
)

# Define manual colors: green for No Churn, red for Churn.
# The order relies on groupby returning the 0 / False group first.
custom_palette = ['green', 'red']  # Order: [No Churn, Churn]

# --- Step 2: Plotting ---
sns.set_theme(style="whitegrid")
plt.figure(figsize=(8, 6))

ax = sns.barplot(data=satisfaction_churn, x='Churn_Label', y='Satisfaction_Score', palette=custom_palette)
# No emoji in the title: matplotlib's default font has no glyph for it, so it
# renders as an empty box and emits a "Glyph missing from current font" warning.
plt.title("Avg. Satisfaction Score by Churn Status", fontsize=14, weight='bold')
plt.xlabel("Churn Status")
plt.ylabel("Avg. Satisfaction Score")

# Add data labels
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height + 0.1, f"{height:.1f}", ha='center', fontsize=11)

plt.tight_layout()
plt.show()

In [None]:
# Four left-closed income bands; the top edge sits just above the observed maximum
# so the largest income still falls inside the last band
income_labels = ['Low', 'Medium', 'High', 'Very High']
income_bins = [0, 40, 80, 120, df['Annual_Income'].max() + 1]
df['Income_Group'] = pd.cut(df['Annual_Income'], bins=income_bins, labels=income_labels, right=False)

# Mean satisfaction score per income band
satisfaction_by_income = (
    df.groupby('Income_Group', observed=False)['Satisfaction_Score']
    .mean()
    .reset_index()
)

# Churn fraction per income band, plus the same figure as a percentage
churn_by_income = (
    df.groupby('Income_Group', observed=False)['Target_Churn']
    .mean()
    .reset_index()
)
churn_by_income['Churn_Rate'] = churn_by_income['Target_Churn'].mul(100)

# Two panels side by side
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
satisfaction_panel, churn_panel = axes

# Left panel: satisfaction score
sns.barplot(x='Income_Group', y='Satisfaction_Score', data=satisfaction_by_income, palette='Blues', ax=satisfaction_panel)
satisfaction_panel.set_title("Avg. Satisfaction Score by Income Group", fontsize=14, weight='bold')
satisfaction_panel.set_ylabel("Avg. Satisfaction Score")
satisfaction_panel.set_xlabel("Income Group")

for bar in satisfaction_panel.patches:
    bar_top = bar.get_height()
    satisfaction_panel.text(bar.get_x() + bar.get_width()/2, bar_top + 0.1, f"{bar_top:.1f}", ha='center')

# Right panel: churn rate
sns.barplot(x='Income_Group', y='Churn_Rate', data=churn_by_income, palette='Reds', ax=churn_panel)
churn_panel.set_title("Churn Rate by Income Group", fontsize=14, weight='bold')
churn_panel.set_ylabel("Churn Rate (%)")
churn_panel.set_xlabel("Income Group")

for bar in churn_panel.patches:
    bar_top = bar.get_height()
    churn_panel.text(bar.get_x() + bar.get_width()/2, bar_top + 0.5, f"{bar_top:.1f}%", ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Group by Income Group and Gender, calculate mean Total Spend
income_spend = df.groupby(['Income_Group', 'Gender'], observed=False)['Total_Spend'].mean().reset_index()

# Set up the plot
plt.figure(figsize=(12, 6))
sns.barplot(data=income_spend, x='Income_Group', y='Total_Spend', hue='Gender', palette='viridis')

# Add labels on each bar.
# A %-style format is used, as in the churn chart's bar_label call: older matplotlib
# releases only accept %-style here, and newer ones accept both, so the rendered
# text is the same.
for container in plt.gca().containers:
    plt.bar_label(container, fmt='$%.2f', fontsize=10)

# Customize the plot
plt.title("Average Total Spend by Income Group and Gender", fontsize=14, weight='bold')
plt.xlabel("Income Group", fontsize=12)
plt.ylabel("Average Total Spend ($)", fontsize=12)
plt.tight_layout()

# Show plot
plt.show()


In [None]:
# Readable labels for the opt-in flag
OPT_IN_LABELS = {True: 'Opted In', False: 'Not Opted In'}
df['Email_Opt_In_Label'] = df['Email_Opt_In'].replace(OPT_IN_LABELS)

# Churn fraction per opt-in status, plus the same figure as a percentage
email_churn = (
    df.groupby('Email_Opt_In_Label', observed=False)['Target_Churn']
    .mean()
    .reset_index()
)
email_churn['Churn_Rate'] = email_churn['Target_Churn'].mul(100)

plt.figure(figsize=(6, 5))
ax = sns.barplot(x='Email_Opt_In_Label', y='Churn_Rate', data=email_churn, palette='pastel')
ax.set_title("Churn Rate by Email Opt-In", fontsize=14, weight='bold')
ax.set_ylabel("Churn Rate (%)")
ax.set_xlabel("Email Opt-In Status")

# Percentage label anchored at the top-center of every bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_top:.1f}%', (bar_center, bar_top),
                ha='center', va='bottom', fontsize=10, color='black')

plt.tight_layout()
plt.show()


In [None]:
# Create income bins.
# NOTE: this overwrites the Income_Group column with a finer 5-band scheme than the
# 4-band one built in the earlier income cell; re-run that cell to restore it.
# - right=False makes the bands left-closed ([0, 30), [30, 60), ...), so they match
#   their labels: "<30k" no longer includes exactly 30, and an income of 0 is kept.
# - The open-ended top edge keeps incomes above 200 inside "120k+" instead of
#   silently dropping them as NaN.
df['Income_Group'] = pd.cut(df['Annual_Income'], bins=[0, 30, 60, 90, 120, float('inf')],
                            labels=['<30k', '30-60k', '60-90k', '90-120k', '120k+'],
                            right=False)

# Churn rate by income group
income_churn = df.groupby('Income_Group', observed=False)['Target_Churn'].mean().reset_index()
income_churn['Churn_Rate'] = income_churn['Target_Churn'] * 100

# Plot
plt.figure(figsize=(10, 5))
ax = sns.barplot(data=income_churn, x='Income_Group', y='Churn_Rate', palette='viridis')
plt.title("Churn Rate by Income Group", fontsize=14, weight='bold')
plt.ylabel("Churn Rate (%)")
plt.xlabel("Income Group")

# Add labels manually
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.1f}%',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=10, color='black')

plt.tight_layout()
plt.show()


In [None]:
# Group years of customer into bins.
# - include_lowest=True closes the first band on the left so a tenure of 0 falls
#   into "0-3" instead of becoming NaN.
# - The open-ended top edge keeps tenures above 20 inside "16+".
df['Years_Group'] = pd.cut(
    df['Years_as_Customer'],
    bins=[0, 3, 6, 10, 15, float('inf')],
    labels=['0-3', '4-6', '7-10', '11-15', '16+'],
    include_lowest=True,
)

# Calculate churn rate by years group
years_churn = df.groupby('Years_Group', observed=False)['Target_Churn'].mean().reset_index()
years_churn['Churn_Rate'] = years_churn['Target_Churn'] * 100

# Plotting
plt.figure(figsize=(10, 5))
ax = sns.barplot(data=years_churn, x='Years_Group', y='Churn_Rate', palette='mako')
plt.title("Churn Rate by Years as Customer", fontsize=14, weight='bold')
plt.ylabel("Churn Rate (%)")
plt.xlabel("Years as Customer")

# Add value labels on bars
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.1f}%',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=10, color='black')

plt.tight_layout()
plt.show()
