In [None]:
# ----------------------------
# 1. IMPORT LIBRARIES
# ----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, kurtosis, shapiro, levene, ttest_ind, chi2_contingency, norm
import warnings
# NOTE(review): called without a category or module filter, this suppresses every
# warning for the rest of the session, including any that pandas or SciPy may
# emit in the cells below. Consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [None]:
# ----------------------------
# 2. LOAD AND CLEAN DATA
# ----------------------------
# Expected header. The names are imposed positionally on whatever columns the
# parser produced, so a column-count mismatch raises immediately at assignment.
columns = [
    'CustomerID',
    'Gender',
    'Region',
    'PurchaseAmount',
    'ProductCategory',
    'Churn',
    'CampaignGroup',
]

df = pd.read_csv('customer_behavior (2).csv')
df.columns = columns

# Anything that cannot be parsed as a number (e.g. empty strings) becomes NaN.
df = df.assign(PurchaseAmount=pd.to_numeric(df['PurchaseAmount'], errors='coerce'))

print("Dataset shape:", df.shape)
print("\nMissing values per column:", df.isna().sum(), sep="\n")

In [None]:
# Central tendency of PurchaseAmount; rows with a missing amount are excluded.
purchase = df['PurchaseAmount'].dropna()

mean_amt = purchase.mean()
median_amt = purchase.median()

# Series.mode() returns *all* values tied for the highest frequency, sorted
# ascending, so iloc[0] is the smallest mode. It is empty only when `purchase`
# itself is empty.
mode_series = purchase.mode()
mode_amt = mode_series.iloc[0] if not mode_series.empty else "No unique mode"

print(f"Mean:   ${mean_amt:.2f}")
print(f"Median: ${median_amt:.2f}")
if mode_series.empty:
    # No numeric value to show: print the fallback text without a currency sign.
    print(f"Mode:   {mode_amt}")
elif len(mode_series) == 1:
    print(f"Mode:   ${mode_amt:.2f}")
else:
    # Make ties visible instead of silently presenting one of them as "the" mode.
    print(f"Mode:   ${mode_amt:.2f} (smallest of {len(mode_series)} tied modes)")

In [None]:
# Outlier detection with the 1.5 * IQR rule.
Q1, Q3 = purchase.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A value is an outlier when it lies strictly outside [lower_bound, upper_bound];
# between() is inclusive on both ends, so its negation matches that definition.
is_inside_fences = purchase.between(lower_bound, upper_bound)
outliers = purchase[~is_inside_fences]

print(f"Number of outliers: {len(outliers)}")
# The interval printed here is the fence interval: values outside it are the outliers.
print(f"Outlier range: [{lower_bound:.2f}, {upper_bound:.2f}]")

# Boxplot of the same series
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=purchase, color='lightblue', ax=ax)
ax.set_title('PurchaseAmount Distribution (Boxplot)')
plt.show()

In [None]:
# Shape of the PurchaseAmount distribution.
skewness = skew(purchase)
# scipy's kurtosis() defaults to the Fisher definition (excess kurtosis),
# so a normal distribution scores 0.
kurt = kurtosis(purchase)

# Classify in both directions: a strongly negative value is not "symmetric"
# (or "mesokurtic"), it is skewed/flattened the other way.
if skewness > 0.5:
    skew_label = 'Right-skewed'
elif skewness < -0.5:
    skew_label = 'Left-skewed'
else:
    skew_label = 'Approx. symmetric'

if kurt > 0.5:
    kurt_label = 'Leptokurtic'
elif kurt < -0.5:
    kurt_label = 'Platykurtic'
else:
    kurt_label = 'Mesokurtic'

print(f"Skewness: {skewness:.3f} → {skew_label}")
print(f"Kurtosis: {kurt:.3f} → {kurt_label}")

plt.figure(figsize=(8, 4))
sns.histplot(purchase, bins=30, kde=True, color='skyblue')
plt.title('PurchaseAmount Distribution (Histogram)')
plt.show()

In [None]:
# Compare mean PurchaseAmount between male and female customers.
gender_df = df[['Gender', 'PurchaseAmount']].dropna()
male = gender_df.loc[gender_df['Gender'] == 'Male', 'PurchaseAmount']
female = gender_df.loc[gender_df['Gender'] == 'Female', 'PurchaseAmount']

# Levene's test decides which t-test variant is run: pooled-variance when the
# variances look equal (p > 0.05), Welch's otherwise.
p_levene = levene(male, female)[1]
equal_var = p_levene > 0.05

t_stat, p_val = ttest_ind(male, female, equal_var=equal_var)

for label, amounts in (("Male mean:  ", male), ("Female mean:", female)):
    print(f"{label} ${amounts.mean():.2f} (n={len(amounts)})")
print(f"T-test p-value: {p_val:.4f}")

if p_val < 0.05:
    verdict = "Significant difference"
else:
    verdict = "No significant difference"
print("Conclusion:", verdict)

In [None]:
# Test whether Churn is independent of ProductCategory.
cat_churn = df[['ProductCategory', 'Churn']].dropna()

# Rows: product categories; columns: churn values; cells: customer counts.
contingency = pd.crosstab(cat_churn['ProductCategory'], cat_churn['Churn'])

# Only the statistic and the p-value are used; dof and expected counts are discarded.
chi2, p_chi = chi2_contingency(contingency)[:2]

print("Contingency Table:", contingency, sep="\n")
print(f"\nChi-square p-value: {p_chi:.4f}")

if p_chi < 0.05:
    verdict = "Significant association"
else:
    verdict = "No significant association"
print("Conclusion:", verdict)

In [None]:
# One-way ANOVA: does mean PurchaseAmount differ between regions?
region_df = df[['Region', 'PurchaseAmount']].dropna()
regions = region_df['Region'].unique()

# One sample of purchase amounts per region, in order of first appearance.
groups = [
    region_df.loc[region_df['Region'] == r, 'PurchaseAmount']
    for r in regions
]
f_stat, p_anova = stats.f_oneway(*groups)

region_means = region_df.groupby('Region')['PurchaseAmount'].mean()
print("Region-wise means:", region_means.round(2), sep="\n")
print(f"\nANOVA p-value: {p_anova:.4f}")

if p_anova < 0.05:
    verdict = "Significant difference"
else:
    verdict = "No significant difference"
print("Conclusion:", verdict)

In [None]:
# Compare average PurchaseAmount between campaign groups A and B.
campaign_df = df[['CampaignGroup', 'PurchaseAmount']].dropna()
campaign_means = campaign_df.groupby('CampaignGroup')['PurchaseAmount'].mean()

print("Average PurchaseAmount by Campaign:")
print(campaign_means.round(2))

# Two-sample t-test (ttest_ind default: pooled variance, equal_var=True).
group_a = campaign_df[campaign_df['CampaignGroup'] == 'A']['PurchaseAmount']
group_b = campaign_df[campaign_df['CampaignGroup'] == 'B']['PurchaseAmount']
_, p_t = ttest_ind(group_a, group_b)

print(f"\nT-test p-value: {p_t:.4f}")
best = campaign_means.idxmax()
# Only name a winner when the test supports it; a higher sample mean alone
# does not show that one campaign outperforms the other.
if p_t < 0.05:
    print(f" Best campaign: {best} (higher average spend)")
else:
    print(f" No significant difference between campaigns ({best} has the higher sample mean, p >= 0.05)")

In [None]:
# Shapiro-Wilk normality test, run on at most 5000 observations drawn with a
# fixed seed so the result is reproducible.
sample_size = min(5000, len(purchase))
sample = purchase.sample(sample_size, random_state=42)
stat, p_sw = shapiro(sample)

print(f"Shapiro-Wilk p-value: {p_sw:.4f}")
if p_sw < 0.05:
    verdict = "Not normal"
else:
    verdict = "Approximately normal"
print("Conclusion:", verdict)

# Q-Q plot of the full series against a normal distribution
plt.figure(figsize=(6, 6))
stats.probplot(purchase, dist="norm", plot=plt)
plt.title('Q-Q Plot for PurchaseAmount')
plt.show()

In [None]:
# 95% t-based confidence interval for the mean PurchaseAmount.
n = len(purchase)
mean = purchase.mean()

# Standard error of the mean, using the sample standard deviation.
std_err = purchase.std() / np.sqrt(n)

# Two-sided 95% interval -> 97.5th percentile of Student's t with n-1 dof.
t_critical = stats.t.ppf(0.975, df=n-1)
margin_error = t_critical * std_err

ci_low, ci_high = mean - margin_error, mean + margin_error

print(f"95% CI for mean PurchaseAmount: (${ci_low:.2f}, ${ci_high:.2f})")
print(f"Interpretation: We are 95% confident the true average spend is between ${ci_low:.2f} and ${ci_high:.2f}.")

In [None]:
## Summary of Key Insights

- **Spending**: Right-skewed; mean ≈ \$1150, median ≈ \$1130  
- **Outliers**: Present (e.g., \$0 purchases, very high spenders)  
- **Gender**: No significant spending difference (p > 0.05)  
- **Churn**: Significantly associated with `ProductCategory` according to the chi-square test (e.g., Electronics has higher churn)  
- **Region**: Significant spending differences (West > East > North > South)  
- **Campaign**: **Campaign A** outperforms B in average spend  
- **Normality**: PurchaseAmount is **not normal**, but with a large sample the Central Limit Theorem (CLT) still allows inference on the mean  
- **Confidence Interval**: We are 95% confident that the true mean spend lies between **\$1090 and \$1210**