# Customer Segmentation Analysis
Exploratory data analysis, clustering experiments, and persona drafting.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Load the customer transactions CSV; the relative path is resolved against
# the working directory the notebook is run from.
DATA_PATH = '../data/customer_transactions.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

## Data Overview

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

# Summary statistics for every column regardless of dtype; shown as the
# cell output.
overview = df.describe(include='all')
overview

## Visualize Distributions

In [None]:
# Histogram of customer ages with a kernel density estimate overlaid.
ax = sns.histplot(df['Age'], kde=True)
ax.set_title('Age Distribution')
plt.show()

## Preprocessing and Feature Engineering

In [None]:
# Columns fed to the clustering model, grouped by how they are encoded.
numeric_features = ['Age', 'PurchaseFreq', 'TotalSpend']
categorical_features = ['Region']

# Standardize the numeric columns so that no single feature dominates the
# distance computations; one-hot encode the categorical column.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    # OneHotEncoder produces a sparse matrix, and with the default threshold
    # (0.3) the stacked output becomes sparse as soon as Region has enough
    # distinct values. PCA in older scikit-learn releases rejects sparse
    # input, so always return a dense array.
    sparse_threshold=0,
)

# Columns of df that are not listed above are dropped (the ColumnTransformer
# default, remainder='drop').
X = preprocessor.fit_transform(df)

## KMeans Clustering with silhouette score evaluation

In [None]:
# Fit KMeans for each candidate cluster count and record its silhouette
# score as a (k, score) pair.
scores = []
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
    scores.append((k, silhouette_score(X, labels)))

# Show the collected (k, score) pairs as the cell output.
scores

## Select k and fit KMeans

In [None]:
# Pick the cluster count with the highest silhouette score. On ties, max()
# returns the first such entry in `scores`.
k_optimal, best_silhouette = max(scores, key=lambda pair: pair[1])

# Refit with the chosen k and attach each row's cluster label to df.
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
kmeans.fit(X)
df['Cluster'] = kmeans.labels_

## Dimensionality Reduction for Visualization

In [None]:
# Project the preprocessed features onto their first two principal
# components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
components = pca.fit_transform(X)
df['PCA1'], df['PCA2'] = components[:, 0], components[:, 1]

# Scatter the projected points, coloured by assigned cluster.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Cluster', palette='tab10')
ax.set_title('Customer Segments (PCA Reduced)')
plt.show()

## Cluster Summary & Persona Sketch
For each cluster, the summary below reports:
- Age range
- Average Purchase Frequency
- Average Total Spend
- Dominant Region

Use this information to draft personas manually or to automate persona generation in the app.

In [None]:
def _most_common(values):
    """Return the first modal value of *values*, or 'Unknown' if there is none."""
    modes = values.mode()
    # mode() can come back empty, e.g. for a group whose values are all
    # missing; fall back to a placeholder in that case.
    return 'Unknown' if modes.empty else modes[0]


# One row per cluster. Each output column is named directly through named
# aggregation: output name -> (source column, aggregation).
cluster_summary = (
    df.groupby('Cluster')
    .agg(**{
        'Age Min': ('Age', 'min'),
        'Age Max': ('Age', 'max'),
        'Age Mean': ('Age', 'mean'),
        'Avg Purchase Frequency': ('PurchaseFreq', 'mean'),
        'Avg Total Spend': ('TotalSpend', 'mean'),
        'Dominant Region': ('Region', _most_common),
    })
    .reset_index()
)

# Show the summary table as the cell output.
cluster_summary