### Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### Load Cleaned Data

In [3]:
df = pd.read_csv("/content/customer_segmentation_clean.csv")
df.head()

Unnamed: 0,minutes_watched,CLV,region,channel,is_free_user,is_zero_engagement
0,3197.0,205.42,2,4,0,0
1,63.0,149.99,0,1,0,0
2,605.0,119.0,2,2,0,0
3,20.0,240.0,2,4,0,0
4,245.0,184.36,1,4,0,0


In [4]:
print(f"Input Shape : {df.shape[0]} Rows, {df.shape[1]} Columns")

Input Shape : 3791 Rows, 6 Columns


In [5]:
print(f"Columns:\n {df.columns.to_list()}")

Columns:
 ['minutes_watched', 'CLV', 'region', 'channel', 'is_free_user', 'is_zero_engagement']


### LOG1P TRANSFORMATION

Both numerical features are right-skewed.\
KMeans uses Euclidean distance, scale and distribution shape matter.\
log1p (log(1+x)) is preferred over log(x) because it handles zero values gracefully without producing -inf.

In [6]:
df['log_minutes_watched'] = np.log1p(df['minutes_watched'])

In [7]:
df['log_clv'] = np.log1p(df['CLV'])

In [8]:
df.head()

Unnamed: 0,minutes_watched,CLV,region,channel,is_free_user,is_zero_engagement,log_minutes_watched,log_clv
0,3197.0,205.42,2,4,0,0,8.070281,5.329913
1,63.0,149.99,0,1,0,0,4.158883,5.017214
2,605.0,119.0,2,2,0,0,6.40688,4.787492
3,20.0,240.0,2,4,0,0,3.044522,5.484797
4,245.0,184.36,1,4,0,0,5.505332,5.2223


In [9]:
for raw, log in [('minutes_watched', 'log_minutes_watched'), ('CLV', 'log_clv')]:
    print(f"\n {raw}")
    print(f"Skew Before : {df[raw].skew():.2f}")
    print(f"Skew After : {df[log].skew():.2f}")
    print(f"Range Before : {df[raw].min()}, {df[raw].max()}")
    print(f"Range After : {df[log].min():.2f}, {df[log].max():.2f}")


 minutes_watched
Skew Before : 20.68
Skew After : -0.58
Range Before : 0.0, 288508.0
Range After : 0.00, 12.57

 CLV
Skew Before : 1.35
Skew After : -1.38
Range Before : 0.0, 626.4
Range After : 0.00, 6.44


### ORDINAL ENGAGEMENT TIER

0 – Dormant  : 0 minutes exactly \
1 – Low      : 1–100 min  (< ~1.7 hrs)   → sampler / trial \
2 – Medium   : 101–500 min (~2–8 hrs)     → casual learner \
3 – High     : 501–1500 min (~8–25 hrs)   → committed learner \
4 – Power    : >1500 min  (>25 hrs)       → power user, completionist

In [10]:
ENGAGEMENT_BINS = [-np.inf, 0, 100, 500, 1500, np.inf]
ENGAGEMENT_LABELS = [0, 1, 2, 3, 4]
ENGAGEMENT_NAMES = {0: 'Dormant', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Power'}

In [11]:
df['engagement_tier'] = pd.cut(
    df['minutes_watched'],
    bins=ENGAGEMENT_BINS,
    labels=ENGAGEMENT_LABELS
).astype(int)

In [12]:
tier_dist = df['engagement_tier'].value_counts().sort_index()
for tier, count in tier_dist.items():
    print(f"Tier {tier} - {ENGAGEMENT_NAMES[tier]:<8} : {count:>5} Students"
            f"({count/len(df) * 100:.2f}%)")

Tier 0 - Dormant  :    35 Students(0.92%)
Tier 1 - Low      :   641 Students(16.91%)
Tier 2 - Medium   :  1148 Students(30.28%)
Tier 3 - High     :  1023 Students(26.98%)
Tier 4 - Power    :   944 Students(24.90%)


### ORDINAL CLV TIER

Bins are anchored to the quartile distribution of CLV from the EDA: \
Q1 ≈ $62.58 →  Q2 (median) ≈ $119.00 →  Q3 ≈ $149.99

0 – Zero      : CLV = 0              → free / refunded

1 – Low       : 0 < CLV ≤ 62.58    → below Q1

2 – Mid-Low   : 62.58 < CLV ≤ 119  → Q1–Q2 (median band)

3 – Mid-High  : 119 < CLV ≤ 149.99 → Q2–Q3 band


4 – Premium   : CLV > 149.99         → above Q3

In [13]:
CLV_BINS = [-np.inf, 0, 62.58, 119.0, 149.99, np.inf]
CLV_LABELS = [0, 1, 2, 3, 4]
CLV_NAMES = {0: 'Zero', 1: 'Low', 2: 'Mid-Low', 3: 'Mid-High', 4: 'Premium'}

In [14]:
df['clv_tier'] = pd.cut(
    df['CLV'],
    bins=CLV_BINS,
    labels=CLV_LABELS
).astype(int)

In [15]:
clv_dist = df['clv_tier'].value_counts().sort_index()
for tier, count in clv_dist.items():
    print(f"Tier {tier} - {CLV_NAMES[tier]:<8} : {count:>5} Students"
            f"({count/len(df) * 100:.2f}%)")

Tier 0 - Zero     :    10 Students(0.26%)
Tier 1 - Low      :  1041 Students(27.46%)
Tier 2 - Mid-Low  :  1121 Students(29.57%)
Tier 3 - Mid-High :   673 Students(17.75%)
Tier 4 - Premium  :   946 Students(24.95%)


### ENGAGEMENT-VALUE RATIO (Descriptor Feature)

Captures the relationship between engagement and revenue. \
 Useful as a post-clustering descriptor and for flagging two archetypes: \
 HIGH ratio → highly engaged but low paying (upsell opportunity) \
 LOW ratio  → paid but barely watching      (churn risk)

In [16]:
# A small epsilon (1e-6) prevents division by zero for CLV=0 users.
df['engagement_value_ratio'] = df['log_minutes_watched'] / (df['log_clv'] + 1e-6)

In [17]:
# Cap extreme outliers at 99th percentile (CLV≈0 edge cases inflate ratio)
cap_99 = df['engagement_value_ratio'].quantile(0.99)
df['engagement_value_ratio'] = df['engagement_value_ratio'].clip(cap_99)
print(f"99th percentile cap applied : {cap_99:.2f}")
print(df['engagement_value_ratio'].describe().round(2))

99th percentile cap applied : 2.76
count       3791.00
mean       11962.71
std       282283.86
min            2.76
25%            2.76
50%            2.76
75%            2.76
max      9293393.93
Name: engagement_value_ratio, dtype: float64


### SCALING — StandardScaler on CLUSTERING FEATURES ONLY

CLUSTERING INPUTS: log_minutes, log_clv  →  scaled versions \
DESCRIPTOR FEATURES: region, channel, engagement_tier, clv_tier, engagement_value_ratio  →  NOT scaled / NOT clustered

StandardScaler chosen over MinMaxScaler because:
- More robust when distributions aren't perfectly bounded
- Consistent with K-Means assumptions (equal variance weighting)

In [18]:
CLUSTER_FEATURES = ['log_minutes_watched', 'log_clv']

In [19]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[CLUSTER_FEATURES])

In [20]:
df['log_minutes_watched_scaled'] = X_scaled[:, 0]
df['log_clv_scaled'] = X_scaled[:, 1]

In [21]:
print(f"Features scaled : {CLUSTER_FEATURES}")
print(f"Scaler means    : {scaler.mean_.round(4)}")
print(f"Scaler scales   : {scaler.scale_.round(4)}")
print(f"\n Post-scaling stats:")
print(df[['log_minutes_watched_scaled', 'log_clv_scaled']].describe().round(4))

Features scaled : ['log_minutes_watched', 'log_clv']
Scaler means    : [6.1064 4.5827]
Scaler scales   : [1.8687 0.7129]

 Post-scaling stats:
       log_minutes_watched_scaled  log_clv_scaled
count                   3791.0000       3791.0000
mean                       0.0000          0.0000
std                        1.0001          1.0001
min                       -3.2677         -6.4282
25%                       -0.5225         -0.6037
50%                        0.0763          0.2873
75%                        0.6440          0.6095
max                        3.4602          2.6075


In [23]:
# Sanity check
assert abs(df['log_minutes_watched_scaled'].mean()) < 1e-6, "Mean not ~0 after scaling!"
assert abs(df['log_clv_scaled'].std() - 1.0)  < 1e-3, "Std not ~1 after scaling!"
print("\n  ✓ Scaling sanity checks passed (mean≈0, std≈1)")


  ✓ Scaling sanity checks passed (mean≈0, std≈1)


### FINAL FEATURE MATRIX SUMMARY

In [24]:
FEATURE_REGISTRY = {
    'log_minutes_watched'           : ('Continuous', 'Clustering input',  'log1p(minutes_watched)'),
    'log_clv'               : ('Continuous', 'Clustering input',  'log1p(CLV)'),
    'log_minutes_watched_scaled'    : ('Scaled',     'Clustering input',  'StandardScaler(log_minutes)'),
    'log_clv_scaled'        : ('Scaled',     'Clustering input',  'StandardScaler(log_clv)'),
    'engagement_tier'       : ('Ordinal',    'Descriptor',        'Binned minutes_watched (0–4)'),
    'clv_tier'              : ('Ordinal',    'Descriptor',        'Binned CLV quartiles (0–4)'),
    'engagement_value_ratio': ('Continuous', 'Descriptor',        'log_minutes / log_clv'),
    'region'                : ('Categorical','Descriptor',        'Original — not encoded'),
    'channel'               : ('Categorical','Descriptor',        'Original — not encoded'),
}

print(f"\n  {'Feature':<26} {'Type':<12} {'Role':<18} {'Notes'}")
print(f"  {'-'*26} {'-'*12} {'-'*18} {'-'*35}")
for feat, (ftype, role, notes) in FEATURE_REGISTRY.items():
    print(f"  {feat:<26} {ftype:<12} {role:<18} {notes}")


  Feature                    Type         Role               Notes
  -------------------------- ------------ ------------------ -----------------------------------
  log_minutes_watched        Continuous   Clustering input   log1p(minutes_watched)
  log_clv                    Continuous   Clustering input   log1p(CLV)
  log_minutes_watched_scaled Scaled       Clustering input   StandardScaler(log_minutes)
  log_clv_scaled             Scaled       Clustering input   StandardScaler(log_clv)
  engagement_tier            Ordinal      Descriptor         Binned minutes_watched (0–4)
  clv_tier                   Ordinal      Descriptor         Binned CLV quartiles (0–4)
  engagement_value_ratio     Continuous   Descriptor         log_minutes / log_clv
  region                     Categorical  Descriptor         Original — not encoded
  channel                    Categorical  Descriptor         Original — not encoded


In [25]:
print(f"\n Final shape : {df.shape}")


 Final shape : (3791, 13)


### SAVE FEATURE MATRIX

In [26]:
df.to_csv("customer_segmentation_features.csv", index=False)

### FEATURE ENGINEERING DECISIONS SUMMARY

|  Feature | Type | Rationale |
| ---      | ---  | ---       |
| log_minutes | Continuous | log1p(minutes_watched): corrects extreme skew (20.68 → -0.58) |
| log_clv | Continuous | log1p(CLV): reduces tail influence (skew 1.35 → -1.38) |
| engagement_tier | Ordinal | Behavioural bucketing of watch time (0=Dormant → 4=Power) for post-cluster |
| clv_tier | Ordinal | Revenue bucketing on quartile breaks (0=Zero → 4=Premium) for post-cluster profiling & pricing decisions |
| engagement_value_ratio | Continuous | log_minutes / log_clv: separates high-engagement/low-pay vs low-engage/high-pay user archetypes (descriptor) |
| log_minutes_scaled | Scaled | StandardScaler on log_minutes: clustering input (zero mean, unit var)
| log_clv_scaled | Scaled | StandardScaler on log_clv: clustering input (zero mean, unit var)

**NOTE: region & channel** are DESCRIPTOR variables — used for post-clustering profiling ONLY. They are NOT inputs to the clustering algorithm.