# Time-Based Feature Engineering for Early Warning Churn

## Objective
Engineer proxy early-warning features that capture customer behavior patterns
indicative of near-term churn risk.

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/telco_customer_churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df['tenure_risk_bucket'] = pd.cut(
    df['tenure'],
    bins=[-1, 6, 12, 24, np.inf],
    labels=['very_new', 'new', 'established', 'long_term']
)

df['tenure_risk_bucket'].value_counts()

tenure_risk_bucket
long_term      3833
very_new       1481
established    1024
new             705
Name: count, dtype: int64

In [4]:
df['is_month_to_month'] = (df['Contract'] == 'Month-to-month').astype(int)
df['is_month_to_month'].mean()

np.float64(0.5501916796819537)

In [5]:
df['high_monthly_charge'] = (
    df['MonthlyCharges'] > df['MonthlyCharges'].quantile(0.75)
).astype(int)

df['high_monthly_charge'].mean()

np.float64(0.2496095413886128)

In [6]:
service_cols = [
    'PhoneService',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies'
]

df['service_count'] = df[service_cols].apply(
    lambda row: sum(row == 'Yes'),
    axis=1
)

df['low_engagement'] = (df['service_count'] <= 2).astype(int)
df[['service_count', 'low_engagement']].head()

Unnamed: 0,service_count,low_engagement
0,1,1
1,3,0
2,3,0
3,3,0
4,1,1


In [7]:
df['churn_label'] = (df['Churn'] == 'Yes').astype(int)
df['churn_label'].mean()

np.float64(0.2653698707936959)

In [8]:
feature_cols = [
    'tenure',
    'is_month_to_month',
    'high_monthly_charge',
    'low_engagement',
    'MonthlyCharges',
    'service_count'
]

X = df[feature_cols]
y = df['churn_label']

X.head(), y.head()

(   tenure  is_month_to_month  high_monthly_charge  low_engagement  \
 0       1                  1                    0               1   
 1      34                  0                    0               0   
 2       2                  1                    0               0   
 3      45                  0                    0               0   
 4       2                  1                    0               1   
 
    MonthlyCharges  service_count  
 0           29.85              1  
 1           56.95              3  
 2           53.85              3  
 3           42.30              3  
 4           70.70              1  ,
 0    0
 1    0
 2    1
 3    0
 4    1
 Name: churn_label, dtype: int64)

## Feature Rationale

- Tenure & tenure bucket: Captures early lifecycle risk
- Month-to-month contract: Enables easy exit
- High monthly charges: Indicates price sensitivity
- Low engagement: Low switching cost and weak attachment
- Service count: Depth of relationship