In [2]:
import pandas as pd
import numpy as np

# Load the pre-cleaned Telco churn table and fix NumPy's global seed so that
# the np.random draws made after this point are repeatable on a fresh run.
# NOTE(review): the path is absolute and machine-specific — consider making
# the data directory configurable.
RAW_DATA_PATH = "F:/subscription-churn-system/dataset/cleaned_telco_churn.csv"
RANDOM_SEED = 42

df = pd.read_csv(RAW_DATA_PATH)
np.random.seed(RANDOM_SEED)

# Summary statistics of the numeric columns (shown as the cell output).
# NOTE(review): in the recorded output TotalCharges has count 7032 while the
# other columns have 7043, so that column still carries missing values.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7032.0,7043.0
mean,0.162147,32.371149,64.761692,2283.300441,0.26537
std,0.368612,24.559481,30.090047,2266.771362,0.441561
min,0.0,0.0,18.25,18.8,0.0
25%,0.0,9.0,35.5,401.45,0.0
50%,0.0,29.0,70.35,1397.475,0.0
75%,0.0,55.0,89.85,3794.7375,1.0
max,1.0,72.0,118.75,8684.8,1.0


In [3]:
# Average weekly usage (synthetic).
# Each row gets a normal draw whose mean is 5 plus tenure/12 (std 2); the
# draw is then floored at zero so no negative session count is stored.
session_mean = 5 + df["tenure"] / 12
session_draws = np.random.normal(loc=session_mean, scale=2)
df["avg_weekly_sessions"] = session_draws.clip(min=0)

In [4]:
# Usage trend (synthetic) — a drop-off is the dangerous signal.
# The mean of the normal draw is conditioned on the label: -0.3 where
# Churn == 1, +0.1 otherwise, with std 0.2 in both cases.
# A negative trend means a disengaging user.
# NOTE(review): because this column is generated from the target itself, it
# carries label information by construction.
is_churner = df["Churn"] == 1
trend_mean = np.where(is_churner, -0.3, 0.1)
df["usage_trend_30d"] = np.random.normal(loc=trend_mean, scale=0.2)

In [5]:
# Support tickets (synthetic).
# Poisson counts with rate 2 where Churn == 1 and rate 0.5 otherwise.
# NOTE(review): the rate is chosen from the target, so this column is
# label-derived by construction.
ticket_rate = np.where(df["Churn"] == 1, 2, 0.5)
df["support_tickets_90d"] = np.random.poisson(lam=ticket_rate)

In [6]:
# Payment failures (synthetic).
# Number of failures out of 3 binomial trials; the per-trial failure
# probability is 0.4 where Churn == 1 and 0.1 otherwise.
# NOTE(review): the probability is chosen from the target, so this column is
# label-derived by construction.
failure_prob = np.where(df["Churn"] == 1, 0.4, 0.1)
df["payment_failures"] = np.random.binomial(n=3, p=failure_prob)

In [7]:
# Engagement score (composite feature).
# Weighted sum of three terms:
#   0.4 * weekly sessions
#   0.3 * (1 + usage trend)            -> a negative trend lowers the score
#   0.3 * 1 / (1 + support tickets)    -> more tickets lower the score
sessions_term = 0.4 * df["avg_weekly_sessions"]
trend_term = 0.3 * (1 + df["usage_trend_30d"])
tickets_term = 0.3 * (1 / (1 + df["support_tickets_90d"]))

df["engagement_score"] = sessions_term + trend_term + tickets_term

In [8]:
# Expose the label as `churn_30d` and remove the original `Churn` column,
# which was only needed above to simulate the behavioural features.
# The new column is appended last, so the target ends up as the final column.
# NOTE(review): the values are a straight copy of Churn; nothing here
# restricts them to a 30-day window.
df = df.assign(churn_30d=df["Churn"]).drop(columns=["Churn"])

In [9]:
# Preview the first five rows, including the synthetic features and the label.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,avg_weekly_sessions,usage_trend_30d,support_tickets_90d,payment_failures,engagement_score,churn_30d
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,Yes,Electronic check,29.85,29.85,6.076762,-0.204467,0,0,2.969365,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,Mailed check,56.95,1889.5,7.556805,0.259595,1,0,3.5506,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,Yes,Mailed check,53.85,108.15,6.462044,-0.411022,1,1,2.911511,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,Bank transfer (automatic),42.3,1840.75,11.79606,0.102434,0,1,5.349154,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,Yes,Electronic check,70.7,151.65,4.69836,-0.318575,1,1,2.233771,1


In [10]:
# Encode categorical variables.
# customerID is a per-row identifier stored as a string. Selecting every
# object column would hand it to get_dummies and expand it into one indicator
# column per distinct ID, which bloats the feature matrix and lets a model
# memorise individual rows. It is removed before encoding instead.
ID_COLUMNS = ["customerID"]

# errors="ignore" keeps the cell runnable if the identifier is already absent.
df_features = df.drop(columns=ID_COLUMNS, errors="ignore")

# One-hot encode the remaining string-typed columns; drop_first removes one
# level per variable so the indicator columns are not perfectly collinear.
categorical_cols = df_features.select_dtypes(include="object").columns
df_encoded = pd.get_dummies(df_features, columns=categorical_cols, drop_first=True)

# Sanity check: no identifier-derived indicator columns made it through.
assert not any(str(col).startswith("customerID") for col in df_encoded.columns)

df_encoded.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,avg_weekly_sessions,usage_trend_30d,support_tickets_90d,payment_failures,engagement_score,churn_30d,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,6.076762,-0.204467,0,0,2.969365,0,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,7.556805,0.259595,1,0,3.5506,0,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,6.462044,-0.411022,1,1,2.911511,1,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,11.79606,0.102434,0,1,5.349154,0,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,4.69836,-0.318575,1,1,2.233771,1,...,False,False,False,False,False,False,True,False,True,False


In [11]:
# Train / validation split, ordered by tenure.
# Rows are sorted by ascending tenure; the first 80% become the training set
# and the remaining 20% (the highest-tenure rows) the validation set.
# NOTE(review): tenure is how long a customer has been subscribed, not a
# calendar timestamp, so this is not a chronological split and validation
# contains only the longest-tenure customers — confirm this is intended.
#
# kind="stable" keeps rows with equal tenure in their existing relative order.
# The default sort is not stable, so re-running this cell on the
# already-sorted frame could otherwise shuffle ties and move rows across the
# 80% boundary.
df_encoded = df_encoded.sort_values("tenure", kind="stable")

split_idx = int(len(df_encoded) * 0.8)
train = df_encoded.iloc[:split_idx]
valid = df_encoded.iloc[split_idx:]

# The two slices must partition the frame exactly.
assert len(train) + len(valid) == len(df_encoded)

# Separate the feature matrix from the target for each partition.
X_train = train.drop("churn_30d", axis=1)
y_train = train["churn_30d"]

X_valid = valid.drop("churn_30d", axis=1)
y_valid = valid["churn_30d"]

In [16]:
# Save features.
# Each partition (features plus the churn_30d target column) is written to
# its own CSV; index=False leaves the row index out of the file.
# NOTE(review): the paths are absolute and machine-specific, and the target
# directory is assumed to exist already.
feature_outputs = {
    "F:/subscription-churn-system/data/processed/train_features.csv": train,
    "F:/subscription-churn-system/data/processed/valid_features.csv": valid,
}
for csv_path, frame in feature_outputs.items():
    frame.to_csv(csv_path, index=False)