In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/Telco-Customer-Churn.csv")

In [36]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [37]:
df["churn_flag"] = (df["Churn"] == "Yes").astype(int)

In [38]:
df["is_new_customer"] = (df["tenure"] <= 6).astype(int)

In [39]:
df["tenure_bucket"] = pd.cut(
    df["tenure"],
    bins=[0, 6, 12, 24, 72],
    labels=["0-6", "6-12", "12-24", "24+"]
)

In [40]:
df["charges_per_month"] = df["TotalCharges"] / (df["tenure"] + 1)

In [41]:
df["high_price"] = (df["MonthlyCharges"] > df["MonthlyCharges"].median()).astype(int)


In [42]:
df["early_high_price"] = (
    (df["tenure"] <= 6) & 
    (df["MonthlyCharges"] > df["MonthlyCharges"].median())
).astype(int)

In [43]:
service_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

df["service_count"] = (df[service_cols] == "Yes").sum(axis=1)

In [44]:
df["low_engagement"] = (df["service_count"] <= 1).astype(int)

In [45]:
df["is_monthly_contract"] = (df["Contract"] == "Month-to-month").astype(int)

In [46]:
df["long_term_contract"] = df["Contract"].isin(["One year", "Two year"]).astype(int)

In [47]:
df["is_autopay"] = df["PaymentMethod"].str.contains("automatic").astype(int)

In [48]:
df["payment_friction"] = (df["is_autopay"] == 0).astype(int)

In [49]:
df["has_partner"] = (df["Partner"] == "Yes").astype(int)

In [50]:
df["has_dependents"] = (df["Dependents"] == "Yes").astype(int)

In [51]:
df["household_stability"] = df["has_partner"] + df["has_dependents"]

In [52]:
df["household_stability"] = df["has_partner"] + df["has_dependents"]

In [53]:
df["is_fiber"] = (df["InternetService"] == "Fiber optic").astype(int)

In [54]:
df["premium_user"] = (
    (df["is_fiber"] == 1) & 
    (df["MonthlyCharges"] > df["MonthlyCharges"].median())
).astype(int)

In [55]:
drop_cols = [
    "customerID", "Churn",
    "Partner", "Dependents",
    "Contract", "PaymentMethod",
    "InternetService"
]

df_model = df.drop(columns=drop_cols)

In [56]:
df_model["TotalCharges"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model["TotalCharges"].fillna(0, inplace=True)


In [57]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,low_engagement,is_monthly_contract,long_term_contract,is_autopay,payment_friction,has_partner,has_dependents,household_stability,is_fiber,premium_user
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,1,1,0,0,1,1,0,1,0,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,0,0,1,0,1,0,0,0,0,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,0,1,0,0,1,0,0,0,0,0
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,0,0,1,1,0,0,0,0,0,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,1,1,0,0,1,0,0,0,1,1


In [58]:
df_model.to_csv("../data/processed/churn_features.csv", index=False)