In [27]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             RocCurveDisplay)
import matplotlib.pyplot as plt

In [2]:
# Load the Telco churn CSV and clean it in a single chain:
#   - TotalCharges is parsed as numeric; unparseable entries become NaN
#     (errors='coerce') and those rows are then dropped,
#   - the customerID column is removed,
#   - Churn is recoded from 'Yes'/'No' to 1/0.
df = (
    pd.read_csv("../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .drop(columns='customerID')
    .assign(Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}))
)

In [None]:
# Feature 1: tenure_bucket
# pd.cut uses right-closed intervals by default, so the buckets are
# (0, 12] -> 'New', (12, 36] -> 'Established', (36, 72] -> 'Loyal'.
# A tenure outside (0, 72] would come out as NaN.
tenure_edges = [0, 12, 36, 72]
tenure_labels = ['New', 'Established', 'Loyal']
df['tenure_bucket'] = pd.cut(df['tenure'], bins=tenure_edges, labels=tenure_labels)

In [6]:
# Sanity check: number of customers in each tenure bucket.
bucket_counts = df['tenure_bucket'].value_counts()
print(bucket_counts)

tenure_bucket
Loyal          3001
New            2175
Established    1856
Name: count, dtype: int64


In [None]:
# Feature 2: total_services
# Count, per customer, how many of the listed service columns equal 'Yes'.
# Any other value in those columns contributes zero to the count.
service_cols = ['PhoneService', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Vectorised comparison + row sum instead of a Python-level lambda per row:
# same integer result, without the per-row apply overhead.
df['total_services'] = df[service_cols].eq('Yes').sum(axis=1)

In [8]:
# Distribution of the service count, ordered by number of services.
service_count_distribution = df['total_services'].value_counts().sort_index()
print(service_count_distribution)

total_services
0      80
1    2247
2     996
3    1041
4    1060
5     825
6     524
7     259
Name: count, dtype: int64


In [9]:
# Mean of the 0/1 Churn flag (i.e. churn rate) for each service count.
churn_rate_by_services = df.groupby('total_services')['Churn'].mean().round(3)
print(churn_rate_by_services)

total_services
0    0.438
1    0.217
2    0.435
3    0.347
4    0.273
5    0.221
6    0.126
7    0.058
Name: Churn, dtype: float64


In [10]:
# Feature 3: monthly_tenure_ratio
# Monthly charge divided by (tenure + 1); the +1 keeps the denominator
# non-zero when tenure is 0.
tenure_plus_one = df['tenure'].add(1)
df['monthly_tenure_ratio'] = df['MonthlyCharges'].div(tenure_plus_one)

In [11]:
# Average monthly_tenure_ratio for non-churners (0) versus churners (1).
mean_ratio_by_churn = df.groupby('Churn')['monthly_tenure_ratio'].mean().round(2)
print(mean_ratio_by_churn)

Churn
0     3.53
1    11.75
Name: monthly_tenure_ratio, dtype: float64


In [12]:
# Feature 4: is_fiber
# 1 when InternetService is exactly 'Fiber optic', otherwise 0.
on_fiber = df['InternetService'].eq('Fiber optic')
df["is_fiber"] = on_fiber.astype(int)

In [13]:
# Churn rate for non-fiber (0) versus fiber (1) customers.
churn_rate_by_fiber = df.groupby('is_fiber')['Churn'].mean().round(3)
print(churn_rate_by_fiber)

is_fiber
0    0.145
1    0.419
Name: Churn, dtype: float64


In [14]:
# Feature 5: has_security_support
# 1 when at least one of OnlineSecurity / TechSupport is 'Yes', otherwise 0.
support_cols = ['OnlineSecurity', 'TechSupport']
df['has_security_support'] = df[support_cols].eq('Yes').any(axis=1).astype(int)

In [15]:
# Churn rate without (0) versus with (1) security/support add-ons.
churn_rate_by_support = df.groupby('has_security_support')['Churn'].mean().round(3)
print(churn_rate_by_support)

has_security_support
0    0.335
1    0.171
Name: Churn, dtype: float64


In [16]:
# Feature 6: is_high_risk
# 1 for customers on a month-to-month contract whose tenure is strictly
# below 12, otherwise 0.
month_to_month = df['Contract'].eq('Month-to-month')
short_tenure = df['tenure'].lt(12)
df['is_high_risk'] = (month_to_month & short_tenure).astype(int)

In [17]:
# Churn rate for the other customers (0) versus the high-risk segment (1).
churn_rate_by_risk = df.groupby('is_high_risk')['Churn'].mean().round(3)
print(churn_rate_by_risk)

is_high_risk
0    0.171
1    0.519
Name: Churn, dtype: float64


In [22]:
# Separate the feature matrix from the target.
# tenure_bucket deliberately stays in X: it is listed in cat_columns below and
# one-hot encoded by the preprocessor. (A drop of 'tenure bucket' -- with a
# space -- used to sit here; no such column exists, so it raised KeyError on a
# fresh run, and dropping the real column would break the encoder step.)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Continuous columns that get standard-scaled.
num_columns = ['tenure', 'MonthlyCharges', 'TotalCharges',
               'total_services', 'monthly_tenure_ratio']

# String-typed columns are picked up automatically by dtype.
cat_columns = X.select_dtypes(include='object').columns.tolist()
# tenure_bucket is produced by pd.cut, so it has a categorical dtype and is
# not matched by the 'object' filter above; add it explicitly.
cat_columns.append('tenure_bucket')

In [23]:
# Show which columns go to the scaler and which go to the one-hot encoder.
for list_label, column_list in (("Numerical:", num_columns),
                                ("Categorical:", cat_columns)):
    print(list_label, column_list)

Numerical: ['tenure', 'MonthlyCharges', 'TotalCharges', 'total_services', 'monthly_tenure_ratio']
Categorical: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_bucket']


In [24]:
# Preprocessing: standard-scale the numeric columns, one-hot encode the
# categorical ones (first level dropped), and pass every remaining column
# through unchanged.
numeric_step = ('num', StandardScaler(), num_columns)
categorical_step = ('cat',
                    OneHotEncoder(drop='first', handle_unknown='ignore'),
                    cat_columns)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# The transformer is fitted on the training rows only; the test rows are
# transformed with the already-fitted transformer.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed shape: {X_train_processed.shape}")
print(f"Feature names: {preprocessor.get_feature_names_out()}")

Processed shape: (5625, 37)
Feature names: ['num__tenure' 'num__MonthlyCharges' 'num__TotalCharges'
 'num__total_services' 'num__monthly_tenure_ratio' 'cat__gender_Male'
 'cat__Partner_Yes' 'cat__Dependents_Yes' 'cat__PhoneService_Yes'
 'cat__MultipleLines_No phone service' 'cat__MultipleLines_Yes'
 'cat__InternetService_Fiber optic' 'cat__InternetService_No'
 'cat__OnlineSecurity_No internet service' 'cat__OnlineSecurity_Yes'
 'cat__OnlineBackup_No internet service' 'cat__OnlineBackup_Yes'
 'cat__DeviceProtection_No internet service' 'cat__DeviceProtection_Yes'
 'cat__TechSupport_No internet service' 'cat__TechSupport_Yes'
 'cat__StreamingTV_No internet service' 'cat__StreamingTV_Yes'
 'cat__StreamingMovies_No internet service' 'cat__StreamingMovies_Yes'
 'cat__Contract_One year' 'cat__Contract_Two year'
 'cat__PaperlessBilling_Yes' 'cat__PaymentMethod_Credit card (automatic)'
 'cat__PaymentMethod_Electronic check' 'cat__PaymentMethod_Mailed check'
 'cat__tenure_bucket_Loyal' 'cat__te

In [25]:
# Fit logistic regression on the preprocessed training matrix.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_processed, y_train)

# Hard labels plus the predicted probability of the positive class (column 1).
lr_pred = lr_model.predict(X_test_processed)
lr_prob = lr_model.predict_proba(X_test_processed)[:, 1]

# Test-set metrics; AUC is computed from probabilities, the rest from labels.
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_auc = roc_auc_score(y_test, lr_prob)

print("=== Logistic Regression (with Feature Engineering) ===")
print(f"Accuracy:  {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1 Score:  {lr_f1:.4f}")
print(f"AUC-ROC:   {lr_auc:.4f}")

=== Logistic Regression (with Feature Engineering) ===
Accuracy:  0.7974
Precision: 0.6431
Recall:    0.5348
F1 Score:  0.5839
AUC-ROC:   0.8367


In [28]:
# Fit a 100-tree random forest on the same preprocessed training matrix.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_processed, y_train)

# Hard labels plus the predicted probability of the positive class (column 1).
rf_pred = rf_model.predict(X_test_processed)
rf_prob = rf_model.predict_proba(X_test_processed)[:, 1]

# Test-set metrics; AUC is computed from probabilities, the rest from labels.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_prob)

print("=== Random Forest (with Feature Engineering) ===")
print(f"Accuracy:  {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall:    {rf_recall:.4f}")
print(f"F1 Score:  {rf_f1:.4f}")
print(f"AUC-ROC:   {rf_auc:.4f}")

=== Random Forest (with Feature Engineering) ===
Accuracy:  0.7861
Precision: 0.6229
Recall:    0.4947
F1 Score:  0.5514
AUC-ROC:   0.8167


## Feature Engineering Results

| Feature | Churn Signal | Result |
|---------|--------------|--------|
| tenure_bucket | New: 40%, Loyal: 10% | Redundant with tenure |
| total_services | 7 services: 6%, 0 services: 44% | Moderate signal |
| monthly_tenure_ratio | Churners 3x higher | Useful but not enough |
| is_fiber | Fiber: 42%, Other: 14% | Redundant with InternetService |
| has_security_support | With: 17%, Without: 34% | Moderate signal |
| is_high_risk | High risk: 52%, Other: 17% | Strong but redundant |

### Conclusion
Feature engineering improved AUC by only 0.0006 (0.8361 → 0.8367).
Original features already captured key patterns.
Next step: Address class imbalance to improve recall.