In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import warnings
import os

warnings.filterwarnings('ignore')

print("Feature Engineering Pipeline Starting...")
print("="*80)

Feature Engineering Pipeline Starting...


In [2]:
# 1. LOAD DATA

df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f"✓ Data loaded: {df.shape}")

✓ Data loaded: (7043, 21)


In [3]:
# 2. DATA CLEANING

print("\nDATA CLEANING")
print("="*80)

# Fix TotalCharges (it has spaces that need to be handled)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check missing values
print(f"Missing values in TotalCharges: {df['TotalCharges'].isnull().sum()}")

# Fill missing TotalCharges (customers with 0 tenure)
df['TotalCharges'].fillna(0, inplace=True)

print("Data cleaning complete")
df.info()


DATA CLEANING
Missing values in TotalCharges: 11
Data cleaning complete
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract        

In [12]:
# 3. FEATURE ENGINEERING

print("\nFEATURE ENGINEERING")
print("="*80)

# Create a copy for feature engineering
df_fe = df.copy()

# 1. Customer Lifetime Value
df_fe['CLV'] = df_fe['MonthlyCharges'] * df_fe['tenure']
print("Created: Customer Lifetime Value (CLV)")

# 2. Average Monthly Spend
df_fe['AvgMonthlySpend'] = df_fe['TotalCharges'] / (df_fe['tenure'] + 1)  # +1 to avoid division by zero
print("Created: Average Monthly Spend")

# 3. Tenure Groups
df_fe['TenureGroup'] = pd.cut(df_fe['tenure'], 
                               bins=[-1, 12, 24, 48, np.inf],
                               labels=['0-1 year', '1-2 years', '2-4 years', '4+ years'])
print("Created: Tenure Groups")

# 4. Service Count (total number of services)
service_cols = ['PhoneService', 'InternetService', 'OnlineSecurity', 
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                'StreamingTV', 'StreamingMovies']

df_fe['ServiceCount'] = 0
for col in service_cols:
    df_fe['ServiceCount'] += (df_fe[col] == 'Yes').astype(int)

# Add internet service (not No)
df_fe['ServiceCount'] += (df_fe['InternetService'] != 'No').astype(int)
print("Created: Service Count")

# 5. Has Internet Service
df_fe['HasInternet'] = (df_fe['InternetService'] != 'No').astype(int)
print("Created: Has Internet Service")

# 6. Has Phone Service
df_fe['HasPhone'] = (df_fe['PhoneService'] == 'Yes').astype(int)
print("Created: Has Phone Service")

# 7. Has Premium Services
df_fe['HasPremiumServices'] = (
    (df_fe['OnlineSecurity'] == 'Yes') | 
    (df_fe['OnlineBackup'] == 'Yes') |
    (df_fe['DeviceProtection'] == 'Yes') |
    (df_fe['TechSupport'] == 'Yes')
).astype(int)
print("Created: Has Premium Services")

# 8. Has Streaming Services
df_fe['HasStreaming'] = (
    (df_fe['StreamingTV'] == 'Yes') | 
    (df_fe['StreamingMovies'] == 'Yes')
).astype(int)
print("Created: Has Streaming Services")

# 9. Contract Risk (Month-to-month is high risk)
df_fe['ContractRisk'] = (df_fe['Contract'] == 'Month-to-month').astype(int)
print("Created: Contract Risk")

# 10. Payment Risk (Electronic check is higher risk)
df_fe['PaymentRisk'] = (df_fe['PaymentMethod'] == 'Electronic check').astype(int)
print("Created: Payment Risk")

# 11. Is New Customer (less than 6 months)
df_fe['IsNewCustomer'] = (df_fe['tenure'] < 6).astype(int)
print("Created: Is New Customer")

# 12. Is Senior without Support
df_fe['SeniorNoSupport'] = (
    (df_fe['SeniorCitizen'] == 1) & 
    (df_fe['TechSupport'] == 'No')
).astype(int)
print("Created: Senior without Support")

# 13. High Value Customer
monthly_75th = df_fe['MonthlyCharges'].quantile(0.75)
df_fe['HighValueCustomer'] = (df_fe['MonthlyCharges'] > monthly_75th).astype(int)
print("Created: High Value Customer")

# 14. Charge to Tenure Ratio
df_fe['ChargeTenureRatio'] = df_fe['MonthlyCharges'] / (df_fe['tenure'] + 1)
print("Created: Charge to Tenure Ratio")

# 15. chage churn value yes : 1 and no : 0
df_fe['Churn'] = df_fe['Churn'].map({'Yes': 1, 'No': 0})

# 16. Drop non-numerical value
df_fe = df_fe.drop(['customerID'], axis = 1)

# 17. Convert Yes/No, Male/Female, No internet servive columns to 1/0
y_n_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines', 'PaperlessBilling']

for col in y_n_cols:
    df_fe[col] = df_fe[col].replace({'No internet service': 0, 'No phone service':0, 'No': 0, 'Yes': 1, 'Male': 1, 'Female': 0})

print(f"\nTotal features created: 14")
print(f"New dataset shape: {df_fe.shape}")



FEATURE ENGINEERING
Created: Customer Lifetime Value (CLV)
Created: Average Monthly Spend
Created: Tenure Groups
Created: Service Count
Created: Has Internet Service
Created: Has Phone Service
Created: Has Premium Services
Created: Has Streaming Services
Created: Contract Risk
Created: Payment Risk
Created: Is New Customer
Created: Senior without Support
Created: High Value Customer
Created: Charge to Tenure Ratio

Total features created: 14
New dataset shape: (7043, 34)


In [13]:
# 5. FEATURE SELECTION & IMPORTANCE PREVIEW

print("\nFEATURE OVERVIEW")
print("="*80)
print("\nAll features in the dataset:")
print(df_fe.columns.tolist())

print("\nFeature types:")
print(df_fe.dtypes.value_counts())

print("\nDataset info:")
print(df_fe.info())


FEATURE OVERVIEW

All features in the dataset:
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'CLV', 'AvgMonthlySpend', 'TenureGroup', 'ServiceCount', 'HasInternet', 'HasPhone', 'HasPremiumServices', 'HasStreaming', 'ContractRisk', 'PaymentRisk', 'IsNewCustomer', 'SeniorNoSupport', 'HighValueCustomer', 'ChargeTenureRatio']

Feature types:
int64       25
float64      5
object       3
category     1
Name: count, dtype: int64

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   gender              7043 non-null   int64   
 1   SeniorCitizen       7043 non-nul

In [14]:
# 6. TRAIN-TEST SPLIT

print("\nSPLITTING DATA")
print("="*80)

# Separate features and target
X = df_fe.drop('Churn', axis=1)
y = df_fe['Churn']

# Split data (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"\nClass distribution in training set:")
print(y_train.value_counts())
print(y_train.value_counts(normalize=True) * 100)


SPLITTING DATA
Training set: (5634, 33)
Testing set: (1409, 33)

Class distribution in training set:
Churn
0    4139
1    1495
Name: count, dtype: int64
Churn
0    73.464679
1    26.535321
Name: proportion, dtype: float64


In [15]:
# 7. FEATURE SCALING

print("\nFEATURE SCALING")
print("="*80)

# Identify numerical columns to scale
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 
                 'CLV', 'AvgMonthlySpend', 'ChargeTenureRatio']

# Initialize scaler
scaler = StandardScaler()

# Fit on training data and transform both sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Scaled numerical features using StandardScaler")
print(f"Features scaled: {numerical_cols}")



FEATURE SCALING
Scaled numerical features using StandardScaler
Features scaled: ['tenure', 'MonthlyCharges', 'TotalCharges', 'CLV', 'AvgMonthlySpend', 'ChargeTenureRatio']


In [16]:
# 9. Onehotencoding 

X_train_cat = X_train_scaled.copy()
X_test_cat = X_test_scaled.copy()

multi_cat_cols = ['InternetService', 'Contract', 'PaymentMethod', 'TenureGroup']

ohe = ColumnTransformer(
    transformers = [(
        'cat', 
        OneHotEncoder(drop='first', handle_unknown='ignore'), 
        multi_cat_cols
    )],
    remainder='passthrough'
)

X_train_enc = ohe.fit_transform(X_train_cat)
X_test_enc  = ohe.transform(X_test_cat)

ohe_cols = ohe.named_transformers_['cat'].get_feature_names_out(multi_cat_cols)

pass_through_cols = X_train.drop(columns=multi_cat_cols).columns

all_columns = np.concatenate([ohe_cols, pass_through_cols])

X_train_df = pd.DataFrame(X_train_enc, columns=all_columns)
X_test_df = pd.DataFrame(X_test_enc, columns=all_columns)

print(X_train_df.shape)
print(X_test_df.shape)

X_train_df

(5634, 39)
(1409, 39)


Unnamed: 0,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureGroup_1-2 years,TenureGroup_2-4 years,TenureGroup_4+ years,...,HasInternet,HasPhone,HasPremiumServices,HasStreaming,ContractRisk,PaymentRisk,IsNewCustomer,SeniorNoSupport,HighValueCustomer,ChargeTenureRatio
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,-0.502363
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.119219
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.326197
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.346258
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.905424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.485009
5630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.107946
5631,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,-0.204425
5632,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.565777


In [17]:
# 8. SAVE PROCESSED DATA
print("\nSAVING PROCESSED DATA")
print("="*80)


# Save to CSV
X_train_df.to_csv('../data/X_train.csv', index=False)
X_test_df.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

print("Saved: X_train.csv")
print("Saved: X_test.csv")
print("Saved: y_train.csv")
print("Saved: y_test.csv")

# Also save feature names for later use
feature_names = X_train_df.columns.tolist()
pd.DataFrame({'features': feature_names}).to_csv('../data/feature_names.csv', index=False)
print("Saved: feature_names.csv")


SAVING PROCESSED DATA
Saved: X_train.csv
Saved: X_test.csv
Saved: y_train.csv
Saved: y_test.csv
Saved: feature_names.csv


In [18]:
# 9. SUMMARY

print("FEATURE ENGINEERING SUMMARY")
print("="*80)

summary = {
    'Original Features': df.shape[1],
    'Engineered Features': 14,
    'Final Features': X_train_df.shape[1],
    'Training Samples': X_train_df.shape[0],
    'Testing Samples': X_test_df.shape[0],
    'Churn Rate (Train)': f"{(y_train.sum()/len(y_train)*100):.2f}%"
}

for key, value in summary.items():
    print(f"{key:.<30} {value}")

print("\nFeature engineering complete! Ready for modeling.")
print("="*80)


FEATURE ENGINEERING SUMMARY
Original Features............. 21
Engineered Features........... 14
Final Features................ 39
Training Samples.............. 5634
Testing Samples............... 1409
Churn Rate (Train)............ 26.54%

Feature engineering complete! Ready for modeling.


In [19]:
# 10. FEATURE IMPORTANCE PREVIEW (using correlation)

print("\nTOP 10 FEATURES CORRELATED WITH CHURN")
print("="*80)

numeric_df = df_fe.select_dtypes(include=['int64', 'float64'])

correlation_with_churn = numeric_df.corr()['Churn'].sort_values(ascending=False)
print(correlation_with_churn.head(10))  # Exclude Churn itself

print("\nBOTTOM 10 FEATURES (NEGATIVE CORRELATION)")
print("="*80)
print(correlation_with_churn.tail(10))


TOP 10 FEATURES CORRELATED WITH CHURN
Churn                1.000000
ChargeTenureRatio    0.411756
ContractRisk         0.405103
IsNewCustomer        0.308773
PaymentRisk          0.301919
HasInternet          0.227890
SeniorNoSupport      0.199215
MonthlyCharges       0.193356
PaperlessBilling     0.191825
SeniorCitizen        0.150889
Name: Churn, dtype: float64

BOTTOM 10 FEATURES (NEGATIVE CORRELATION)
HasPremiumServices   -0.059046
DeviceProtection     -0.066160
OnlineBackup         -0.082255
Partner              -0.150448
Dependents           -0.164221
TechSupport          -0.164674
OnlineSecurity       -0.171226
TotalCharges         -0.198324
CLV                  -0.198514
tenure               -0.352229
Name: Churn, dtype: float64
