In [10]:
import pandas as pd

df = pd.read_csv("/kaggle/input/cleanedchurneddata/cleaned_churn_data.csv")
print(f"✅ Data loaded. Shape: {df.shape}")
df.head()


✅ Data loaded. Shape: (7032, 20)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print(df['Churn'].value_counts())
print("✅ Churn column encoded")


Churn
0    5163
1    1869
Name: count, dtype: int64
✅ Churn column encoded


In [12]:
cat_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical columns:\n", cat_cols)


Categorical columns:
 ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [14]:
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
               'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']


In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in binary_cols:
    df[col] = label_encoder.fit_transform(df[col])

print("✅ Binary categorical columns encoded using LabelEncoder")
df[binary_cols].head()


✅ Binary categorical columns encoded using LabelEncoder


Unnamed: 0,gender,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
0,0,1,0,0,0,2,0,0,0,0,1
1,1,0,0,1,2,0,2,0,0,0,0
2,1,0,0,1,2,2,0,0,0,0,1
3,1,0,0,0,2,0,2,2,0,0,0
4,0,0,0,1,0,0,0,0,0,0,1


In [16]:
# These are the non-binary categorical columns
one_hot_cols = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod']

# Apply OneHotEncoding
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

print("✅ One-Hot Encoding applied")
print(f"Updated shape: {df.shape}")
df.head()


✅ One-Hot Encoding applied
Updated shape: (7032, 25)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,Churn,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,2,0,0,...,0,True,False,False,False,False,False,False,True,False
1,1,0,0,0,34,1,2,0,2,0,...,0,False,False,False,False,True,False,False,False,True
2,1,0,0,0,2,1,2,2,0,0,...,1,False,False,False,False,False,False,False,False,True
3,1,0,0,0,45,0,2,0,2,2,...,0,True,False,False,False,True,False,False,False,False
4,0,0,0,0,2,1,0,0,0,0,...,1,False,False,True,False,False,False,False,True,False


In [17]:
# Create feature interaction
df['MonthlyTenureRevenue'] = df['MonthlyCharges'] * df['tenure']

print("✅ Feature interaction 'MonthlyCharges * tenure' added as 'MonthlyTenureRevenue'")
df[['MonthlyCharges', 'tenure', 'MonthlyTenureRevenue']].head()


✅ Feature interaction 'MonthlyCharges * tenure' added as 'MonthlyTenureRevenue'


Unnamed: 0,MonthlyCharges,tenure,MonthlyTenureRevenue
0,29.85,1,29.85
1,56.95,34,1936.3
2,53.85,2,107.7
3,42.3,45,1903.5
4,70.7,2,141.4


In [18]:
from sklearn.preprocessing import StandardScaler

# Columns to scale
num_cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges', 'MonthlyTenureRevenue']

# Initialize scaler
scaler = StandardScaler()

# Apply scaling
df[num_cols_to_scale] = scaler.fit_transform(df[num_cols_to_scale])

print("✅ Numerical features scaled:")
df[num_cols_to_scale].head()


✅ Numerical features scaled:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,MonthlyTenureRevenue
0,-1.280248,-1.161694,-0.994194,-0.995034
1,0.064303,-0.260878,-0.17374,-0.153164
2,-1.239504,-0.363923,-0.959649,-0.960656
3,0.512486,-0.74785,-0.195248,-0.167649
4,-1.239504,0.196178,-0.940457,-0.945775


In [20]:
# List all columns containing 'Contract' or 'PaymentMethod'
contract_cols = [col for col in df.columns if 'Contract' in col]
payment_cols = [col for col in df.columns if 'PaymentMethod' in col]

print("Contract-related columns:", contract_cols)
print("PaymentMethod-related columns:", payment_cols)


Contract-related columns: ['Contract_One year', 'Contract_Two year']
PaymentMethod-related columns: ['PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [23]:
df['ChurnRiskScore'] = (
    -df['tenure'] +                           # Lower tenure = higher risk
    df['MonthlyCharges'] +                    # Higher monthly = higher risk
    (~(df['Contract_One year'].astype(bool) | df['Contract_Two year'].astype(bool))).astype(int) * 1.5 +  # Month-to-month = riskier
    df['PaymentMethod_Electronic check'].astype(int) * 1.0  # Known churn driver
)


In [24]:
print(df['ChurnRiskScore'].head())


0    2.618554
1   -0.325181
2    2.375581
3   -1.260337
4    3.935682
Name: ChurnRiskScore, dtype: float64


In [25]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Define X and y
X = df.drop('Churn', axis=1)
y = df['Churn']

# Initialize selector with mutual information and select top 20 features
selector = SelectKBest(score_func=mutual_info_classif, k=20)

# Fit selector to data and transform X
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]

print("Top selected features:\n", selected_features)


Top selected features:
 Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges',
       'TotalCharges', 'InternetService_Fiber optic', 'InternetService_No',
       'Contract_Two year', 'PaymentMethod_Electronic check',
       'MonthlyTenureRevenue', 'ChurnRiskScore'],
      dtype='object')


In [26]:
from sklearn.model_selection import train_test_split

# Using the selected features
X_final = df[selected_features]
y_final = df['Churn']

# Split data: 80% train, 20% test, stratify on churn for balanced split, set random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, stratify=y_final, random_state=42
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Train churn distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test churn distribution:\n{y_test.value_counts(normalize=True)}")


Train shape: (5625, 20), Test shape: (1407, 20)
Train churn distribution:
Churn
0    0.734222
1    0.265778
Name: proportion, dtype: float64
Test churn distribution:
Churn
0    0.734186
1    0.265814
Name: proportion, dtype: float64


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Initialize model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.64      0.55      0.59       374

    accuracy                           0.80      1407
   macro avg       0.74      0.72      0.73      1407
weighted avg       0.79      0.80      0.79      1407

ROC AUC Score: 0.8365166096360219


In [29]:
df.to_csv('processed_churn_data.csv', index=False)
