In [4]:
# Ensemble model with stacking and balanced class weights
!pip install kagglehub xgboost imbalanced-learn --quiet

import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import os

# Load dataset
dataset_path = kagglehub.dataset_download("blastchar/telco-customer-churn")
file_path = os.path.join(dataset_path, "WA_Fn-UseC_-Telco-Customer-Churn.csv")
print("Dataset path:", file_path)

data = pd.read_csv(file_path)

# Preprocess data
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)
data = data.drop('customerID', axis=1)

# Encode categorical columns
for col in data.select_dtypes(include='object').columns:
    if col != 'Churn':
        data[col] = LabelEncoder().fit_transform(data[col])
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Features and labels
X = data.drop('Churn', axis=1)
y = data['Churn']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Address class imbalance with SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', learning_rate=0.1, max_depth=5, n_estimators=200, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42))
]

# Meta model
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

# Train stacked model
stack_model.fit(X_train, y_train)
y_pred = stack_model.predict(X_test)

# Evaluate
print("\n✅ Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# New customer prediction
new_customer = np.zeros((1, X.shape[1]))
new_customer[0, X.columns.get_loc('tenure')] = 10
new_customer[0, X.columns.get_loc('MonthlyCharges')] = 70
new_customer[0, X.columns.get_loc('TotalCharges')] = 700
new_customer_scaled = scaler.transform(new_customer)

prediction = stack_model.predict(new_customer_scaled)[0]
print("\n🔍 Churn Prediction for New Customer (1 = Yes, 0 = No):", prediction)


Dataset path: /kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)



✅ Model Evaluation:
Accuracy: 0.8565217391304348
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      1021
           1       0.85      0.87      0.86      1049

    accuracy                           0.86      2070
   macro avg       0.86      0.86      0.86      2070
weighted avg       0.86      0.86      0.86      2070


🔍 Churn Prediction for New Customer (1 = Yes, 0 = No): 0


