In [1]:
import pandas as pd
import numpy as np

# For building the preprocessing and modeling pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline # Special pipeline for SMOTE

# For data preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# For handling class imbalance
from imblearn.over_sampling import SMOTE

# Machine Learning Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier

# For evaluating the model's performance
from sklearn.metrics import classification_report, accuracy_score ,f1_score

# For saving the final model
import pickle

# To ignore warning messages for a cleaner output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw Telco customer-churn export into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
raw_csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Drop the customerID column: it is an identifier and is not used for prediction.
# errors="ignore" keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

In [4]:
# Parse 'TotalCharges' as numbers. errors='coerce' turns every value that
# cannot be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric

In [5]:

# Discard every row that has a missing value in ANY column. This includes
# the rows whose TotalCharges became NaN during the numeric coercion.
# .copy() gives a standalone frame so later column assignments are unambiguous.
df = df.dropna(axis="index", how="any").copy()

In [6]:

# Encode the target 'Churn' as integers (1 for 'Yes', 0 for 'No').
# Series.map() sends every value that is missing from the mapping to NaN, so
# re-running this cell on the already-encoded column would silently wipe the
# target. The dtype guard makes a second execution a no-op.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [7]:
# Columns the model is trained on. The order of this list becomes the
# column order of the feature matrix built from it.
selected_features = [
    'Contract', 'tenure', 'OnlineSecurity', 'PaymentMethod',
    'MonthlyCharges', 'TotalCharges', 'TechSupport', 'InternetService',
    'Dependents', 'Partner', 'gender',
]

feature_count = len(selected_features)
print(f"✅ Using {feature_count} selected features for modeling.")

✅ Using 11 selected features for modeling.


In [8]:
# X: feature matrix restricted to the selected columns (in the listed order).
# y: target vector, the 'Churn' column.
X, y = df[selected_features], df['Churn']


In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# stratify=y keeps the churn / no-churn proportion the same in both parts;
# random_state pins the shuffle so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Partition the selected columns into numeric and non-numeric groups.
# Using 'number' / exclude='number' (rather than a hard-coded list such as
# int64/float64/object) guarantees the two groups cover EVERY column: a column
# that matches neither group would be left out of the ColumnTransformer and
# silently dropped from the model's inputs.
numerical_cols = X_train.select_dtypes(include='number').columns
categorical_cols = X_train.select_dtypes(exclude='number').columns

print(f"\nIdentified Numerical columns: {list(numerical_cols)}")
print(f"Identified Categorical columns: {list(categorical_cols)}")



Identified Numerical columns: ['tenure', 'MonthlyCharges', 'TotalCharges']
Identified Categorical columns: ['Contract', 'OnlineSecurity', 'PaymentMethod', 'TechSupport', 'InternetService', 'Dependents', 'Partner', 'gender']


In [12]:
# Column-wise preprocessing: numeric columns are standardised, categorical
# columns are one-hot encoded. handle_unknown='ignore' lets the encoder accept
# categories it did not see during fit instead of raising at predict time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [13]:
# Baseline pipeline: preprocessing -> SMOTE oversampling -> RandomForest with
# default hyperparameters. imblearn's Pipeline is used (not sklearn's) because
# it accepts a sampler such as SMOTE as an intermediate step.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
model_pipeline = ImbPipeline(steps=baseline_steps)

# A single .fit() call trains every step of the pipeline in order.
print("--- Training the model pipeline... ---")
model_pipeline.fit(X_train, y_train)
print("✅ Training complete.")

--- Training the model pipeline... ---
✅ Training complete.


In [26]:
# Show the baseline pipeline that was already fitted in the previous cell.
# Calling .fit() again here would only retrain the same seeded model a second
# time, so the object is displayed without refitting.
model_pipeline


In [14]:
# Candidate classifiers. Each one is tuned inside its own
# preprocessing + SMOTE pipeline in the loop below.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost RF": XGBRFClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)
}

# Hyperparameter grids, keyed by the same names as `models`. The
# 'classifier__' prefix routes each parameter to the pipeline step
# named 'classifier'.
param_grids = {
    "Decision Tree": {
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5]
    },
    "XGBoost RF": {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 6],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__subsample': [0.8, 1.0]
    }
}

# Metric optimised by the grid search. Every score reported below is on this
# scale, so the messages name it explicitly (it is ROC AUC, not accuracy).
scoring_metric = 'roc_auc'
metric_label = 'ROC AUC'

# Running record of the best candidate seen so far. Starting from -inf means
# the first model evaluated always becomes the incumbent, whatever its score.
best_model_name = None
best_model_estimator = None
best_model_score = float("-inf")

# --- Loop through each model to perform GridSearchCV ---
for name, model in models.items():
    print(f"\n--- Tuning hyperparameters for: {name} ---")

    # SMOTE lives inside the pipeline so that, within cross-validation, it is
    # fitted on each training fold only and never touches the validation fold.
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Exhaustive search over the model's grid with 10-fold cross-validation.
    grid_search = GridSearchCV(
        pipeline,
        param_grids[name],
        cv=10,
        scoring=scoring_metric,
        n_jobs=-1,  # use all available CPU cores
        verbose=1
    )
    # Fit on the original (imbalanced) training data; resampling is handled
    # by the pipeline's SMOTE step.
    grid_search.fit(X_train, y_train)

    print(f"✅ Best Parameters for {name}: {grid_search.best_params_}")
    print(f"🏆 Best CV {metric_label} for {name}: {grid_search.best_score_:.4f}")

    # Keep this candidate if it beats the incumbent. best_estimator_ is the
    # pipeline refitted on the full training set with the best parameters.
    if grid_search.best_score_ > best_model_score:
        best_model_score = grid_search.best_score_
        best_model_name = name
        best_model_estimator = grid_search.best_estimator_

print("\n\n=======================================================")
print(f"🥇 Overall Best Model Selected: {best_model_name}")
print(f"🏅 Best Cross-Validation {metric_label}: {best_model_score:.4f}")
print("=======================================================")


--- Tuning hyperparameters for: Decision Tree ---
Fitting 10 folds for each of 18 candidates, totalling 180 fits
✅ Best Parameters for Decision Tree: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
🏆 Best CV Accuracy for Decision Tree: 0.8254

--- Tuning hyperparameters for: Random Forest ---
Fitting 10 folds for each of 12 candidates, totalling 120 fits
✅ Best Parameters for Random Forest: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
🏆 Best CV Accuracy for Random Forest: 0.8410

--- Tuning hyperparameters for: XGBoost RF ---
Fitting 10 folds for each of 16 candidates, totalling 160 fits
✅ Best Parameters for XGBoost RF: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 6, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
🏆 Best CV Accuracy for XGBoost RF: 0.8398


🥇 Overall Best Model Selected: Random Forest
🏅 Best Cross-Validation Accuracy: 0.8410


In [15]:
# `best_model_estimator` is the tuned pipeline (preprocessing + SMOTE +
# classifier) chosen by the grid search. Score it on the held-out test set.
y_pred = best_model_estimator.predict(X_test)

print("\n--- Final Champion Model Evaluation on Test Set ---")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Persist the whole champion pipeline as a single pickle file.
champion_path = 'champion_churn_model_pipeline_with_smote.pkl'
with open(champion_path, 'wb') as pkl_file:
    pickle.dump(best_model_estimator, pkl_file)

print("\n✅ Champion model pipeline with SMOTE saved successfully.")


--- Final Champion Model Evaluation on Test Set ---
              precision    recall  f1-score   support

           0       0.89      0.76      0.82      1033
           1       0.53      0.74      0.62       374

    accuracy                           0.76      1407
   macro avg       0.71      0.75      0.72      1407
weighted avg       0.79      0.76      0.77      1407


✅ Champion model pipeline with SMOTE saved successfully.


In [16]:
# Display the tuned champion pipeline selected by the grid search.
best_model_estimator

In [17]:
# Pickle `model_pipeline` to disk. That object is the RandomForest pipeline
# fitted with default hyperparameters before the grid search.
# NOTE(review): this is NOT the tuned `best_model_estimator`; confirm which of
# the two saved artefacts downstream code is supposed to load.
baseline_path = 'customer_churn_pipeline.pkl'
with open(baseline_path, 'wb') as pkl_file:
    pickle.dump(model_pipeline, pkl_file)

print("\n✅ Full pipeline saved successfully as 'customer_churn_pipeline.pkl'")


✅ Full pipeline saved successfully as 'customer_churn_pipeline.pkl'


In [18]:

# --- 1. Load the Saved Pipeline ---
# The pickle holds the whole fitted pipeline object: preprocessor, SMOTE step
# and trained classifier.
# Only unpickle files you produced yourself: unpickling can run arbitrary code.
pipeline_filename = 'champion_churn_model_pipeline_with_smote.pkl'
with open(pipeline_filename, 'rb') as pkl_file:
    serialized_pipeline = pkl_file.read()
loaded_pipeline = pickle.loads(serialized_pipeline)

print(f"✅ Pipeline '{pipeline_filename}' loaded successfully.")


✅ Pipeline 'champion_churn_model_pipeline_with_smote.pkl' loaded successfully.


In [19]:
# --- 2. Define New Data for Prediction ---
# One customer described with raw, un-encoded values, using the same column
# names the pipeline was trained on. Edit the values to try other profiles.
new_customer_data = dict(
    Contract='Month-to-month',
    tenure=1,
    OnlineSecurity='No',
    PaymentMethod='Electronic check',
    MonthlyCharges=75.0,
    TotalCharges=75.0,
    TechSupport='No',
    InternetService='Fiber optic',
    Dependents='No',
    Partner='No',
    gender='Female',
)

# Wrap the single record in a list to get a one-row DataFrame; the pipeline
# selects its inputs by column name.
single_customer_rows = [new_customer_data]
input_df = pd.DataFrame(single_customer_rows)


# --- 3. Make a Prediction ---
# The loaded pipeline applies its own preprocessing before classifying.
prediction = loaded_pipeline.predict(input_df)
pred_prob = loaded_pipeline.predict_proba(input_df)




In [20]:
# --- 4. Display the Result ---
# .predict() returns an array with one label per input row; there is a single
# row here, so element 0 is this customer's label (1 = churn).
if prediction[0] == 1:
    churn_status = 'Churn'
else:
    churn_status = 'No Churn'

# Row 0 of predict_proba holds this customer's class probabilities, ordered
# as [prob_no_churn, prob_churn].
customer_probs = pred_prob[0]
prob_no_churn = customer_probs[0]
prob_churn = customer_probs[1]

print("\n--- Prediction for New Customer ---")
print(f"Predicted Outcome: {churn_status}")
print(f"Probability of Not Churning: {prob_no_churn:.2%}")
print(f"Probability of Churning: {prob_churn:.2%}")


--- Prediction for New Customer ---
Predicted Outcome: Churn
Probability of Not Churning: 6.66%
Probability of Churning: 93.34%


In [21]:

# --- 1. Load the Saved Pipeline ---
# Reads the champion pickle back into memory. The file holds the whole fitted
# pipeline object: preprocessor, SMOTE step and trained classifier.
# Only unpickle files you produced yourself: unpickling can run arbitrary code.
pipeline_filename = 'champion_churn_model_pipeline_with_smote.pkl'
with open(pipeline_filename, 'rb') as pkl_file:
    serialized_pipeline = pkl_file.read()
loaded_pipeline = pickle.loads(serialized_pipeline)

print(f"✅ Pipeline '{pipeline_filename}' loaded successfully.")



✅ Pipeline 'champion_churn_model_pipeline_with_smote.pkl' loaded successfully.


In [22]:

# --- 2. Define New Data for Prediction ---
# One customer described with raw text/number values, keyed by the column
# names used for training. Edit the values to try other profiles.
new_customer_data = dict(
    Contract='Month-to-month',
    tenure=1,
    OnlineSecurity='No',
    PaymentMethod='Electronic check',
    TechSupport='No',
    InternetService='Fiber optic',
    Dependents='No',
    Partner='No',
    gender='Female',
    MonthlyCharges=75.0,  # part of the training feature list
    TotalCharges=75.0,    # part of the training feature list
)

# Wrap the single record in a list so pandas builds a one-row DataFrame.
single_customer_rows = [new_customer_data]
input_df = pd.DataFrame(single_customer_rows)




In [23]:
# --- 3. Make a Prediction ---
# The loaded pipeline applies its own preprocessing to the raw row before
# classifying: one call for the hard label, one for the class probabilities.
prediction, pred_prob = (
    loaded_pipeline.predict(input_df),
    loaded_pipeline.predict_proba(input_df),
)



In [24]:

# --- 4. Display the Result ---
# .predict() returns an array with one label per input row; element 0 is the
# label of the single customer (1 = churn).
if prediction[0] == 1:
    churn_status = 'Churn'
else:
    churn_status = 'No Churn'



In [25]:
# Row 0 of predict_proba holds this customer's class probabilities, ordered
# as [prob_no_churn, prob_churn].
customer_probs = pred_prob[0]
prob_no_churn = customer_probs[0]
prob_churn = customer_probs[1]

print("\n--- Prediction for New Customer ---")
print(f"Predicted Outcome: {churn_status}")
print(f"Probability of Not Churning: {prob_no_churn:.2%}")
print(f"Probability of Churning: {prob_churn:.2%}")


--- Prediction for New Customer ---
Predicted Outcome: Churn
Probability of Not Churning: 6.66%
Probability of Churning: 93.34%
