##   Problem Definition
Goal: Predict whether a customer will churn (leave the service) based on their profile and service usage.


# 1. Importing  Basic libraries like:
pandas,numpy,seaborn,matplotlib

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns




### For handling class imbalance
from imblearn.over_sampling import SMOTE,

Libraries for Preprocessing and model building 

### pip install imbalanced-learn,

from imblearn.over_sampling import SMOTE


#### from imblearn.over_sampling import SMOTE
## What is SMOTE?
SMOTE stands for Synthetic Minority Over-sampling Technique.

It helps balance the dataset by creating synthetic samples of the minority class (in our case, Churn = Yes) instead of just duplicating rows.
When to Use SMOTE:
After encoding and scaling

Only on training data (not test data!)

Best used with models like Logistic Regression, Random Forest, etc.

### train_test_split:
Splits your data into training and testing sets
### cross_val_score:	
Performs cross-validation to test model robustness
### Use of DecisionTreeClassifier:
To build a model that splits decisions based on feature values
#### Metric,	Tells You...,	When to Use:
accuracy_score	,Overall correctness	,Balanced datasets

confusion_matrix,	Detailed error types (FP, FN, etc.),	Any classification

classification_report,	(Precision, Recall, F1 per class),	Imbalanced datasets like churn

In [None]:
# For encoding categorical columns
from sklearn.preprocessing import LabelEncoder

# For handling class imbalance
from imblearn.over_sampling import SMOTE

# For splitting data & validating model
from sklearn.model_selection import train_test_split, cross_val_score

# For training models
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# For evaluating model performance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# For saving/loading trained model
import pickle


# 2. Data Loading and Understanding

In [None]:
#load Telco csv data to pandas Dataframe
df=pd.read_csv("Telco-Customer-Churn.csv")
df

In [None]:
df.shape

In [None]:
df.head()

In [None]:
pd.set_option("display.max_columns",None) 
# Show all columns in output

In [None]:
df.info()

In [None]:
#dropping column customer iD this is not required for churn model,


# Why is customerID not required?
# Because it is just an identifier, not a feature that influences churn.
df =df.drop(columns=["customerID"])
df

In [None]:
# printing the unique values in all columns

numerical_feature_list = ["tenure", "MonthlyCharges", "TotalCharges"] #used this because we don't want any numerical value there but only categorical value
for col in df.columns:
    if col not in numerical_feature_list:
        print(col,df[col].unique())
        print("-"*50)#It prints a horizontal line of 50 dashes:
# printing the unique values in all columns but manuaally
# print(df["gender"].unique())     can type for all but might be time consuming so we used for loop
    

In [None]:
df.info()

In [None]:
#converting object type data of total charges to int or float64
df["TotalCharges"] = df["TotalCharges"].astype(float) #will give error 

In [None]:
# so to avoid error we will do
df[df["TotalCharges"]==" "]


In [None]:
# len(df[df["TotalCharges"]==" "]) #or  # Or df["TotalCharges"].eq(" ").sum() # Check how many blank values exist

In [None]:
# df["TotalCharges"].replace({" ","0.0"}) # will depricated so use: # Replace blanks with NaN first (cleaner than replacing with 0)
# df["TotalCharges"]=df["TotalCharges"].replace(" ", pd.NA)
# Check which values are causing the issue


In [None]:
#step1 
# List all unique problematic values in the column
invalid_values = df[~df["TotalCharges"].astype(str).str.replace('.', '', 1).str.isdigit()]
print("Invalid TotalCharges rows:")
print(invalid_values["TotalCharges"].unique())


##  3.  Cleaning of data

In [None]:
#Step 2: Remove Only Invalid Entries
# Clean and convert safely .Ensured all values are strings (even if they were NaN or numbers).
# .str.strip() removes any spaces like " 45.6 " → "45.6".


df["TotalCharges"] = df["TotalCharges"].astype(str).str.strip()

# Remove known invalid patterns
df = df[~df["TotalCharges"].isin(["", " ", "<NA>", "nan", "NaN"])]

# Now convert to float safely
df["TotalCharges"] = df["TotalCharges"].astype(float)

# Reset index to keep things clean
df.reset_index(drop=True, inplace=True)

# Final check
print("✅ Cleaned! Data type:", df["TotalCharges"].dtype)


In [None]:
# Keep only rows where TotalCharges is numeric
df = df[df["TotalCharges"].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
df["TotalCharges"] = df["TotalCharges"].astype(float)


In [None]:
df.info()

### Distribution of target column:

Churn column (whether a customer has left or stayed)

In [None]:
#checking the distribution of target column

print(df["Churn"].value_counts())   #Run this to see how many customers churned vs. stayed:

# Insights:-
1. Customer ID rempved as it is not required for modelling.
2. No missing values in the dataset.
3. Missing values in TotalCharges column were replaced with 0
4. class imbalance identified in the target. this imbalance will be handled using techniques like SMOTE during model training.

#### Class imbalance becomes critical when we move to the Modeling phase.

## Here's how:
Problem: If you train a model without handling imbalance, it may predict mostly 'No' (majority class), and still get high accuracy — but it'll fail to detect churners.


#### solution:Apply techniques like:

SMOTE (Synthetic Minority Oversampling Technique) from imblearn

Adjusting class weights in classifiers

Undersampling the majority class (less common)




# 3.Exploratory Data Analysis (EDA):-


In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.head(2)

In [None]:
df.describe()

In [None]:
df.isnull().sum()

 ## Numerical Feature Analysis:
understand the distribution of Numerical features

In [None]:
def plot_histogram(df, column_name):
    sns.histplot(df[column_name], kde=True, color='skyblue', edgecolor='black')
    plt.title(f"Distribution of {column_name}")

    # Calculate mean and median values for the column
    col_mean = df[column_name].mean()
    col_median = df[column_name].median()

    # Add vertical lines for mean and median
    plt.axvline(col_mean, color="red", linestyle="--", label=f"Mean: {col_mean:.2f}")
    plt.axvline(col_median, color="green", linestyle="--", label=f"Median: {col_median:.2f}")

    # Label axes
    plt.xlabel(column_name)
    plt.ylabel("Count")
    plt.legend()
    plt.show()


In [None]:
plot_histogram(df, "MonthlyCharges")


In [None]:
plot_histogram(df, "TotalCharges")


In [None]:
plot_histogram(df, "tenure")


## what is scaling 
Bring features to the same scale
### Why?
Needed for distance-based models
### Where?
Logistic, SVM, KNN, Neural Nets
### How?
StandardScaler, MinMaxScaler etc.
### Advantage?	
    Fast convergence, balanced input
### Disadvantage?	
    Harder interpretation, not always needed

#### But scaling is important — especially if you're going to use:

Logistic Regression

SVM

KNN

Neural Networks

For tree-based models, scaling is not required.





In [None]:
from sklearn.preprocessing import StandardScaler

# select only numerical columns
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


### Detect Outliers in Numeric Columns by using **Boxplot**
Boxplots help us visually identify extreme values in features like:

MonthlyCharges,TotalCharges,tenure

## Why use a boxplot here?
Purpose	:Why it helps in this project


Detect Outliers	:Outliers can mislead models


Understand Spread:	Helps decide scaling or transformation


Compare Churn Groups:	See how churners differ from non-churners


Visual Summary	Quick view of median, IQR, and data skewness


In [None]:
def plot_boxplot(df,column_name):
    sns.boxplot(df[column_name])
    plt.title("Boxplot of(column_name")
    plt.ylabel(column_name)
    plt.show()

In [None]:
plot_boxplot(df,"tenure")

In [None]:
plot_boxplot(df,"MonthlyCharges")

In [None]:
plot_boxplot(df,"TotalCharges")

## correlation heatmap for numerical columns:
A correlation heatmap is a colored grid that shows the correlation coefficient (from -1 to 1) between pairs of numerical features.
Correlation Value:	Meaning

+1:	 Perfect positive correlation,

0:   No correlation,

-1:	 Perfect negative correlation,

 


In [12]:
#correlation matrix - heatmap
sns.heatmap(df[["tenure","MonthlyCharges","TotalCharges"]].corr(),annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

ValueError: could not convert string to float: ' '

# Categorical feature - Analysis:-

In [None]:
df.columns

In [None]:
df.info()

 ## Analyzing categorical features using count plot :-
 After analyzing numerical features, it’s important to analyze categorical features — especially because many important columns like gender, Contract, InternetService, etc., are categorical in your churn dataset.
 ### summary for categorical fetures:
 
 Find all categorical columns (like gender, contract, etc.) from your dataset using .select_dtypes(include="object").

Manually add SeniorCitizen to the list because even though it's stored as numbers (0 or 1), it actually represents a category (Yes/No).

Save them in a list called obj_cols so that later you can:

Plot graphs (like bar charts)

Encode them for machine learning models

### Why This Is Important:
You need to treat categorical features differently than numerical ones in:

Exploratory Data Analysis (EDA),Data Preprocessing (like Label Encoding / OneHot Encoding),Model building


In [None]:
# Step 1: Identify categorical columns
obj_cols = df.select_dtypes(include="object").columns.to_list()
# Step 2: Add 'SeniorCitizen' to treat it like a categorical feature
obj_cols = ["SeniorCitizen"] + obj_cols


# Plot count plots for each categorical column
for col in obj_cols:
    sns.countplot(x=df[col])
    plt.title(f"Count plot of {col}")
    plt.show()

## From above graph we observed ,Class Imbalance in Target Variable (Churn)
The Churn column is imbalanced – most customers did not churn.

Example: 75% "No", 25% "Yes"

### Why It's a Problem:
If we don’t handle it, models like Logistic Regression, Random Forest, etc. may:

Predict mostly “No Churn” to get high accuracy.

But miss actual churners, which is what businesses care about.

#### How to Handle It:

SMOTE:	Generates synthetic examples for the minority class (Churn = Yes)

Class weights:	Give more importance to the minority class

Evaluation Metrics	:Use F1-Score, Recall, and AUC instead of just Accuracy

# Data Preprocessing
Data preprocessing is the cleaning and preparation of raw data so that machine learning models can understand and use it.
## Why is it needed?
Because real-world data is often:Messy,Incomplete,Not in the right format (e.g., text instead of numbers).Without preprocessing, your model may give wrong predictions or fail to work.
## How it's done (Steps):
Remove irrelevant columns (like IDs),Handle missing values,Convert data types (e.g., string to float),Encode categorical variables (LabelEncoder/OneHotEncoder)

Scale numeric values (StandardScaler/MinMaxScaler),Handle imbalance (SMOTE),Split data into training & testing sets

### Advantages:
Better accuracy,Cleaner input for ML models,Avoids errors in training

### Disadvantages:
Can be time-consuming,Needs careful handling (mistakes can affect results)

In [13]:
df.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


## Label encoding of target column

In [14]:
df["Churn"] = df["Churn"].replace({"Yes":1,"No":0})

  df["Churn"] = df["Churn"].replace({"Yes":1,"No":0})


In [15]:
df.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1


In [16]:
df["Churn"].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

## Label encoding for categorical features:


In [17]:
#Identifying columns with object datatype
obj_columns=df.select_dtypes(include="object").columns
obj_columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'TotalCharges'],
      dtype='object')

Note: we’ve already encoded Churn, so it won’t be in this list anymore.



# Label Encoding with Encoder Persistence
 Why It’s Done (Purpose):
Machine Learning models need all features in numerical form. Categorical columns like "gender" or "InternetService" are text — which models can’t interpret.

Also, to ensure the same encoding logic is applied later (on new unseen data or in deployment), we must save the encoders.
### Benefits
Ensures compatibility between training and prediction

Speeds up model deployment

Enables consistent results across environments
### When Not to Use LabelEncoder
For non-ordinal categorical features with more than 2 classes, prefer One-Hot Encoding.

Label encoding imposes an order, which might mislead some algorithms (like linear regression).

#### we are using tree-based models (like Decision Tree, XGBoost) which handle label encoding well.

You've saved the encoders — which is best practice for production-ready pipelines.

## summary for given below code :
Label Encoding with Encoder Saving
In this step, we:

Identified all categorical columns (data type = object)

Applied Label Encoding to convert text data (e.g., "Male", "Female") into numbers (e.g., 0, 1)

Stored each encoder used for every column in a dictionary

Saved all encoders using pickle so we can reuse the same logic later (especially during deployment)

✅ This ensures our machine learning model can understand the data, and we maintain consistency during training and prediction.

In [18]:
#initialize a dictionary to save the encoders:
encoders={}

#apply label encoding and store the encoders
for column in obj_columns:
    label_encoder=LabelEncoder()
    df[column]=label_encoder.fit_transform(df[column])
    encoders[column]=label_encoder


#save the encoders to a pickle file
with open("encoders.pkl","wb") as f:
    pickle.dump(encoders,f)

In [19]:
encoders

{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder(),
 'TotalCharges': LabelEncoder()}

###  Encoders Summary

We applied `LabelEncoder` to all categorical (object) columns.  
The `encoders` dictionary now stores a separate encoder for each column.  
This ensures consistent label transformation during both training and prediction.

Example:
- `gender`: {'Female' → 0, 'Male' → 1}
- `Partner`: {'No' → 0, 'Yes' → 1}
- ...and so on for all 15 object columns.


In [20]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,925,1


# Trainig and Test Data Split
Train your model on one portion of the data

Test it on unseen data to evaluate performance

Prevent overfitting and ensure fair evaluation



In [21]:
# splitting the fetures and target
X=df.drop(columns=["Churn"])    #x → all features (input columns)

y=df["Churn"]   #y → the target variable (Churn)


###  Train-Test Split

We split the dataset into training and test sets using `train_test_split`:

- 80% training data (`X_train`, `y_train`)
- 20% testing data (`X_test`, `y_test`)

- Set `random_state=42` to ensure reproducibility


In [22]:
# split datasets into trainig and test data
X_train,X_test, y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)  
# test_size=0.2	:-20% of the data is used for testing, 80% for training
# random_state=42:	Ensures you get the same split every time you run the code,it can be different 

In [23]:
y_train.shape

(5634,)

In [24]:
y_train.value_counts()   # it is imbalance so we use SMOTE

Churn
0    4138
1    1496
Name: count, dtype: int64

###  What is SMOTE?

SMOTE (Synthetic Minority Over-sampling Technique) is used to handle class imbalance in classification problems.

Instead of duplicating minority class samples, it creates **synthetic examples** by interpolating between existing ones.

This helps the model learn the patterns of both majority and minority classes effectively.

📌 We apply SMOTE **only to the training set**.

**Advantages:**
- Improves model performance on imbalanced data
- Avoids overfitting caused by simple duplication

**Disadvantages:**
- Might generate noisy samples if classes overlap

###  How SMOTE works?
Instead of just copying minority class samples (which can cause overfitting), SMOTE:

Identifies neighbors of a minority sample.

Interpolates (creates new synthetic samples) between them.

Adds these new samples to the training data.

Now the model sees a balanced view and learns better!

### ✅ SMOTE is applied only on the training data, not test data.


In [25]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE on the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the new shape of y after SMOTE

# print("Before SMOTE:", y_train)
print("After SMOTE:", y_train_smote.shape)



After SMOTE: (8276,)


In [26]:
print(y_train_smote.value_counts())

Churn
0    4138
1    4138
Name: count, dtype: int64


# 5. Model Trainig:
Trainig with default hyperparameters

Model training is the process where a machine learning algorithm learns patterns and relationships from historical data (called the training set) to make predictions or classifications on new, unseen data.


Here:
model.fit() is the training function.

It learns from X_train_smote and y_train_smote (your balanced training data).

Now the model can predict churn for new customers.
#### Why Training is Important?
This is where your model learns to make decisions.

The quality of training directly affects how accurate your predictions will be.

#### Behind the scenes:
Different algorithms use different methods:

Logistic Regression: Learns weights for linear decision boundary.

Decision Tree: Learns best splits.

XGBoost: Learns from gradients and combines multiple trees.




In [27]:
#dictionary of models
from sklearn.ensemble import RandomForestClassifier

models={
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
    
    
}

In [28]:
#dictionary to store the cross validations results
cv_scores={}

#perfrom 5-Folds Cross validations for each model using loop
for model_name,model in models.items():
    
    # print(model_name)
    # print(model)
    # print(","*50
    print(f"Training {model_name} with default parameter")
    scores=cross_val_score(model,X_train_smote,y_train_smote,cv=5,scoring="accuracy")
  
    cv_scores[model_name]=scores
    print(f"{model_name} cross-validation accuracy:{np.mean(scores):.2f}")
    print(","*70)

Training Decision Tree with default parameter
Decision Tree cross-validation accuracy:0.79
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Training Random Forest with default parameter
Random Forest cross-validation accuracy:0.84
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Training XGBoost with default parameter
XGBoost cross-validation accuracy:0.83
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


   #### scores=cross_val_score(model,X_train_smote,y_train_smote,cv=5,scoring="accuracy")
    This is the core operation:

Trains the current model using 5-fold cross-validation

X_train_smote, y_train_smote: Training data after balancing with SMOTE

cv=5: Splits the data into 5 folds

scoring="accuracy": Uses accuracy as the evaluation metric

It returns 5 accuracy scores (one from each fold).
## Why Is This Important?
Cross-validation gives a more reliable estimate of model performance than a single train-test split.

Prevents overfitting and underfitting detection early.

Helps in model comparison under similar training conditions.

In [31]:
cv_scores

# structure of cv_scores
# Each key in the dictionary is the name of the model (like "Decision Tree" or "XGBoost"), and the value is the list of 5 accuracy
# scores returned by cross_val_score() — one for each fold in 5-fold cross-validation.


{'Decision Tree': array([0.68417874, 0.72145015, 0.83141994, 0.84229607, 0.84773414]),
 'Random Forest': array([0.72584541, 0.76797583, 0.90876133, 0.89667674, 0.90030211]),
 'XGBoost': array([0.69746377, 0.74259819, 0.90996979, 0.89728097, 0.90513595])}

### 🔎 Understanding `cv_scores`

- `cv_scores` is a dictionary that stores the 5-fold cross-validation accuracy scores for each ML model.
- Key = Model Name (e.g., "XGBoost")
- Value = List of 5 accuracy scores from cross-validation
- Helps compare models based on their average performance.

This allows us to choose the most promising model for final evaluation on test data.
### RandomForest givrs the highest accuracy compared to other models with default parameters

In [40]:
# Initialize the model
rfc = RandomForestClassifier(random_state=42)
                            
# Train the model on the SMOTE-resampled data

rfc.fit(X_train_smote,y_train_smote)

# 6. Model Evaluation
Model Evaluation is the process of checking how well your machine learning model performs on unseen data (the test set).
In other words, you test how well your model generalizes beyond the data it was trained on.
### Why Do We Evaluate a Model?
Because just training a model isn’t enough — we need to make sure:

It isn’t overfitting (too specific to training data)

It performs well on new, real-world data

We choose the best algorithm and tuned parameters before deploying
### When Do We Evaluate the Model?
After training your model on the training data, you:

Use it to predict on the test data

Compare the predictions with the actual test labels

Use evaluation metrics to measure performance





In [37]:
# This shows the distribution of actual labels in your test set:

y_test.value_counts()
# 0    1036  # Customers who did not churn (majority class)
# 1     373  # Customers who churned (minority class)
# This confirms the test set is also imbalanced, just like the full dataset.

Churn
0    1036
1     373
Name: count, dtype: int64

In [55]:
#evaluate on test data

 # Making Predictions
y_test_pred = rfc.predict(X_test)
# You used the final trained model to predict on X_test.

# This generates predictions (y_test_pred) which are then compared with actual values (y_test).



print("Accuracy Score:\n", accuracy_score(y_test,y_test_pred))
# Be careful: With imbalanced datasets, high accuracy can be misleading. For example, predicting all 0 would give ~73% accuracy here (since 1036 out of 1409 are 0).
print("Confusion Matrix:\n", confusion_matrix(y_test,y_test_pred))
print("Classification Report:\n", classification_report(y_test,y_test_pred))

Accuracy Score:
 0.7757274662881476
Confusion Matrix:
 [[877 159]
 [157 216]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1036
           1       0.58      0.58      0.58       373

    accuracy                           0.78      1409
   macro avg       0.71      0.71      0.71      1409
weighted avg       0.78      0.78      0.78      1409



Metric:	Meaning

Precision:	Out of predicted 1s, how many were actually 1 (churn)?

Recall:	Out of actual 1s, how many were correctly predicted as 1?

F1-score	:Harmonic mean of Precision and Recall (balance between the two)

Support	:Number of actual instances for each class (0 and 1)


In [56]:
#save the trained model as a pickle file

model_data={"model":rfc,"feature_names":X.columns.tolist}

with open ("customer_chunr_model.pkl","wb") as f:
    pickle.dump(model_data, f)

Explanation:

with open("customer_churn_model.pkl", "wb") as f:	Opens a new file named customer_churn_model.pkl in write-binary mode. This is where the model will be saved.


pickle.dump(rfc, f):	Dumps (saves) the trained Random Forest model (rfc) into the file f using the pickle module.

# 7. Load and saved model and build a predictive system 

In [59]:
#load the saved model and the encoders

with open ("customer_chunr_model.pkl","rb") as f:
    model_data=pickle.load(f)

loaded_model=model_data["model"]
feature_names=model_data["feature_names"]
print(loaded_model)
print(feature_names)

RandomForestClassifier(random_state=42)
<bound method IndexOpsMixin.tolist of Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')>


In [72]:
import pandas as pd
#  This creates a one-row DataFrame (input_data_df) from the new customer's details.

# The dictionary keys are column names; values are customer inputs.


input_data = {'gender': 'Female', 'SeniorCitizen': '0', 'Partner': 'Yes', 'Dependents': 'No', 'tenure': '1', 'PhoneService': 'No', 'MultipleLines': 'No phone service', 'InternetService': 'DSL', 'OnlineSecurity': 'No', 'OnlineBackup': 'Yes', 'DeviceProtection': 'No', 'TechSupport': 'No', 'StreamingTV': 'No', 'StreamingMovies': 'No', 'Contract': 'Month-to-month', 'PaperlessBilling': 'Yes', 'PaymentMethod': 'Electronic check', 'MonthlyCharges': '29.85', 'TotalCharges': '29.85'}

### eature Encoding for Inference
In simple terms:

You're applying the same preprocessing steps (label encoding, etc.) to new/unseen data as you did during training, to prepare it for making predictions.

#### Specifically, this step is called:
Label Encoding for Inference

Transforming New Data with Trained Encoders

Inference-Time Feature Transformation

Applying Consistent Encoding

#### Why It Matters:
When training a machine learning model, you encode categorical features (like "Yes", "No", "Month-to-month") into numerical values.

At inference time (i.e., when using the model to predict), you must apply exactly the same encodings.

Otherwise, the model will get confused and return errors or incorrect results.



In [76]:

input_data_df=pd.DataFrame([input_data])
with open("encoders.pkl","rb") as f:
    encoders=pickle.load(f)


print(input_data_df.head())
#encode categorical features using saved encoders
for column,encoder in encoders.items():
    input_data_df[column]=encoder.transform(input_data_df[column])
     # This loop encodes each categorical column in your new input using the same encoder used during training.
# Applies the correct label encoder for that column.

# Replaces the string values in input_data_df with their numeric equivalents.


# It ensures consistency between training data and new input

   gender SeniorCitizen Partner Dependents tenure PhoneService  \
0  Female             0     Yes         No      1           No   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   

  PaperlessBilling     PaymentMethod MonthlyCharges TotalCharges  
0              Yes  Electronic check          29.85        29.85  


In [78]:
input_data_df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505


In [88]:
#make a prediction
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)
print(prediction)


#results
print(f"Prediction:{'Churn' if prediction[0]==1 else 'No Churn'}")
print(f"Prediction Probability:{pred_prob}")

[0]
Prediction:No Churn
Prediction Probability:[[0.82 0.18]]


## Why It’s Important:
Your model can only predict on numerical data — it doesn't understand strings like "Yes", "Month-to-month", etc.

This step ensures your new input matches the format your model expects.


### This entire flow is called:

*Inference*-Time Feature Encoding and Model Prediction
— making real-world predictions on new data using your trained and saved ML model