# **Importing Dependencies**

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Read the churn dataset; the relative path means the CSV must sit in the
# current working directory.
df = pd.read_csv("telco_customer_churn.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Size of the raw data as (rows, columns).
df.shape

In [None]:
# Preview the first rows to get a feel for the columns.
df.head()

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

In [None]:
# Column dtypes and non-null counts, to spot columns parsed with an unexpected type.
df.info()

In [None]:
# Drop customerID: it is an identifier and is not required for modelling.
df = df.drop(columns=["customerID"])

In [None]:
# Re-check the frame now that customerID has been removed.
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Distinct values present in the gender column.
print(df["gender"].unique())

In [None]:
# Distinct values present in the SeniorCitizen column.
print(df["SeniorCitizen"].unique())

In [None]:
# Print the distinct values of every column except the three continuous ones
# listed below; the remaining columns are the ones worth eyeballing value by value.

numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]

for col in df.columns:
  if col not in numerical_features_list:
    print(col, df[col].unique())
    # Visual separator between columns.
    print("-"*50)

In [None]:
# Number of missing (NaN) values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Sanity check after de-duplication: this should now be 0.
df.duplicated().sum()

In [None]:
# Rows where TotalCharges holds the single-space string " " instead of a number.
df[df["TotalCharges"]==" "]

In [None]:
# Number of rows whose TotalCharges is the single-space string.
len(df[df["TotalCharges"]==" "])

In [None]:
# Replace the single-space placeholder with "0.0" so the column can be cast to float.
# Only an exact " " is matched; any other non-numeric text would still fail the cast.
df["TotalCharges"] = df["TotalCharges"].replace({" ": "0.0"})

In [None]:
# Convert TotalCharges to float so it can be used as a numeric feature.
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
# Confirm the dtypes after cleaning (TotalCharges should now be float).
df.info()

In [None]:
# Re-check for missing values after the cleaning steps.
df.isnull().sum()

In [None]:
# Class distribution of the target column: how many rows fall in each Churn value.
df["Churn"].value_counts()

## Exploratory Data Analysis (EDA)

In [None]:
# Size of the cleaned data as (rows, columns).
df.shape

In [None]:
# Names of the columns that remain after cleaning.
df.columns

In [None]:
# Two sample rows of the cleaned frame.
df.head(2)

In [None]:
# Summary statistics again, now that TotalCharges has been cast to float.
df.describe()

In [None]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
# Churn has not been label-encoded yet at this point, so it is listed in cat_cols.
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(exclude=["object"]).columns

print("cat_cols", cat_cols)
print("num_cols", num_cols)



**Numerical Features - Analysis**

In [None]:
def plot_histogram(df, column_name):
    """Draw a histogram with a KDE curve for one column and mark its mean and median.

    Args:
        df: DataFrame that contains the column.
        column_name: Name of the column to plot.

    The figure is shown immediately; nothing is returned.
    """
    series = df[column_name]

    plt.figure(figsize=(5, 3))
    sns.histplot(series, kde=True)
    plt.title(f"Distribution of {column_name}")

    # One vertical reference line per statistic: (x position, colour, line style, legend label).
    reference_lines = (
        (series.mean(), "red", "--", "Mean"),
        (series.median(), "green", "-", "Median"),
    )
    for position, colour, dash, caption in reference_lines:
        plt.axvline(position, color=colour, linestyle=dash, label=caption)

    plt.legend()
    plt.show()

In [None]:
# Distribution of customer tenure.
plot_histogram(df, "tenure")

In [None]:
# Distribution of MonthlyCharges.
plot_histogram(df, "MonthlyCharges")

In [None]:
# Distribution of TotalCharges.
plot_histogram(df, "TotalCharges")

In [None]:
#box plot for numeric features 

def plot_boxplot(df, column_name):
    """Draw a vertical box plot of one column to expose its spread and outliers.

    Args:
        df: DataFrame that contains the column.
        column_name: Name of the column to plot; also used as the y-axis label.

    The figure is shown immediately; nothing is returned.
    """
    _, axis = plt.subplots(figsize=(5, 3))
    sns.boxplot(y=df[column_name], ax=axis)
    axis.set_title(f"Distribution of {column_name}")
    axis.set_ylabel(column_name)
    plt.show()

In [None]:
# Spread and outliers of tenure.
plot_boxplot(df, "tenure")

In [None]:
# Spread and outliers of MonthlyCharges.
plot_boxplot(df, "MonthlyCharges")

In [None]:
# Spread and outliers of TotalCharges.
plot_boxplot(df, "TotalCharges")

In [None]:
# Correlation heatmap of the numerical features. Strongly correlated pairs
# point to a potential multicollinearity issue for the linear model.

plt.figure(figsize=(8,4))
sns.heatmap(df[["tenure","MonthlyCharges","TotalCharges"]].corr(), annot=True)


**Categorical features - Analysis**


In [None]:
# NOTE(review): although this cell sits under the categorical-analysis heading,
# it performs no analysis. It one-hot encodes the features, and the X / y /
# num_cols / cat_cols it creates are all reassigned again in later cells.
# Churn is still the raw string column here.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
# Separate features first
X = df.drop(columns=["Churn"])
y = df["Churn"]

cat_cols = X.select_dtypes(include=["object"]).columns

# One-hot encode the object columns, dropping the first level of each.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X.head(2)


## Data Preprocessing

In [None]:
# Label-encode the target column in place: "Yes" -> 1, "No" -> 0.
# NOTE: Series.map turns every value missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column wipes the target.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df.head(2)


In [None]:
# Class counts of the encoded target (1 = churned, 0 = stayed).
print(df["Churn"].value_counts())

In [None]:
# Manual one-hot encoding of the features with pandas.
# NOTE(review): the X built here is overwritten by a later cell, which rebuilds
# X from df and lets the ColumnTransformer do the encoding, so this result
# never reaches the models.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
# Separate the features from the target first.
X = df.drop(columns=["Churn"])
y = df["Churn"]

cat_cols = X.select_dtypes(include=["object"]).columns

# One-hot encode the object columns, dropping the first level of each.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X.head(2)


In [None]:
# Display df itself; get_dummies returned a new frame, so df keeps its original columns.
df

In [None]:
# Split into training (80%) and test (20%) data.
# NOTE(review): no random_state is set, so the split differs on every run, and
# these four variables are reassigned by the split in the next cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:

# Build the train/test split and one imblearn pipeline per candidate model.
# The categorical columns stay as raw strings here; each pipeline encodes them
# itself, so the encoder is fitted on the training rows only.
from sklearn.base import clone

# Single seed shared by the split, SMOTE and the stochastic classifiers so the
# reported metrics are the same on every run of the notebook.
RANDOM_STATE = 42

# Separate the features from the target.
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Stratify on the target so train and test keep the same churn rate; the
# target is treated as imbalanced (hence the SMOTE step below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Identify columns
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Preprocessing block: scale the continuous columns, one-hot encode the
# object columns. remainder="passthrough" keeps any column that is neither in
# num_cols nor object-typed; with the default remainder="drop" such a column
# would be discarded silently.
# NOTE(review): presumably this affects SeniorCitizen if it is integer-coded;
# confirm against df.info().
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
    ],
    remainder="passthrough",
)

# Candidate classifiers, keyed by the display name used in the reports.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE
    ),
}

# Define models. Every pipeline gets its own clone of the preprocessor so that
# fitting one pipeline never refits a transformer another pipeline relies on.
models = {
    model_name: Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("classifier", classifier),
    ])
    for model_name, classifier in classifiers.items()
}


In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Class counts in the training labels before any resampling.
y_train.value_counts()
# SMOTE sits inside the imblearn Pipeline, so it only resamples while a
# pipeline is being fitted (pipeline.fit()); y_train itself is never modified.
# To inspect the balanced data, SMOTE has to be applied manually, for example:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE()
# X_train_sm, y_train_sm = smote.fit_resample(preprocessor.fit_transform(X_train), y_train)
# print("After SMOTE:", y_train_sm.value_counts())

## Model Training and Evaluation

In [None]:

# Fit every candidate pipeline on the training split, report its metrics on
# the test split, and remember the pipeline with the highest test accuracy.
# NOTE(review): the winner is chosen by accuracy measured on the test split.
# With an imbalanced target, F1 or recall is usually the more telling
# criterion, and selecting on the test set makes its score optimistic;
# consider cross-validation on the training data instead.
best_model = None
best_acc = 0

for name, pipeline in models.items():
    print('Model:', name)
    # SMOTE resamples only inside fit(); predict() scores the untouched test rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics (the positive class is 1, i.e. Churn == "Yes" before encoding)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    # Rows are the actual classes, columns the predicted classes.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    # Strict ">" keeps the earlier model when two models tie on accuracy.
    if acc > best_acc:
        best_acc = acc
        best_model = pipeline

    # Plot Confusion matrix
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Persist the winning fitted pipeline together with the column names of
# X_train, written as two pickle files in the current working directory.
import joblib
joblib.dump(best_model, "model_pipeline.pkl")
joblib.dump(X_train.columns.tolist(), "feature_columns.pkl")
