In [2]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

In [3]:
DATA_PATH = "D:/CSE(DataScience)/Customer_Churn_Prediction/Dataset/Telco-Customer-Churn-dataset.csv"
SAVE_DIR = "D:/CSE(DataScience)/Customer_Churn_Prediction/models"
os.makedirs(SAVE_DIR, exist_ok=True)
RANDOM_STATE = 42

In [4]:
df = pd.read_csv(r"D:\CSE(DataScience)\Customer_Churn_Prediction\Datasets\Telco-Customer-Churn-dataset.csv")
print("Original shape:", df.shape)

Original shape: (7043, 21)


In [5]:
# Discard the customerID column when the frame has one; with errors="ignore"
# nothing happens if the column is already gone.
df = df.drop(columns=["customerID"], errors="ignore")

In [6]:
# Parse TotalCharges as numbers. With errors="coerce", entries that cannot be
# parsed become NaN instead of raising.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges


In [7]:
# Remove every row that contains a missing value, then renumber the index from
# zero so that labels and positions line up again.
df = df.dropna()
df = df.reset_index(drop=True)
print("Shape after cleaning:", df.shape)

Shape after cleaning: (7032, 20)


In [8]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# applying the bare mapping a second time (when the column already holds 0/1)
# would silently wipe the target. Skip the work when the column is already
# encoded, and fail loudly on any label the mapping does not know.
churn_codes = {"Yes": 1, "No": 0}
if not df["Churn"].isin(list(churn_codes.values())).all():
    churn_encoded = df["Churn"].map(churn_codes)
    if churn_encoded.isna().any():
        unexpected = sorted(df.loc[churn_encoded.isna(), "Churn"].astype(str).unique())
        raise ValueError(f"Unexpected Churn labels: {unexpected}")
    df["Churn"] = churn_encoded.astype("int64")

In [9]:
# Split the frame into the target vector (y) and the feature matrix (X)
y = df["Churn"]
X = df.drop(columns=["Churn"])

In [10]:
# Derive the two feature groups from the column dtypes instead of listing names by hand
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_features = list(X.select_dtypes(include=["object"]).columns)

In [11]:
# Show which columns ended up in each feature group
for label, names in (
    ("Numeric features:", numeric_features),
    ("Categorical features:", categorical_features),
):
    print(label, names)

Numeric features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [12]:
# ---------- 3. Train/Test split (stratified) ----------
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# the fixed seed so it is repeatable.
split_options = {"test_size": 0.20, "stratify": y, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# ---------- 4. Preprocessor (ColumnTransformer) ----------
# Numeric columns go through StandardScaler; categorical columns are one-hot
# encoded into a dense array, with unknown categories ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]

# Columns outside the two lists are dropped; sparse_threshold=0 keeps the
# combined output dense.
preprocessor = ColumnTransformer(
    transformers=column_steps,
    sparse_threshold=0,
    remainder="drop",
)

In [16]:
# Define one pipeline per model.
# Each pipeline gets its own clone of the preprocessor: Pipeline does not copy
# its steps, so passing the same ColumnTransformer object to all three would
# make every fit() refit the preprocessing held by the other pipelines too.
# SVC(probability=True) shuffles data for its probability estimates, so it is
# seeded with RANDOM_STATE to keep predict_proba reproducible between runs.
classifiers = {
    "knn": KNeighborsClassifier(),
    "svm": SVC(probability=True, random_state=RANDOM_STATE),
    "logistic_regression": LogisticRegression(max_iter=1000),
}
pipelines = {
    model_name: Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", classifier),
    ])
    for model_name, classifier in classifiers.items()
}

In [17]:
# Fit each pipeline on the training split, report how it does on the held-out
# test split, then write the fitted pipeline to SAVE_DIR.
for key in pipelines:
    pipe = pipelines[key]
    filename = os.path.join(SAVE_DIR, f"churn_pipeline_{key}.pkl")

    print("\nTraining:", key)
    pipe.fit(X_train, y_train)

    # Score on the held-out test rows
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"{key} Test Accuracy: {acc:.4f}")
    print("Classification Report:\n", report)

    # Persist the fitted pipeline (preprocessing + classifier) as one file
    joblib.dump(pipe, filename)
    print("Saved pipeline to:", filename)

print("\nAll pipelines trained and saved.")


Training: knn
knn Test Accuracy: 0.7612
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.84      1033
           1       0.55      0.58      0.56       374

    accuracy                           0.76      1407
   macro avg       0.70      0.70      0.70      1407
weighted avg       0.77      0.76      0.76      1407

Saved pipeline to: D:/CSE(DataScience)/Customer_Churn_Prediction/models\churn_pipeline_knn.pkl

Training: svm
svm Test Accuracy: 0.7918
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.49      0.56       374

    accuracy                           0.79      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

Saved pipeline to: D:/CSE(DataScience)/Customer_Churn_Prediction/models\churn_pipeline_svm.pkl

Training: logistic_regres