# Telco Churn Prediction using k-NN Algorithm

### STEP 1: Import libraries and Load Dataset

In [None]:
# Import libraries and dataset
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [None]:
# Read the raw customer table from CSV (path is relative to the working directory)
csv_path = "Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
# Preview the first rows
df.head()

In [None]:
# Structural overview: (rows, columns) first, then the per-column summary from info()
frame_shape = df.shape
print("Shape: ", frame_shape)
df.info()

##### Quick observations:
1. Mostly categorical features → need encoding.
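2. Only a few features are numeric → they must be scaled, because k-NN is distance-based and an unscaled feature with a large range would dominate the distance.
3. TotalCharges is stored as text rather than as a number → it needs to be converted to numeric and checked for missing values.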
4. customerID is just an identifier → drop it.
5. Dataset size (~7k) is OK for k-NN, but not huge.

### STEP 2: DATA PREPROCESSING AND EDA

In [None]:
# Target distribution: absolute counts next to relative frequencies
churn_col = df["Churn"]
churn_col.value_counts(), churn_col.value_counts(normalize=True)

In [None]:
# Remove customerID: it only identifies the row and is not used as a feature
df = df.drop("customerID", axis="columns")

In [None]:
# Convert TotalCharges to numeric and report (but do not fill) missing values
# errors="coerce" turns every entry that cannot be parsed as a number into NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

missing_totalcharges = df["TotalCharges"].isna().sum()
print("Missing TotalCharges count:", missing_totalcharges)

# Deliberately NOT filled here. A median taken over the whole dataset would
# include rows that later end up in the test split (data leakage). The NaNs are
# imputed by the median SimpleImputer inside the numeric preprocessing pipeline,
# which is fitted on training rows only (and re-fitted per fold during
# cross-validation). pandas aggregations such as mean() skip NaN by default,
# so the exploratory cells below are unaffected.


In [None]:
# Recode the target labels as integers: "Yes" -> 1, "No" -> 0
churn_codes = {"Yes": 1, "No": 0}
df["Churn"] = df["Churn"].map(churn_codes)
# Class proportions after recoding
df["Churn"].value_counts(normalize=True)

In [None]:
# Bar chart of the target classes to show how balanced (or not) they are
churn_counts = df["Churn"].value_counts()
ax = churn_counts.plot.bar(title="Churn distribution")
# Label the axes through the Axes object returned by the plot call
ax.set_xlabel("Churn (0=No, 1=Yes)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Compare the mean of each numeric feature between the two churn groups
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
df.groupby("Churn")[numeric_cols].mean()


In [None]:
# Split the frame into the target vector (y) and the feature matrix (X)
y = df["Churn"]
X = df.drop("Churn", axis="columns")

# Sanity-check the dimensions of both
X.shape, y.shape

In [None]:
# Identify numeric vs categorical columns
# Select by dtype family instead of by exact dtype name: "number" matches every
# integer/float width, and every remaining column is treated as categorical.
# Matching only "int64"/"float64"/"object" would leave a column with any other
# dtype (e.g. pandas' dedicated string dtype, int32, category) in neither list,
# and the ColumnTransformer (remainder="drop") would then silently discard it.
num_features = X.select_dtypes(include="number").columns.tolist()
cat_features = X.select_dtypes(exclude="number").columns.tolist()

print("numerical features are: ", num_features)
print("categorical features (count) are: ", len(cat_features))

#### STEP 4: Data Splitting

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn
# proportion the same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the size of each part and its churn rate (mean of the 0/1 target)
for label, value in [
    ("Train shape:", X_train.shape),
    ("Test shape:", X_test.shape),
    ("Train churn rate:", y_train.mean()),
    ("Test churn rate:", y_test.mean()),
]:
    print(label, value)


#### STEP 5: Model definition

In [None]:
# Per-type preprocessing pipelines.
# k-NN classifies by distance, so numeric columns are standardized to stop any
# single feature from dominating, and categorical columns are one-hot encoded
# so they can enter the distance computation as numbers.

# Numeric columns: fill gaps with the column median, then standardize
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" avoids an error on categories unseen during fit
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipe = Pipeline(steps=categorical_steps)


In [None]:
# Route each column group through its own pipeline; remainder="drop" discards
# any column that appears in neither feature list
column_routes = [
    ("num", num_pipe, num_features),
    ("cat", cat_pipe, cat_features),
]
preprocess = ColumnTransformer(transformers=column_routes, remainder="drop")

In [None]:
# End-to-end model: preprocessing followed by a 5-neighbour k-NN classifier.
# weights="distance" makes closer neighbours count more than distant ones.
# Bundling both steps in one Pipeline means a single fit/predict call runs both.
model_steps = [
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(n_neighbors=5, weights="distance")),
]
pipe = Pipeline(steps=model_steps)

# Display the assembled pipeline
pipe

#### STEP 6: Cross-validation baseline (ROC-AUC) 

In [None]:
# 5-fold cross-validated ROC-AUC, computed on the training split only so the
# test split stays untouched
cv_auc = cross_val_score(
    estimator=pipe, X=X_train, y=y_train, scoring="roc_auc", cv=5
)

# Summarize the per-fold scores by their mean and spread
mean_auc, std_auc = cv_auc.mean(), cv_auc.std()
print("CV ROC-AUC mean:", mean_auc)
print("CV ROC-AUC std:", std_auc)

### STEP 7: Model Training

In [None]:
# Fit the pipeline on the training split, then score the held-out test split
pipe.fit(X_train, y_train)

# Hard class labels feed the report and the confusion matrix;
# the second probability column (class 1 = churn) feeds ROC-AUC
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_proba)
print("Test ROC-AUC:", test_auc)

# Per-class precision/recall/F1
report_text = classification_report(y_test, y_pred)
print("\nClassification report:")
print(report_text)

# Counts of true labels against predicted labels
conf_mat = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:")
print(conf_mat)
