In [41]:
# =============================
# Telco Customer Churn Project
# =============================

# --- Imports ---
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE


# -----------------------------
# Load dataset
# -----------------------------
print("\n--- Loading Dataset ---")
df = pd.read_csv("Telco_Customer_Churn.csv")
print(df.info())
print(df.head())


# -----------------------------
# Data Cleaning
# -----------------------------
print("\n--- Data Cleaning ---")

# Convert TotalCharges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Handle missing values
df["TotalCharges"].fillna(df["TotalCharges"].mean(), inplace=True)

# Replace "No phone service" with "No"
df.replace({"No phone service": "No"}, inplace=True)

# Replace "No internet service" with "No" in related columns
internet_cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection",
                 "TechSupport", "StreamingTV", "StreamingMovies"]
for col in internet_cols:
    df[col].replace("No internet service", "No", inplace=True)

print("Remaining nulls in TotalCharges:", df["TotalCharges"].isnull().sum())


# -----------------------------
# Outlier Detection (Optional)
# -----------------------------
print("\n--- Outlier Detection ---")
num_cols = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
for col in num_cols:
    z = np.abs(stats.zscore(df[col]))
    if (z > 3).any():
        print(f"Possible outliers in {col}")
print("End of outlier check.")


# -----------------------------
# Encode Categorical Variables
# -----------------------------
X = df.drop(["Churn", "customerID"], axis=1)
y = LabelEncoder().fit_transform(df["Churn"])

for col in X.columns:
    if X[col].dtype == "object":
        X[col] = LabelEncoder().fit_transform(X[col])

print("\nEncoded feature sample:")
print(X.head())


# -----------------------------
# Train-test split (before SMOTE)
# -----------------------------
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# -----------------------------
# Apply SMOTE on training set
# -----------------------------
print("\n--- Applying SMOTE ---")
smt = SMOTE(random_state=42)
x_train_res, y_train_res = smt.fit_resample(x_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_train_res))


# -----------------------------
# Model comparison (Cross-validation on resampled data)
# -----------------------------
print("\n--- Cross Validation Scores (on balanced training set) ---")
print("Logistic Regression:", cross_val_score(LogisticRegression(solver="liblinear"), x_train_res, y_train_res, cv=3).mean())
print("SVC:", cross_val_score(SVC(), x_train_res, y_train_res, cv=3).mean())
print("Random Forest:", cross_val_score(RandomForestClassifier(n_estimators=50), x_train_res, y_train_res, cv=3).mean())


# -----------------------------
# Grid Search for Random Forest
# -----------------------------
print("\n--- Hyperparameter Tuning (Random Forest) ---")
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_features": ["sqrt", "log2", None],
    "max_depth": [2, 4, 6]
}
grido = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grido.fit(x_train_res, y_train_res)
print("Best Params:", grido.best_params_)


# -----------------------------
# Final Random Forest Model
# -----------------------------
best_model = RandomForestClassifier(**grido.best_params_)
best_model.fit(x_train_res, y_train_res)

# --- Evaluation on untouched test set ---
y_pred = best_model.predict(x_test)
print("\n--- Final Model Performance (Test Set) ---")
print(classification_report(y_test, y_pred))




--- Loading Dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace("No internet service", "No", inplace=True)


Logistic Regression: 0.7791871310931463
SVC: 0.6360224684081784
Random Forest: 0.8104751161328844

--- Hyperparameter Tuning (Random Forest) ---
Best Params: {'max_depth': 6, 'max_features': None, 'n_estimators': 50}

--- Final Model Performance (Test Set) ---
              precision    recall  f1-score   support

           0       0.88      0.78      0.82      1035
           1       0.53      0.70      0.60       374

    accuracy                           0.76      1409
   macro avg       0.70      0.74      0.71      1409
weighted avg       0.78      0.76      0.77      1409

