In [1]:
! pip install scikit-learn pandas numpy joblib


Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os


In [3]:
# Step 2: Load train, validation, and test datasets
train_path = r"D:\telco-customer-churn\train.csv"
val_path   = r"D:\telco-customer-churn\validation.csv"
test_path  = r"D:\telco-customer-churn\test.csv"

train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)
test_df  = pd.read_csv(test_path)

# Show dataset shapes
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

# Display first few rows of train data
train_df.head()


Train shape: (4225, 52)
Validation shape: (1409, 52)
Test shape: (1409, 52)


Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Category,Churn Reason,Churn Score,City,CLTV,Contract,Country,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
0,72,4,19.44,,,51,San Mateo,4849,Two Year,United States,...,25,2191.15,0,486.0,0.0,2677.15,0,1,94403,0
1,27,59,45.62,,,27,Sutter Creek,3715,Month-to-Month,United States,...,35,3418.2,0,1596.7,0.0,5014.9,1,1,95685,0
2,59,0,16.07,,,59,Santa Cruz,5092,Month-to-Month,United States,...,46,851.2,0,739.22,0.0,1590.42,0,0,95064,0
3,25,27,0.0,,,49,Brea,2068,One Year,United States,...,27,1246.4,30,0.0,0.0,1276.4,1,0,92823,0
4,31,21,17.22,Dissatisfaction,Network reliability,88,San Jose,4026,One Year,United States,...,58,3563.8,0,998.76,0.0,4562.56,0,1,95117,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Target column (based on Telco dataset it's usually "Churn")
target_col = "Churn"

# Separate features (X) and target (y)
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_val, y_val     = val_df.drop(columns=[target_col]), val_df[target_col]
X_test, y_test   = test_df.drop(columns=[target_col]), test_df[target_col]

# Identify categorical & numerical columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
numerical_cols   = X_train.select_dtypes(include=np.number).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# Preprocessing: OneHotEncode categorical + Scale numerical
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)


Categorical columns: ['Churn Category', 'Churn Reason', 'City', 'Contract', 'Country', 'Customer ID', 'Customer Status', 'Gender', 'Internet Type', 'Lat Long', 'Offer', 'Payment Method', 'Quarter', 'State']
Numerical columns: ['Age', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges', 'Churn Score', 'CLTV', 'Dependents', 'Device Protection Plan', 'Internet Service', 'Latitude', 'Longitude', 'Married', 'Monthly Charge', 'Multiple Lines', 'Number of Dependents', 'Number of Referrals', 'Online Backup', 'Online Security', 'Paperless Billing', 'Partner', 'Phone Service', 'Population', 'Premium Tech Support', 'Referred a Friend', 'Satisfaction Score', 'Senior Citizen', 'Streaming Movies', 'Streaming Music', 'Streaming TV', 'Tenure in Months', 'Total Charges', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Refunds', 'Total Revenue', 'Under 30', 'Unlimited Data', 'Zip Code']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic Regression pipeline
log_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Train the pipeline
log_reg_pipeline.fit(X_train, y_train)

# Validate on validation set
y_val_pred = log_reg_pipeline.predict(X_val)

# Show performance
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))


Validation Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the pipeline
rf_pipeline.fit(X_train, y_train)

# Validate on validation set
y_val_pred_rf = rf_pipeline.predict(X_val)

# Show performance
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred_rf))


Validation Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [7]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
param_grid_lr = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["liblinear", "lbfgs"]
}

# Pipeline with preprocessing + Logistic Regression
log_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# GridSearch
grid_lr = GridSearchCV(log_reg_pipeline, param_grid_lr, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid_lr.fit(X_train, y_train)

print("Best Params (Logistic Regression):", grid_lr.best_params_)
print("Best CV Accuracy:", grid_lr.best_score_)

# Validate on validation set
y_val_pred_lr = grid_lr.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred_lr))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred_lr))


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params (Logistic Regression): {'classifier__C': 0.1, 'classifier__solver': 'liblinear'}
Best CV Accuracy: 1.0
Validation Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [8]:
# Define parameter grid for Random Forest
param_grid_rf = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 5]
}

# Pipeline with preprocessing + Random Forest
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# GridSearch
grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid_rf.fit(X_train, y_train)

print("Best Params (Random Forest):", grid_rf.best_params_)
print("Best CV Accuracy:", grid_rf.best_score_)

# Validate on validation set
y_val_pred_rf = grid_rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred_rf))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params (Random Forest): {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best CV Accuracy: 1.0
Validation Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [9]:
import joblib

# Compare validation performance and choose the best model
best_model = grid_lr if grid_lr.best_score_ >= grid_rf.best_score_ else grid_rf
print("Selected Best Model:", type(best_model.best_estimator_["classifier"]).__name__)

# Evaluate on test set
y_test_pred = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Test Set):\n", classification_report(y_test, y_test_pred))

# Save the best pipeline
joblib.dump(best_model.best_estimator_, "best_churn_model.pkl")
print("✅ Model exported as best_churn_model.pkl")


Selected Best Model: LogisticRegression
Test Accuracy: 1.0

Classification Report (Test Set):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

✅ Model exported as best_churn_model.pkl
