In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle

# ---------------------
# 1. LOAD DATA
# ---------------------
# Both CSV locations are relative to the current working directory.
train_path = "../data/customer_churn_dataset-training-master.csv"
test_path = "../data/customer_churn_dataset-testing-master.csv"

# Echo the resolved absolute paths so a wrong working directory is easy to spot.
for label, csv_path in (("Train path:", train_path), ("Test path :", test_path)):
    print(label, os.path.abspath(csv_path))

# Read the training file first, then the testing file.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

for label, frame in (
    ("Original train shape:", train_data),
    ("Original test shape:", test_data),
):
    print(label, frame.shape)

# Name of the label column; adjust here if the dataset names it differently.
target_col = "Churn"

# ---------------------
# 2. DROP ROWS WHERE TARGET (Churn) IS NaN
# ---------------------
# A row without a label cannot be used for fitting or scoring, so keep only
# the rows whose target value is present.
train_has_label = train_data[target_col].notna()
train_data = train_data[train_has_label]

test_has_label = test_data[target_col].notna()
test_data = test_data[test_has_label]

for message, frame in (
    ("After dropping NaN Churn -> train shape:", train_data),
    ("After dropping NaN Churn -> test shape:", test_data),
):
    print(message, frame.shape)

# ---------------------
# 3. DROP TARGET + CUSTOMERID, KEEP ONLY NUMERIC FEATURES
# ---------------------
id_col = "CustomerID"

# Columns removed from the training features: always the target, plus the
# identifier column when the training CSV has one.
cols_to_drop = [target_col]
if id_col in train_data.columns:
    cols_to_drop.append(id_col)

X_train_full = train_data.drop(columns=cols_to_drop)
y_train = train_data[target_col]

# The test CSV is checked on its own: asking DataFrame.drop to remove an
# identifier column that the test file does not contain would raise KeyError.
# The target is always kept in the list so a missing label column still fails loudly.
test_cols_to_drop = [
    col for col in cols_to_drop if col == target_col or col in test_data.columns
]
X_test_full = test_data.drop(columns=test_cols_to_drop)
y_test = test_data[target_col]

# The feature list is derived from the training frame only; the same columns
# are then selected from the test frame so both matrices line up.
numeric_cols = X_train_full.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("\nUsing numeric columns only (without CustomerID):")
print(numeric_cols)

# .copy() gives independent frames rather than views of the full frames.
X_train = X_train_full[numeric_cols].copy()
X_test  = X_test_full[numeric_cols].copy()

print("X_train numeric shape:", X_train.shape)
print("X_test numeric shape :", X_test.shape)

# ---------------------
# 4. IMPUTE NUMERIC + SCALE
# ---------------------
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split.

# Fill missing numeric values with the per-column median of the training data.
num_imputer = SimpleImputer(strategy="median")
X_train_imputed = num_imputer.fit_transform(X_train)
X_test_imputed = num_imputer.transform(X_test)

# Standardize, then wrap the resulting arrays back into DataFrames that carry
# the original row index and the numeric column names.
num_scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    num_scaler.fit_transform(X_train_imputed),
    index=X_train.index,
    columns=numeric_cols,
)
X_test_scaled = pd.DataFrame(
    num_scaler.transform(X_test_imputed),
    index=X_test.index,
    columns=numeric_cols,
)

print("Preprocessing done.")

# ---------------------
# 5. TRAIN RANDOM FOREST
# ---------------------
# Hyperparameters collected in one place; the fixed random_state keeps runs repeatable.
rf_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

# Predict on the test split and score the predictions against the true labels.
y_pred_rf = rf_model.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred_rf)
cm = confusion_matrix(y_test, y_pred_rf)
report = classification_report(y_test, y_pred_rf)

print("\n=== Random Forest (numeric-only) Results ===")
for heading, value in (
    ("Accuracy:", acc),
    ("Confusion Matrix:\n", cm),
    ("Classification Report:\n", report),
):
    print(heading, value)

# ---------------------
# 6. SAVE MODEL + METRICS + METADATA
# ---------------------
os.makedirs("../models", exist_ok=True)
model_path = "../models/churn_model.pkl"

# Everything needed to reproduce the preprocessing and to display the metrics
# is stored in a single dictionary next to the fitted model.
artifact = {
    "model": rf_model,
    "num_imputer": num_imputer,
    "num_scaler": num_scaler,
    "numeric_cols": numeric_cols,
    # keep these for compatibility with app.py (empty for now)
    "categorical_cols": [],
    "cat_dummy_columns": [],
    "accuracy": float(acc),
    "confusion_matrix": cm,
    "report": report,
}

with open(model_path, "wb") as model_file:
    pickle.dump(artifact, model_file)

# Confirm where the file landed and how large it is.
print("\nModel saved at:", os.path.abspath(model_path))
print("File size (bytes):", os.path.getsize(model_path))


Train path: c:\Users\ASSIM\Python Development\Customer Churn Prediction project\data\customer_churn_dataset-training-master.csv
Test path : c:\Users\ASSIM\Python Development\Customer Churn Prediction project\data\customer_churn_dataset-testing-master.csv
Original train shape: (440833, 12)
Original test shape: (64374, 12)
After dropping NaN Churn -> train shape: (440832, 12)
After dropping NaN Churn -> test shape: (64374, 12)

Using numeric columns only (without CustomerID):
['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction']
X_train numeric shape: (440832, 7)
X_test numeric shape : (64374, 7)
Preprocessing done.

=== Random Forest (numeric-only) Results ===
Accuracy: 0.5334607139528381
Confusion Matrix:
 [[ 3919 29962]
 [   71 30422]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.12      0.21     33881
           1       0.50      1.00      0.67     30493

    accuracy 