In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

# Reached only when every import in this cell succeeded; an ImportError
# earlier in the cell would abort it before this line runs.
print("Libraries imported successfully.")

Libraries imported successfully.


In [3]:
# Read the billing and profile CSVs. The local layout keeps them under
# 'data/'; when either file is missing there, BOTH are re-read from the
# current working directory instead (the Colab layout).
try:
    df_billing = pd.read_csv("data/customer_billing_churn.csv")
    df_profile = pd.read_csv("data/customer_profile.csv")
except FileNotFoundError:
    # Fallback: files sit next to the notebook.
    df_billing = pd.read_csv("customer_billing_churn.csv")
    df_profile = pd.read_csv("customer_profile.csv")
    print("Loaded files from root directory.")
else:
    # Only reported when both reads from 'data/' succeeded.
    print("Loaded files from 'data/' directory.")

# Quick sanity check on what was loaded: (rows, columns) of each frame.
print("Billing Data Shape:", df_billing.shape)
print("Profile Data Shape:", df_profile.shape)

Loaded files from root directory.
Billing Data Shape: (7043, 7)
Profile Data Shape: (7043, 15)


In [4]:
# --- Billing feature engineering (adds columns to df_billing in place) ---

# TotalCharges: parse as numbers; anything unparseable becomes NaN and is
# then replaced by 0.
total_charges = pd.to_numeric(df_billing['TotalCharges'], errors='coerce')
df_billing['TotalCharges'] = total_charges.fillna(0)

# EPS keeps the denominator non-zero for rows whose tenure is 0.
EPS = 1e-6
tenure = df_billing['tenure']

# Average spend per unit of tenure, and the charge x tenure interaction term.
df_billing['AvgMonthlySpend'] = df_billing['TotalCharges'].div(tenure + EPS)
df_billing['TenureChargeInteraction'] = df_billing['MonthlyCharges'].mul(tenure)

# Tenure stability indicator: 1 when tenure is strictly below 12, else 0.
df_billing['ShortTenureFlag'] = tenure.lt(12).astype('int64')

print("Data preparation complete. New features created.")

Data preparation complete. New features created.


In [5]:
# Join billing and profile rows that share a customerID. No `how` is given,
# so pandas uses its default inner join.
df_merged = df_billing.merge(df_profile, on='customerID')

# Binary target: 1 where Churn is exactly 'Yes', 0 for every other value.
df_merged['Churn'] = df_merged['Churn'].eq('Yes').astype('int64')

# Columns to one-hot encode: every object-dtype column still present,
# except the customerID key (Churn is already numeric at this point).
cat_cols = [
    col
    for col in df_merged.select_dtypes(include=['object']).columns
    if col != 'customerID'
]

# Expand each categorical column into 0/1 indicator columns, keeping all levels.
churn_model_input = pd.get_dummies(df_merged, columns=cat_cols, drop_first=False, dtype=int)

print("Integration complete. Final dataset shape:", churn_model_input.shape)

Integration complete. Final dataset shape: (7043, 50)


In [7]:
# Remove the identifier column if it is still present (errors='ignore' makes
# this a no-op on a re-run), then discard any row that contains a missing value.
churn_model_input = (
    churn_model_input
    .drop(columns=['customerID'], errors='ignore')
    .dropna()
)

# Target is the Churn column; the features are every other column.
y = churn_model_input['Churn']
X = churn_model_input.drop(columns=['Churn'])

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

Training set: (4930, 48), Test set: (2113, 48)


In [8]:
# Train a random forest with library-default hyper-parameters and a fixed
# seed; .fit() returns the estimator itself, so it can be bound directly.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows: second probability column for AUC, hard labels for F1.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Evaluation metrics on the test split.
auc = roc_auc_score(y_test, y_prob)
f1 = f1_score(y_test, y_pred)

print(f"Model Trained.\nF1 Score: {f1:.5f}\nAUC Score: {auc:.5f}")

Model Trained.
F1 Score: 0.57889
AUC Score: 0.83881


In [9]:
# Build the three deliverables from the fitted model and its test-set predictions.
#
# All numeric values are cast to builtin float BEFORE rounding: round() on a
# NumPy scalar returns another NumPy scalar, which then shows up in the
# deliverables as `np.float64(...)` instead of a plain number.

# 1. Feature Importance Dictionary: feature name -> importance rounded to 5 decimals.
importances = clf.feature_importances_
feature_names = X.columns
feature_importance_dict = {
    feat: round(float(imp), 5)
    for feat, imp in zip(feature_names, importances)
}

# 2. Model Metrics: both entries are plain Python floats.
model_metrics = {
    "f1_score": round(float(f1), 5),
    "auc_score": round(float(auc), 5),
}

# 3. Churn Counts: number of test rows the model assigned to each class.
#    These are counts of PREDICTED labels (y_pred), not of the true labels.
unique, counts = np.unique(y_pred, return_counts=True)
churn_counts_df = pd.DataFrame({'class': unique, 'count': counts})

# Serialization: 'split' orientation yields a dict with index/columns/data keys.
churn_counts = churn_counts_df.to_dict(orient='split')

print("\n--- DELIVERABLES ---")
print("model_metrics =", model_metrics)
print("churn_counts =", churn_counts)
print("feature_importance_dict (first 5) =", list(feature_importance_dict.items())[:5])


--- DELIVERABLES ---
model_metrics = {'f1_score': 0.57889, 'auc_score': np.float64(0.83881)}
churn_counts = {'index': [0, 1], 'columns': ['class', 'count'], 'data': [[0, 1692], [1, 421]]}
feature_importance_dict (first 5) = [('tenure', np.float64(0.09033)), ('MonthlyCharges', np.float64(0.09492)), ('TotalCharges', np.float64(0.1091)), ('AvgMonthlySpend', np.float64(0.0997)), ('TenureChargeInteraction', np.float64(0.10293))]


In [10]:
# Compare the reported metrics with the reference values from the test script.
expected_f1 = 0.57889
expected_auc = 0.83881

# Largest absolute deviation from a reference value that still counts as a match.
tolerance = 0.001

print("--- VERIFICATION ---")

f1_matches = abs(model_metrics['f1_score'] - expected_f1) < tolerance
print(
    "✅ F1 Score is correct!"
    if f1_matches
    else f"❌ F1 Score mismatch. Got {model_metrics['f1_score']}, Expected ~{expected_f1}"
)

auc_matches = abs(model_metrics['auc_score'] - expected_auc) < tolerance
print(
    "✅ AUC Score is correct!"
    if auc_matches
    else f"❌ AUC Score mismatch. Got {model_metrics['auc_score']}, Expected ~{expected_auc}"
)

--- VERIFICATION ---
✅ F1 Score is correct!
✅ AUC Score is correct!
