In [None]:

# Assignment ML9: Customer Churn Analysis (Telecom)
# Educational notebook. Expects './churn.csv' or will generate a small synthetic sample.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load the churn dataset from disk. If the file cannot be read for any reason,
# fall back to a small, reproducible synthetic sample so the rest of the
# notebook can still run end to end.
path = "./churn.csv"
try:
    df = pd.read_csv(path)
except Exception as e:
    # Deliberately broad: any load failure should fall through to the
    # synthetic sample, and the underlying error is reported to the user.
    # The em dash is written as an escape so the message cannot be garbled
    # again if this file is re-saved with a different encoding.
    print(f"Could not load '{path}' \u2014 generating synthetic sample. Error:", e)
    n = 1000
    np.random.seed(42)  # fixed seed: identical synthetic rows on every run
    # Every column, including the Churn label, is drawn independently, so
    # this sample only exercises the pipeline; it carries no learnable signal.
    df = pd.DataFrame({
        "tenure": np.random.randint(1, 72, n),  # upper bound exclusive: 1..71
        "MonthlyCharges": np.random.uniform(20, 120, n),
        "TotalCharges": np.random.uniform(20, 8000, n),
        "Contract": np.random.choice(["Month-to-month", "One year", "Two year"], n),
        "PaymentMethod": np.random.choice(["Electronic check", "Mailed check", "Bank transfer"], n),
        "Churn": np.random.choice(["Yes", "No"], n, p=[0.2, 0.8]),
    })
else:
    print("Loaded churn dataset from", path)

# Show the first rows of the loaded (or synthetic) frame as a quick sanity check.
# NOTE(review): `display` is not imported in this file; presumably it is the
# builtin provided by IPython/Jupyter. Confirm before running as a plain script.
display(df.head())

# Preprocess: binary target plus one-hot encoded features.
# Target is 1 where Churn is exactly "Yes" and 0 for anything else.
df["Churn_bin"] = (df["Churn"] == "Yes").astype(int)

# df.drop returns a new frame, so the clean-up below does not modify df.
features = df.drop(columns=["Churn", "Churn_bin"])

# A numeric column with blank (whitespace-only) entries is read from CSV as
# text, and get_dummies would then turn it into one indicator column per
# distinct value. Convert such columns back to numbers before encoding.
# A column qualifies only when every non-blank entry parses as a number, so
# genuine categoricals such as Contract are left untouched.
for col in features.columns:
    raw = features[col]
    if pd.api.types.is_numeric_dtype(raw):
        continue
    parsed = pd.to_numeric(raw, errors="coerce")
    blank = raw.isna() | (raw.astype(str).str.strip() == "")
    if parsed.notna().any() and (parsed.notna() | blank).all():
        # Blanks became NaN; fill them with the column median so the
        # classifier is not handed missing values.
        features[col] = parsed.fillna(parsed.median())

# NOTE(review): identifier-like text columns (one distinct value per row)
# would still be one-hot encoded here; confirm churn.csv has none, or drop them.
X = pd.get_dummies(features, drop_first=True)
y = df["Churn_bin"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# churn / no-churn proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a 100-tree random forest; fit() returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels feed the classification report. Column 1 of predict_proba is
# the probability of class 1 (churn) when both classes are present in y_train.
pred = clf.predict(X_test)
probs = clf.predict_proba(X_test)[:, 1]

# Each metric is computed and printed in turn, so the report is still shown
# even if the AUC computation fails afterwards.
report_text = classification_report(y_test, pred)
print("Classification report:\n", report_text)
roc_auc = roc_auc_score(y_test, probs)
print("ROC AUC:", roc_auc)

# ROC curve: true-positive rate against false-positive rate over all
# decision thresholds. The thresholds themselves are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, probs)

# Draw on the current axes (the same one plt.plot would pick) and label it
# through the Axes interface.
ax = plt.gca()
ax.plot(fpr, tpr)
ax.set(xlabel="FPR", ylabel="TPR", title="ROC Curve")
plt.show()
