<a href="https://colab.research.google.com/github/Davansh09/Data-science-projects/blob/main/Customer_Pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

np.random.seed(7)
n = 200
customer_id = np.arange(1, n + 1)
tenure_months = np.random.randint(1, 73, size=n)
monthly_charges = np.round(np.random.uniform(20, 120, size=n), 2)
support_calls = np.random.randint(0, 11, size=n)
contract = np.random.choice(["month_to_month", "one_year", "two_year"], size=n, p=[0.55, 0.3, 0.15])
is_promo = np.random.choice([0, 1], size=n, p=[0.6, 0.4])
payment_method = np.random.choice(["card", "upi", "cash"], size=n, p=[0.5, 0.35, 0.15])
logit = -2 + 0.03 * (120 - tenure_months) + 0.04 * (monthly_charges - 50) + 0.25 * support_calls + 0.55 * (contract == "month_to_month") + 0.25 * (is_promo == 0) + 0.15 * (payment_method == "cash")
prob = 1 / (1 + np.exp(-logit))
churn = np.random.binomial(1, prob, size=n)

df = pd.DataFrame({
    "customer_id": customer_id,
    "tenure_months": tenure_months,
    "monthly_charges": monthly_charges,
    "support_calls": support_calls,
    "contract": contract,
    "is_promo": is_promo,
    "payment_method": payment_method,
    "churn": churn
})

X = df.drop(columns=["churn", "customer_id"])
y = df["churn"]

num_cols = ["tenure_months", "monthly_charges", "support_calls"]
cat_cols = ["contract", "is_promo", "payment_method"]

X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(X, y, df["customer_id"], test_size=0.25, random_state=42, stratify=y)

pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

clf = Pipeline([
    ("prep", pre),
    ("lr", LogisticRegression(max_iter=1000))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
cm = confusion_matrix(y_test, y_pred)

print("Metrics")
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1: {f1:.3f}")
print(f"ROC-AUC: {auc:.3f}")
print("Confusion Matrix:")
print(cm)

test_out = X_test.copy()
test_out["customer_id"] = id_test.values
test_out["churn_prob"] = y_proba
test_out["pred"] = y_pred
top_risky = test_out.sort_values("churn_prob", ascending=False).head(10)[["customer_id", "churn_prob", "pred"]]
print("\nTop at-risk customers (probability):")
for _, r in top_risky.iterrows():
    print(f"{int(r['customer_id'])}: {r['churn_prob']:.3f}  pred={int(r['pred'])}")

ohe = clf.named_steps["prep"].named_transformers_["cat"]
num_names = num_cols
cat_names = ohe.get_feature_names_out(cat_cols).tolist()
feat_names = np.array(num_names + cat_names)
coefs = clf.named_steps["lr"].coef_.flatten()
imp = pd.DataFrame({"feature": feat_names, "coef": coefs}).sort_values("coef", ascending=False)
print("\nTop positive drivers:")
print(imp.head(8).to_string(index=False))
print("\nTop negative drivers:")
print(imp.tail(8).to_string(index=False))


Metrics
Accuracy: 0.920
Precision: 0.918
Recall: 1.000
F1: 0.957
ROC-AUC: 0.916
Confusion Matrix:
[[ 1  4]
 [ 0 45]]

Top at-risk customers (probability):
9: 0.998  pred=1
8: 0.998  pred=1
178: 0.997  pred=1
118: 0.997  pred=1
188: 0.996  pred=1
190: 0.995  pred=1
2: 0.992  pred=1
123: 0.992  pred=1
54: 0.991  pred=1
4: 0.990  pred=1

Top positive drivers:
                feature      coef
        monthly_charges  1.187065
          support_calls  0.783931
      contract_one_year  0.543183
             is_promo_0  0.419812
    payment_method_card  0.113367
     payment_method_upi  0.026156
contract_month_to_month -0.051727
          tenure_months -0.078991

Top negative drivers:
                feature      coef
             is_promo_0  0.419812
    payment_method_card  0.113367
     payment_method_upi  0.026156
contract_month_to_month -0.051727
          tenure_months -0.078991
    payment_method_cash -0.137373
             is_promo_1 -0.417662
      contract_two_year -0.489306
