In [1]:
# Cell 1 — Setup (Colab)
# If prompted, choose "Run anyway". Then upload the dataset.
!pip -q install xgboost==1.7.6

from google.colab import files
print("Upload Churn_Modelling.csv when the chooser opens.")
uploaded = files.upload()  # choose Churn_Modelling.csv


Upload Churn_Modelling.csv when the chooser opens.


KeyboardInterrupt: 

In [2]:
# Cell 2 — Imports
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [3]:
# Cell 3 — Load & light cleaning
RAW_PATH = "Churn_Modelling.csv"  # the file uploaded in Cell 1
raw = pd.read_csv(RAW_PATH)

# Remove the pure identifier columns, skipping any that are not present.
data = raw.drop(
    columns=[col for col in ("RowNumber", "CustomerId", "Surname") if col in raw.columns]
)

# Fail fast when the label column is absent.
assert "Exited" in data.columns, "Target column 'Exited' missing."

# Snapshot of the cleaned frame, kept for later joins (pretty Power BI fields).
raw_for_export = data.copy()


In [4]:
# Cell 4 — Feature engineering (different from your friend’s code)
def make_features(df: pd.DataFrame, balance_median=None) -> pd.DataFrame:
    """Return a copy of ``df`` with five engineered columns added.

    Added columns:
      * ``Bal_to_Salary``   – ``Balance / EstimatedSalary``; 0 where the salary
        is not strictly positive.
      * ``TenureBand``      – ``Tenure`` cut into the bands "0-2", "3-5", "6-10"
        (values outside (-1, 10] become NaN).
      * ``MultiProduct``    – 1 when ``NumOfProducts >= 2``, else 0.
      * ``ScoreActive``     – ``CreditScore * IsActiveMember``.
      * ``HighBalanceFlag`` – 1 when ``Balance`` exceeds the threshold, else 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``EstimatedSalary``, ``Balance``, ``Tenure``,
        ``NumOfProducts``, ``CreditScore`` and ``IsActiveMember``.
        The input frame is not modified.
    balance_median : float, optional
        Threshold for ``HighBalanceFlag``. When omitted, the median of
        ``df["Balance"]`` is used (the original behaviour). Pass a value
        computed on the training rows to keep test rows out of the threshold,
        or to score a small batch where the batch median is meaningless.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the engineered columns appended.
    """
    X = df.copy()

    # 1) Income-related ratios (guard against div-by-zero)
    X["Bal_to_Salary"] = np.where(X["EstimatedSalary"] > 0, X["Balance"] / X["EstimatedSalary"], 0)

    # 2) Tenure buckets
    X["TenureBand"] = pd.cut(
        X["Tenure"], bins=[-1, 2, 5, 10], labels=["0-2", "3-5", "6-10"]
    )

    # 3) Product intensity
    X["MultiProduct"] = (X["NumOfProducts"] >= 2).astype(int)

    # 4) Interaction: CreditScore x IsActiveMember
    X["ScoreActive"] = X["CreditScore"] * X["IsActiveMember"]

    # 5) High balance flag — use the caller-supplied threshold when given so the
    #    flag does not depend on which rows happen to be in this frame.
    threshold = X["Balance"].median() if balance_median is None else balance_median
    X["HighBalanceFlag"] = (X["Balance"] > threshold).astype(int)

    return X

# Build the engineered frame, then separate predictors from the label.
feat = make_features(data)
X = feat.drop(columns="Exited")
y = feat["Exited"].astype(int)


In [5]:
# Cell 5 — Split & preprocessing
# Group columns by dtype: numeric ones get scaled, text/categorical ones get one-hot encoded.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

categorical = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
numeric = StandardScaler()

pre = ColumnTransformer(
    [("num", numeric, num_cols), ("cat", categorical, cat_cols)],
    remainder="drop",
)

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=2025,
    test_size=0.20,
)


In [6]:
# Cell 6 — Define models
# Candidates are registered one by one; insertion order is the training order.
models = {}

models["Logistic Regression"] = LogisticRegression(random_state=2025, max_iter=2000)

models["Random Forest"] = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=4,
    random_state=2025,
)

models["XGBoost"] = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.08,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    eval_metric="logloss",
    random_state=2025,
)


In [7]:
# Candidate classifiers, keyed by the display name used in the leaderboard.
# `use_label_encoder` is intentionally not passed to XGBClassifier: it is
# deprecated and has no effect in the pinned xgboost version.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=2025),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, min_samples_split=4, random_state=2025
    ),
    "XGBoost": XGBClassifier(
        n_estimators=400,
        max_depth=4,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=2025,
    ),
}

print("Models to train:", list(models.keys()))


Models to train: ['Logistic Regression', 'Random Forest', 'XGBoost']




In [8]:
# Fit every candidate inside its own preprocessing pipeline and score it on
# the hold-out split.
results = {}   # model name -> dict of hold-out metrics
trained = {}   # model name -> fitted Pipeline

for name, clf in models.items():
    print(f"\nTraining {name} ...")
    # clone(pre) gives each pipeline its own preprocessor. Without it all
    # pipelines share one ColumnTransformer object, and every later fit()
    # silently refits the preprocessor held by the earlier pipelines.
    pipe = Pipeline([("pre", clone(pre)), ("clf", clf)])
    pipe.fit(X_train, y_train)
    yhat = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1]  # probability of the positive class

    results[name] = {
        "Accuracy": accuracy_score(y_test, yhat),
        "Precision": precision_score(y_test, yhat, zero_division=0),
        "Recall": recall_score(y_test, yhat, zero_division=0),
        "F1": f1_score(y_test, yhat, zero_division=0),
        "ROC_AUC": roc_auc_score(y_test, proba),
    }
    trained[name] = pipe
    print(f"{name} done. ROC_AUC: {results[name]['ROC_AUC']:.4f}")

# One row per model, best ROC_AUC first.
metrics_df = pd.DataFrame(results).T.sort_values("ROC_AUC", ascending=False)
print("\nModel leaderboard:")
display(metrics_df)

# NOTE(review): the winner is picked using the same test split the metrics are
# reported on, so the reported score of the best model is optimistic.
best_name = metrics_df.index[0]
best_pipe = trained[best_name]
print("Best model:", best_name)



Training Logistic Regression ...
Logistic Regression done. ROC_AUC: 0.8362

Training Random Forest ...
Random Forest done. ROC_AUC: 0.8569

Training XGBoost ...
XGBoost done. ROC_AUC: 0.8544

Model leaderboard:


Unnamed: 0,Accuracy,Precision,Recall,F1,ROC_AUC
Random Forest,0.869,0.777778,0.498771,0.607784,0.856868
XGBoost,0.868,0.75265,0.523342,0.617391,0.85438
Logistic Regression,0.85,0.721992,0.427518,0.537037,0.83624


Best model: Random Forest


In [9]:
def get_feature_names(preprocessor: "ColumnTransformer", num_cols, cat_cols):
    """Return output feature names of a fitted preprocessor, numeric first.

    Parameters
    ----------
    preprocessor : ColumnTransformer
        Fitted transformer whose ``named_transformers_`` may contain the
        entries ``"num"`` and ``"cat"``.
    num_cols : sequence of str
        Numeric column names; copied through unchanged when a ``"num"``
        transformer is present.
    cat_cols : sequence of str
        Categorical column names handed to the ``"cat"`` encoder's
        ``get_feature_names_out`` to obtain the expanded names.

    Returns
    -------
    list of str
        ``num_cols`` followed by the encoder's expanded names.
    """
    fitted = preprocessor.named_transformers_
    names = []
    if "num" in fitted:
        names.extend(list(num_cols))
    # Skip the encoder when there are no categorical columns: it has nothing
    # to report and (presumably) was never fitted, so querying it could raise.
    if len(cat_cols) > 0 and "cat" in fitted:
        names.extend(fitted["cat"].get_feature_names_out(cat_cols).tolist())
    return names

# Logistic Regression: export coefficients ranked by absolute size
if "Logistic Regression" in trained:
    log_pipe = trained["Logistic Regression"]
    feat_names_lr = get_feature_names(log_pipe.named_steps["pre"], num_cols, cat_cols)
    coefs = log_pipe.named_steps["clf"].coef_[0]
    lr_df = (
        pd.DataFrame({"feature": feat_names_lr, "coefficient": coefs})
        .assign(abs_coefficient=lambda frame: frame["coefficient"].abs())
        .sort_values("abs_coefficient", ascending=False)
    )
    lr_df.to_csv("feature_importance_logistic_regression.csv", index=False)
    print("Exported feature_importance_logistic_regression.csv")

# Random Forest + XGBoost: export feature importances, largest first
for tree_name in ("Random Forest", "XGBoost"):
    if tree_name not in trained:
        continue
    pipe = trained[tree_name]
    names = get_feature_names(pipe.named_steps["pre"], num_cols, cat_cols)
    importances = pipe.named_steps["clf"].feature_importances_
    imp_df = pd.DataFrame({"feature": names, "importance": importances}).sort_values(
        "importance", ascending=False
    )
    # e.g. "Random Forest" -> feature_importance_random_forest.csv
    out = "feature_importance_" + tree_name.replace(" ", "_").lower() + ".csv"
    imp_df.to_csv(out, index=False)
    print("Exported", out)


Exported feature_importance_logistic_regression.csv
Exported feature_importance_random_forest.csv
Exported feature_importance_xgboost.csv


In [12]:
import sklearn, xgboost
# Record the library versions this run was produced with.
print(f"sklearn: {sklearn.__version__}")
print(f"xgboost: {xgboost.__version__}")


sklearn: 1.5.2
xgboost: 1.7.6


In [13]:
# Cell 7 — define models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the display name used in the leaderboard.
# `use_label_encoder` is intentionally not passed to XGBClassifier: it is
# deprecated and has no effect in the pinned xgboost version.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=2025),
    "Random Forest": RandomForestClassifier(n_estimators=300, min_samples_split=4, random_state=2025),
    "XGBoost": XGBClassifier(n_estimators=400, max_depth=4, learning_rate=0.08,
                             subsample=0.9, colsample_bytree=0.9, eval_metric="logloss",
                             random_state=2025),
}

print("Models to train:", list(models.keys()))


Models to train: ['Logistic Regression', 'Random Forest', 'XGBoost']




In [14]:
# Cell 8 — train, evaluate, and show metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Fit every candidate inside its own preprocessing pipeline, score it on the
# hold-out split, and stop at the first failure so the error can be inspected.
results = {}   # model name -> dict of hold-out metrics
trained = {}   # model name -> fitted Pipeline

for name, clf in models.items():
    try:
        print(f"\n--- Training {name} ---")
        # 'pre' comes from the split & preprocessing cell (Cell 5).
        # clone(pre) gives each pipeline its own preprocessor; without it all
        # pipelines share one ColumnTransformer object that every fit() refits.
        pipe = Pipeline([("pre", clone(pre)), ("clf", clf)])
        pipe.fit(X_train, y_train)
        yhat = pipe.predict(X_test)
        proba = pipe.predict_proba(X_test)[:, 1]  # probability of the positive class

        results[name] = {
            "Accuracy": accuracy_score(y_test, yhat),
            "Precision": precision_score(y_test, yhat, zero_division=0),
            "Recall": recall_score(y_test, yhat, zero_division=0),
            "F1": f1_score(y_test, yhat, zero_division=0),
            "ROC_AUC": roc_auc_score(y_test, proba),
        }
        trained[name] = pipe
        print(f"{name} done. ROC_AUC: {results[name]['ROC_AUC']:.4f}")
    except Exception as e:
        print(f"ERROR while training {name}: {type(e).__name__}: {e}")
        # break so you can inspect and paste the error here
        break

if results:
    # One row per model, best ROC_AUC first.
    metrics_df = pd.DataFrame(results).T.sort_values("ROC_AUC", ascending=False)
    print("\nModel leaderboard (sorted by ROC_AUC):")
    display(metrics_df.style.format({c: "{:.4f}" for c in metrics_df.columns}))

    best_name = metrics_df.index[0]
    best_pipe = trained[best_name]
    print("Best model:", best_name)
else:
    # Sorting an empty frame by "ROC_AUC" would raise KeyError, so build an
    # empty leaderboard explicitly and report the failure instead.
    metrics_df = pd.DataFrame(columns=["Accuracy", "Precision", "Recall", "F1", "ROC_AUC"])
    print("No models completed successfully.")



--- Training Logistic Regression ---
Logistic Regression done. ROC_AUC: 0.8362

--- Training Random Forest ---
Random Forest done. ROC_AUC: 0.8569

--- Training XGBoost ---
XGBoost done. ROC_AUC: 0.8544

Model leaderboard (sorted by ROC_AUC):


Unnamed: 0,Accuracy,Precision,Recall,F1,ROC_AUC
Random Forest,0.869,0.7778,0.4988,0.6078,0.8569
XGBoost,0.868,0.7527,0.5233,0.6174,0.8544
Logistic Regression,0.85,0.722,0.4275,0.537,0.8362


Best model: Random Forest


In [16]:
# Cell 9 — helper to get feature names after preprocessing
def get_feature_names(preprocessor, num_cols, cat_cols):
    """Return output feature names of a fitted preprocessor, numeric first.

    Parameters
    ----------
    preprocessor
        Fitted column transformer whose ``named_transformers_`` may contain
        the entries ``"num"`` and ``"cat"``.
    num_cols : sequence of str
        Numeric column names; copied through unchanged when a ``"num"``
        transformer is present.
    cat_cols : sequence of str
        Categorical column names handed to the ``"cat"`` encoder's
        ``get_feature_names_out`` to obtain the expanded names.

    Returns
    -------
    list of str
        ``num_cols`` followed by the encoder's expanded names.
    """
    fitted = preprocessor.named_transformers_
    names = []
    if "num" in fitted:
        names.extend(list(num_cols))
    # Skip the encoder when there are no categorical columns: it has nothing
    # to report and (presumably) was never fitted, so querying it could raise.
    if len(cat_cols) > 0 and "cat" in fitted:
        names.extend(fitted["cat"].get_feature_names_out(cat_cols).tolist())
    return names

# Show the ten strongest features of every fitted pipeline
for name, pipe in trained.items():
    print(f"\nTop features for {name}:")
    try:
        feat_names = get_feature_names(pipe.named_steps["pre"], num_cols, cat_cols)
        if name != "Logistic Regression":
            # Tree ensembles: rank by the estimator's feature_importances_.
            importances = pipe.named_steps["clf"].feature_importances_
            tmp = pd.DataFrame({"feature": feat_names, "importance": importances})
            display(tmp.sort_values("importance", ascending=False).head(10).reset_index(drop=True))
        else:
            # Linear model: rank by coefficient magnitude, display the signed value.
            coefs = pipe.named_steps["clf"].coef_[0]
            tmp = pd.DataFrame({"feature": feat_names, "value": coefs})
            tmp = tmp.sort_values("value", key=lambda col: col.abs(), ascending=False).head(10)
            display(tmp.reset_index(drop=True))
    except Exception as e:
        print("Could not extract features for", name, "->", e)



Top features for Logistic Regression:


Unnamed: 0,feature,value
0,MultiProduct,-2.752851
1,NumOfProducts,2.313823
2,Age,0.750907
3,Gender_Male,-0.665498
4,Geography_France,-0.590766
5,Geography_Spain,-0.501968
6,TenureBand_6-10,-0.395618
7,Geography_Germany,0.325655
8,ScoreActive,-0.305282
9,IsActiveMember,-0.264661



Top features for Random Forest:


Unnamed: 0,feature,importance
0,Age,0.230207
1,NumOfProducts,0.106603
2,EstimatedSalary,0.098775
3,CreditScore,0.098566
4,Balance,0.096409
5,Bal_to_Salary,0.08281
6,ScoreActive,0.064372
7,Tenure,0.050975
8,MultiProduct,0.032926
9,Geography_Germany,0.021547



Top features for XGBoost:


Unnamed: 0,feature,importance
0,NumOfProducts,0.150595
1,Age,0.099815
2,Gender_Male,0.09922
3,MultiProduct,0.095049
4,Geography_Germany,0.074553
5,HighBalanceFlag,0.063968
6,IsActiveMember,0.063966
7,ScoreActive,0.061983
8,Gender_Female,0.045295
9,Balance,0.031944


In [17]:
# Cell 10 — export CSVs
import os

# 1) logistic coefficients CSV, ranked by absolute size
if "Logistic Regression" in trained:
    log_pipe = trained["Logistic Regression"]
    coefs = log_pipe.named_steps["clf"].coef_[0]
    feat_names_lr = get_feature_names(log_pipe.named_steps["pre"], num_cols, cat_cols)
    lr_df = (
        pd.DataFrame({"feature": feat_names_lr, "coefficient": coefs})
        .assign(abs_coefficient=lambda frame: frame["coefficient"].abs())
        .sort_values("abs_coefficient", ascending=False)
    )
    lr_df.to_csv("feature_importance_logistic_regression.csv", index=False)
    print("Saved feature_importance_logistic_regression.csv (rows):", lr_df.shape[0])
    display(lr_df.head())

# 2) Random Forest and XGBoost importances, largest first
for tree_name in ("Random Forest", "XGBoost"):
    if tree_name not in trained:
        continue
    pipe = trained[tree_name]
    imps = pipe.named_steps["clf"].feature_importances_
    feat_names = get_feature_names(pipe.named_steps["pre"], num_cols, cat_cols)
    imp_df = pd.DataFrame({"feature": feat_names, "importance": imps})
    imp_df = imp_df.sort_values("importance", ascending=False)
    # e.g. "Random Forest" -> feature_importance_random_forest.csv
    outname = "feature_importance_" + tree_name.replace(" ", "_").lower() + ".csv"
    imp_df.to_csv(outname, index=False)
    print("Saved", outname, "rows:", imp_df.shape[0])
    display(imp_df.head())

# 3) customer_churn_insights.csv — test-set rows with readable fields plus predictions
nice_cols = [
    c
    for c in [
        "Geography", "Gender", "Age", "Tenure", "Balance",
        "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary",
    ]
    if c in raw.columns
]
# Pull the hold-out rows from the raw frame by their original index labels.
test_indices = X_test.index
nice_subset = raw.loc[test_indices, nice_cols].reset_index(drop=True)

proba = best_pipe.predict_proba(X_test)[:, 1]
pred = best_pipe.predict(X_test)

# assign() works on a copy, so nice_subset itself stays untouched.
export_df = nice_subset.assign(
    Actual_Churn=y_test.reset_index(drop=True),
    Predicted_Churn=pred,
    Churn_Probability=proba,
    Best_Model=best_name,
)

export_df.to_csv("customer_churn_insights.csv", index=False)
print("Saved customer_churn_insights.csv rows:", export_df.shape[0])
display(export_df.head())

# Final check — show every CSV sitting in the working directory
print("\nCSV files in current directory:")
print(list(filter(lambda entry: entry.endswith(".csv"), os.listdir())))


Saved feature_importance_logistic_regression.csv (rows): 20


Unnamed: 0,feature,coefficient,abs_coefficient
9,MultiProduct,-2.752851,2.752851
4,NumOfProducts,2.313823,2.313823
1,Age,0.750907,0.750907
16,Gender_Male,-0.665498,0.665498
12,Geography_France,-0.590766,0.590766


Saved feature_importance_random_forest.csv rows: 20


Unnamed: 0,feature,importance
1,Age,0.230207
4,NumOfProducts,0.106603
7,EstimatedSalary,0.098775
0,CreditScore,0.098566
3,Balance,0.096409


Saved feature_importance_xgboost.csv rows: 20


Unnamed: 0,feature,importance
4,NumOfProducts,0.150595
1,Age,0.099815
16,Gender_Male,0.09922
9,MultiProduct,0.095049
13,Geography_Germany,0.074553


Saved customer_churn_insights.csv rows: 2000


Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Actual_Churn,Predicted_Churn,Churn_Probability,Best_Model
0,France,Male,46,9,0.0,2,1,0,170676.67,0,0,0.078143,Random Forest
1,France,Female,31,0,118100.59,2,1,0,103165.15,0,0,0.129972,Random Forest
2,Spain,Male,42,4,0.0,2,0,1,85982.47,0,0,0.008944,Random Forest
3,Spain,Female,41,4,0.0,2,1,1,164549.74,0,0,0.019222,Random Forest
4,France,Male,42,2,0.0,2,1,1,55470.78,0,0,0.006667,Random Forest



CSV files in current directory:
['Churn_Modelling.csv', 'feature_importance_random_forest.csv', 'feature_importance_logistic_regression.csv', 'feature_importance_xgboost.csv', 'customer_churn_insights.csv']


In [18]:
# Cell 11 — download (one browser download per file)
from google.colab import files
# Files produced by the export cell, in the order they should be offered.
to_download = [
    "feature_importance_logistic_regression.csv",
    "feature_importance_random_forest.csv",
    "feature_importance_xgboost.csv",
    "customer_churn_insights.csv",
]

for f in to_download:
    # Skip anything the export step did not produce instead of failing.
    if not os.path.exists(f):
        print("Not found, skipping:", f)
        continue
    print("Downloading", f)
    files.download(f)


Downloading feature_importance_logistic_regression.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading feature_importance_random_forest.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading feature_importance_xgboost.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading customer_churn_insights.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>