In [3]:
import os
import pandas as pd
import numpy as np

In [4]:
# Read the retention case file, then preview its dimensions, column names and first rows.
df = pd.read_csv("retentiondata_case.csv")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(n=5)

Shape: (5636, 36)
Columns: ['acct_ref', 'cust_ref', 'has_dependents', 'tenure_mo', 'home_phone', 'multi_line', 'internet_plan', 'add_on_security', 'add_on_backup', 'add_on_protection', 'tech_support_std', 'stream_tv', 'stream_movies', 'contract_term', 'e_bill_opt_in', 'pay_method', 'monthly_fee', 'total_billed', 'left_flag', 'fiscal_qtr', 'gender', 'age_years', 'is_married', 'dependents_count', 'referred_friend', 'referrals_count', 'recent_offer', 'avg_long_dist_fee', 'internet_tech', 'avg_gb_download', 'premium_support', 'stream_music', 'unlimited_data_opt', 'refunds_total', 'extra_data_fees_total', 'long_dist_fees_total']


Unnamed: 0,acct_ref,cust_ref,has_dependents,tenure_mo,home_phone,multi_line,internet_plan,add_on_security,add_on_backup,add_on_protection,...,recent_offer,avg_long_dist_fee,internet_tech,avg_gb_download,premium_support,stream_music,unlimited_data_opt,refunds_total,extra_data_fees_total,long_dist_fees_total
0,ACCT137932,CUST61880322,No,1,No,No phone service,DSL,No,Yes,No,...,Offer E,0.0,DSL,10,No,No,Yes,0.0,0,0.0
1,ACCT115088,CUST55192304,No,34,Yes,No,DSL,Yes,No,Yes,...,,17.09,DSL,16,No,No,Yes,0.0,0,581.06
2,ACCT623423,CUST16890051,No,2,Yes,No,DSL,Yes,Yes,No,...,,10.47,DSL,21,No,No,Yes,0.0,0,20.94
3,ACCT846960,CUST99388728,No,45,No,No phone service,DSL,Yes,No,Yes,...,,0.0,DSL,10,Yes,No,Yes,0.0,0,0.0
4,ACCT146586,CUST91125265,No,2,Yes,No,Fiber optic,No,No,No,...,,9.12,Fiber Optic,51,No,No,Yes,0.0,0,18.24


In [5]:
# Drop the columns that are not used as model inputs, then fill the gaps:
#   - internet_tech: missing values become the explicit level "No Internet"
#   - total_billed:  missing values take the column median
df = (
    df
    .drop(columns=['acct_ref', 'cust_ref', 'fiscal_qtr', 'recent_offer', 'gender'])
    .assign(
        internet_tech=lambda d: d['internet_tech'].fillna("No Internet"),
        total_billed=lambda d: d['total_billed'].fillna(d['total_billed'].median()),
    )
)

In [6]:
# Per-row tallies of subscribed services: count the 'Yes' flags across each group of columns.
streaming_flags = df[['stream_tv', 'stream_movies', 'stream_music']].eq('Yes')
add_on_flags = df[['add_on_security', 'add_on_backup', 'add_on_protection']].eq('Yes')
df['stream_count'] = streaming_flags.sum(axis=1)
df['add_on_count'] = add_on_flags.sum(axis=1)

In [7]:
# Bucket tenure (months) into left-closed bands: [0, 24), [24, 48), [48, inf).
tenure_edges = [0, 24, 48, np.inf]
tenure_labels = ['Under 2 Years', '2-4 Years', 'Over 4 Years']
df['tenure_group'] = pd.cut(df['tenure_mo'], bins=tenure_edges, labels=tenure_labels, right=False)

In [8]:
# Average billed amount per month of tenure; rows with zero tenure get 0
# instead of the result of a division by zero.
billed_per_month = df['total_billed'] / df['tenure_mo']
df['avg_monthly_spent'] = billed_per_month.where(df['tenure_mo'] != 0, 0)

In [9]:
# Target: 1 where left_flag is "Yes", 0 otherwise (the boolean mask is cast to int64).
# Features: every remaining column.
Y = df["left_flag"].eq("Yes").astype("int64")
X = df.drop(columns="left_flag")

In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=True,
    random_state=99,
)

# Confirm the resulting partition sizes
(X_train.shape, X_test.shape)

((3945, 34), (1691, 34))

In [17]:
    # Cross-validated L1 (LASSO) logistic regression.
    # StandardScaler standardizes the numeric features (zero mean, unit variance),
    # which matters because the L1 penalty is scale-sensitive; categorical
    # features are one-hot encoded.
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegressionCV


    # Stratified CV preserves class balance in each fold (important for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)
    # Lambda grid: 0.001 to 2.0 in 20 evenly spaced steps
    lambdas = np.linspace(0.001, 2.0, 20)

    # Route columns by dtype: numeric ones are scaled, the rest are one-hot encoded
    numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object','category','bool']).columns

    preprocess = ColumnTransformer([
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ])


    logit_lasso_pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", LogisticRegressionCV(
            Cs=1/lambdas,          # scikit-learn parameterizes the penalty as C = 1/lambda
            cv=cv,
            penalty="l1",
            solver="saga",
            scoring="roc_auc",
            max_iter=2000,
            n_jobs=-1,
            refit=True,
            random_state=20        # saga shuffles the data; seed it so the fit is reproducible
        ))
    ])


    logit_lasso_pipe.fit(X_train, Y_train)

    # Best C (= 1/lambda): the value that maximizes mean ROC-AUC on the held-out CV folds
    best_C = logit_lasso_pipe.named_steps["model"].C_[0]
    best_lambda = 1.0 / best_C

    print(f"Best C selected by CV: {best_C:.6g}   (λ = 1/C ≈ {best_lambda:.6g})")



Best C selected by CV: 0.593694   (λ = 1/C ≈ 1.68437)


In [20]:
# Pull the fitted pieces back out of the pipeline: the CV-tuned LASSO model
# (refit at the selected C) and the preprocessing step that names the encoded columns.
lasso_model = logit_lasso_pipe.named_steps["model"]
preprocess = logit_lasso_pipe.named_steps["preprocess"]
feature_names = preprocess.get_feature_names_out()

# One row per encoded feature; coef_ is (1, n_features) in the binary case, so flatten it to 1-D.
coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": lasso_model.coef_.reshape(-1),
})

# LASSO shrinks some coefficients to exactly zero; keep the survivors and
# order them from largest to smallest magnitude.
important_coefs = coef_df.loc[coef_df["coefficient"].ne(0)]
magnitude_order = important_coefs["coefficient"].abs().sort_values(ascending=False).index
important_coefs = important_coefs.loc[magnitude_order]

print("Important features selected by LASSO:")
display(important_coefs)

Important features selected by LASSO:


Unnamed: 0,feature,coefficient
22,cat__internet_plan_Fiber optic,2.046581
57,cat__internet_tech_Fiber Optic,-1.86159
5,num__referrals_count,-1.831631
0,num__tenure_mo,-1.309198
53,cat__referred_friend_No,-1.177706
4,num__dependents_count,-1.022335
14,cat__has_dependents_No,-0.965844
42,cat__contract_term_Month-to-month,0.845767
66,cat__tenure_group_Over 4 Years,0.569305
51,cat__is_married_No,-0.440464


In [22]:
# Hard class predictions (0/1) from the fitted LASSO pipeline on the held-out test set
Y_pred_lasso = logit_lasso_pipe.predict(X_test)

In [23]:
# Mean squared error computed by hand. Labels and predictions are both 0/1,
# so each squared residual is 0 or 1 and the mean equals the misclassification rate.
errors = Y_test.sub(Y_pred_lasso)     # actual minus predicted
squared_errors = errors.pow(2)        # 1 where the prediction was wrong, else 0
mse = squared_errors.mean()           # average of the squared residuals

print("Test MSE — LASSO (manual):", mse)

Test MSE — LASSO (manual): 0.17740981667652278


In [24]:
from sklearn.metrics import roc_auc_score
 
# Score the test set using the predicted probability of the positive class (column 1)
y_prob = logit_lasso_pipe.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_true=Y_test, y_score=y_prob)

print(f"Test ROC-AUC: {roc_auc:.3f}")

Test ROC-AUC: 0.888
