### Logistic Regression Analysis:

The following code will help determine which features are the best indicators that a customer will lodge a complaint.

All data used has been simulated for this use-case.

In [None]:
# Import required libraries:

import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

In [None]:
# Load in data:

# The CSV location can be overridden with the COMPLAINTS_CSV environment variable so the
# notebook runs on other machines; when it is unset, the original local path is used.
DEFAULT_DATA_PATH = r'C:\Users\bryce\OneDrive\Documents\Python Scripts\Data_Analysis\Data\customer_complaints_sample.csv'
data_path = os.environ.get("COMPLAINTS_CSV", DEFAULT_DATA_PATH)

df = pd.read_csv(data_path)

# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,customer_id,complaint_raised,late_payments_last_6m,avg_bill_amount,billing_adjustments_count,num_active_accounts,call_volume_last_30d,billing_calls_last_30d,fault_calls_last_30d
0,1,0,0,139.14,4,1,3,4,3
1,2,1,1,105.12,0,3,1,1,2
2,3,0,2,63.48,1,1,1,2,0
3,4,0,2,117.95,0,2,1,2,3
4,5,0,0,148.05,1,3,1,2,0


In [None]:
# Define features and target:

# Outcome column the model is trained to predict.
target = "complaint_raised"

# Predictor columns, in the order their coefficients are reported later.
features = [
    "late_payments_last_6m", "avg_bill_amount", "billing_adjustments_count",
    "num_active_accounts",
    "call_volume_last_30d", "billing_calls_last_30d", "fault_calls_last_30d",
]

# X holds the predictor columns, y the outcome column, both taken from the loaded frame.
X, y = df[features], df[target]

In [23]:
# Split data into training and test cohorts:

# 30% of rows are held out for testing; the split is seeded and stratified on y.
split_kwargs = {"test_size": 0.3, "random_state": 42, "stratify": y}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [None]:
# Build logistic regression pipeline:

# Step 1: standardise the features.
scaling_step = ("scaler", StandardScaler())

# Step 2: L2-penalised logistic regression with balanced class weights and a fixed seed.
classifier_step = (
    "logreg",
    LogisticRegression(
        solver="liblinear",
        penalty="l2",
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    ),
)

pipeline = Pipeline(steps=[scaling_step, classifier_step])

# Fit on the training cohort only; the test cohort is kept for evaluation.
pipeline.fit(X_train, y_train)


In [None]:
# Model evaluation:

# Second column of predict_proba, used as the score for the ROC AUC.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
# Hard class predictions for the per-class report.
y_pred = pipeline.predict(X_test)

test_auc = roc_auc_score(y_test, y_pred_proba)
test_report = classification_report(y_test, y_pred)

print("ROC AUC:", test_auc)
print(test_report)


ROC AUC: 0.2608695652173913
              precision    recall  f1-score   support

           0       0.60      0.39      0.47        23
           1       0.07      0.14      0.09         7

    accuracy                           0.33        30
   macro avg       0.33      0.27      0.28        30
weighted avg       0.48      0.33      0.38        30



In [27]:
# Coefficient interpretation:

# Fitted coefficients of the logistic regression step (first row of coef_).
coefficients = pipeline.named_steps["logreg"].coef_[0]

# One row per feature: raw coefficient plus its exponentiated value (odds ratio),
# ordered from the largest odds ratio to the smallest.
coef_df = (
    pd.DataFrame({"feature": features, "coefficient": coefficients})
    .assign(odds_ratio=np.exp(coefficients))
    .sort_values(by="odds_ratio", ascending=False)
)

# Reading the table:
#   odds_ratio > 1 -> the feature is associated with a higher likelihood of complaint
#   odds_ratio < 1 -> the feature is associated with a lower likelihood
# The pipeline standardises the features before fitting, so magnitudes can be
# compared across features.

coef_df


Unnamed: 0,feature,coefficient,odds_ratio
0,late_payments_last_6m,0.292684,1.340019
2,billing_adjustments_count,0.289626,1.335928
4,call_volume_last_30d,-0.018698,0.981476
1,avg_bill_amount,-0.030857,0.969614
6,fault_calls_last_30d,-0.042115,0.95876
5,billing_calls_last_30d,-0.21,0.810584
3,num_active_accounts,-0.307828,0.735042


### Executive Summary:

##### Complaints are primarily driven by financial friction rather than service volume.
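* A one-standard-deviation increase in late payments or billing adjustments is associated with ~33–34% higher odds of a complaint (odds ratios ≈ 1.34), holding contact volume and account complexity constant.
* A one-standard-deviation increase in the number of active accounts is associated with ~26% lower odds of a complaint (odds ratio ≈ 0.74).
* A one-standard-deviation increase in billing calls is associated with ~19% lower odds of a complaint (odds ratio ≈ 0.81).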
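* This does not mean billing calls prevent complaints. One possible reading is that customers who successfully engage billing are less likely to escalate, but the model cannot establish this.
* Caveat: these findings are exploratory. The pipeline scored a ROC AUC of 0.26 on the 30-row test set (below the 0.5 expected from random guessing), and the statsmodels fit below reports an LLR p-value of 0.93, so none of these associations is statistically reliable on this sample.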

In [33]:
# Stats model version for P-values:

# statsmodels does not add an intercept on its own, so prepend a constant column.
# Note this fit uses the full, unscaled X rather than the scaled training cohort.
X_sm = sm.add_constant(X)

logit_model = sm.Logit(endog=y, exog=X_sm)
result = logit_model.fit()

summary_table = result.summary()
print(summary_table)


Optimization terminated successfully.
         Current function value: 0.539059
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:       complaint_raised   No. Observations:                  100
Model:                          Logit   Df Residuals:                       92
Method:                           MLE   Df Model:                            7
Date:                Sun, 11 Jan 2026   Pseudo R-squ.:                 0.02181
Time:                        17:15:01   Log-Likelihood:                -53.906
converged:                       True   LL-Null:                       -55.108
Covariance Type:            nonrobust   LLR p-value:                    0.9341
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                        -0.7228      1.344     -0.538      0.591      -3.

In [34]:
# Derived column: row-wise product of billing calls and billing adjustments.
df["billing_call_pressure"] = df["billing_calls_last_30d"].mul(
    df["billing_adjustments_count"]
)

In [35]:
# Score every row in X (second column of predict_proba) and store the scores on df as well.
propensity_scoring = pipeline.predict_proba(X)[:, 1]
df["complaint_propensity"] = propensity_scoring

In [36]:
# Display the propensity scores held in propensity_scoring.
# NOTE(review): this echoes the entire array; a slice or summary would keep the output short.
propensity_scoring

array([0.64601507, 0.38922686, 0.66873423, 0.46697209, 0.38385891,
       0.28315568, 0.2847202 , 0.65027716, 0.60347266, 0.28665171,
       0.52284198, 0.70184422, 0.59133415, 0.34852991, 0.59495899,
       0.36704009, 0.58108052, 0.57967092, 0.67526601, 0.28189092,
       0.31520337, 0.31317283, 0.41880851, 0.40852776, 0.54021388,
       0.45597116, 0.72896876, 0.50316102, 0.45194643, 0.51753493,
       0.27349971, 0.51885455, 0.6573062 , 0.41118903, 0.66461787,
       0.56428242, 0.4240042 , 0.57219588, 0.42534559, 0.41698957,
       0.60587189, 0.581448  , 0.61356115, 0.3570982 , 0.53835151,
       0.34202293, 0.9096319 , 0.61800897, 0.57254001, 0.40213048,
       0.58585   , 0.28946928, 0.4817679 , 0.28569713, 0.66077115,
       0.45086406, 0.75454239, 0.33839089, 0.44585012, 0.32747286,
       0.62952805, 0.53023269, 0.41163647, 0.45177699, 0.38416087,
       0.44731063, 0.57693007, 0.78807064, 0.59621358, 0.5449587 ,
       0.45089273, 0.52615422, 0.51620041, 0.59048529, 0.36917