In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# 1-2. Read the training table and, in the same step, discard the three
#      columns that are not used for modelling.
df = pd.read_csv("Training_Dataset.csv").drop(
    columns=["ic_number", "plate_number", "coverage_amount"]
)

# 3. Target vector first, then the feature frame (everything except the target).
y = df["approval_flag"]
X = df.drop(columns=["approval_flag"])

# 4. Column groups consumed by the preprocessing step.
#    Everything in `numeric_features` is standard-scaled (this includes the
#    *_flag columns); `categorical_features` is one-hot encoded.
numeric_features = [
    "age",
    "months_as_customer",
    "vehicle_age_years",
    "policy_expired_flag",
    "deductible_amount",
    "market_value",
    "damage_severity_score",
    "repair_amount",
    "at_fault_flag",
    "time_to_report_days",
    "claim_reported_to_police_flag",
    "license_type_missing_flag",
    "num_third_parties",
    "num_witnesses",
]
categorical_features = [
    "vehicle_make",
]

# 5. Build preprocessing & modeling pipeline
#    - numeric columns are standardised with StandardScaler
#    - vehicle_make is one-hot encoded; drop="first" omits one indicator
#      column per categorical feature
#
# The encoder is left at its default output format and the ColumnTransformer
# is asked to always return a dense array via sparse_threshold=0. The former
# `sparse=False` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing it raises TypeError on
# current releases; densifying at the ColumnTransformer level produces the
# same dense design matrix on both old and new versions.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ],
    sparse_threshold=0,
)

# NOTE(review): the encoder uses the default handle_unknown="error", so a
# vehicle_make value not seen during fit will make predict() raise — confirm
# this is the desired behaviour for the exported model.
clf = Pipeline([
    ("preproc", preprocessor),
    ("logreg", LogisticRegression(max_iter=1000, class_weight="balanced")),
])

# 6. Train/Test split
#    20% of the rows are held out; the split is stratified on the target and
#    seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 7. Fit model
#    The whole pipeline (preprocessing + logistic regression) is fitted on the
#    training portion only.
clf.fit(X=X_train, y=y_train)

# 8. Evaluate on hold-out set
#    Column 1 of predict_proba feeds the ROC AUC; the hard labels feed the
#    classification report and the confusion matrix.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Heading and body are emitted by one print call each, separated by a newline.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("ROC AUC:", roc_auc_score(y_test, y_prob))

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# 9. Cross-validated AUC
#    Five shuffled, stratified folds (seed 42) over the full X / y, i.e. the
#    rows of the hold-out set above take part in this estimate as well.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=clf, X=X, y=y, scoring="roc_auc", cv=cv)
print("5-fold CV AUC: %.3f ± %.3f" % (cv_scores.mean(), cv_scores.std()))

# 10. Inspect learned coefficients vs. feature names
#     Names are assembled in the same order the preprocessor lists its
#     transformers: the scaled numeric columns first, then the one-hot columns.
feature_names_num = numeric_features
ohe = clf.named_steps["preproc"].named_transformers_["cat"]
feature_names_cat = list(ohe.get_feature_names_out(categorical_features))
feature_names = [*feature_names_num, *feature_names_cat]

# First row of coef_ from the fitted logistic regression step.
coefs = clf.named_steps["logreg"].coef_[0]

# Build the table and order it by absolute coefficient magnitude, largest first.
coef_df = (
    pd.DataFrame({"feature": feature_names, "coefficient": coefs})
    .sort_values(by="coefficient", key=pd.Series.abs, ascending=False)
)

print("\nTop features by absolute weight:")
print(coef_df.head(10))



Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.91      0.70       100
           1       0.99      0.92      0.96       900

    accuracy                           0.92      1000
   macro avg       0.78      0.92      0.83      1000
weighted avg       0.95      0.92      0.93      1000

ROC AUC: 0.9829111111111111
Confusion Matrix:
[[ 91   9]
 [ 69 831]]
5-fold CV AUC: 0.982 ± 0.006

Top features by absolute weight:
                          feature  coefficient
8                   at_fault_flag    -2.624088
3             policy_expired_flag    -2.558228
10  claim_reported_to_police_flag     2.491517
9             time_to_report_days    -2.352828
13                  num_witnesses     1.579771
1              months_as_customer     1.503320
15           vehicle_make_Mazda 3     1.041442
6           damage_severity_score    -0.760721
11      license_type_missing_flag    -0.520848
5                    market_value    -0.502518




In [2]:
import joblib

# Persist the fitted pipeline (preprocessing + logistic regression) to disk.
# The path is defined once and reused in the message: previously the file was
# written as "Approval_Model.pkl" while the message reported
# "approval_model.pkl", which is a different path on case-sensitive
# filesystems.
MODEL_PATH = "Approval_Model.pkl"

# NOTE(review): `clf` was fitted on the training split only in the previous
# cell — consider refitting on all rows before exporting; confirm intent.
joblib.dump(clf, MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")

Model saved to approval_model.pkl
