In [10]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import logit
from sklearn.metrics import accuracy_score

# Read the raw CSV from the project repository and discard the 'id' column
# in a single chained expression, so the frame is never left half-prepared.
car_insurance_df = pd.read_csv(
    "https://raw.githubusercontent.com/AdedoyinJoshuaMastery/Car_Insurance_Claim_Prediction/main/data/car_insurance.csv"
).drop(columns=["id"])

# Separate the target column from the candidate predictor columns.
y = car_insurance_df["outcome"]
X = car_insurance_df.drop(columns=["outcome"])

# Correlation screen: pairwise correlations are computed over the numeric
# columns only, then the 'outcome' column of that matrix is taken (minus its
# self-correlation entry) as each feature's correlation with the target.
correlation_matrix = car_insurance_df.select_dtypes(include="number").corr()
correlations = correlation_matrix["outcome"].drop(labels="outcome", errors="ignore")

# Choose the feature with the largest absolute correlation; fall back to
# None when there are no numeric features to rank.
if correlations.empty:
    best_feature = None
else:
    best_feature = correlations.abs().idxmax()

print(f"The best feature predictor (correlation method) is: {best_feature}")

# Single-feature model search: fit one logistic regression per candidate
# feature and remember the feature whose in-sample accuracy is highest.
best_accuracy = 0
best_feature_final = None

for feature in X.columns:
    # Fit and score on the rows where both the feature and the target are
    # observed. Scoring on the full frame is wrong when a feature has
    # missing values: statsmodels drops those rows when fitting from a
    # formula, but predict() returns NaN for them, and `NaN > 0.5` is False,
    # so every such row would silently be counted as a predicted 0.
    feature_df = car_insurance_df[[feature, "outcome"]].dropna()
    if feature_df.empty:
        # Nothing to fit on (column is entirely missing); skip it.
        continue

    # Non-numeric features are wrapped in C() so the formula treats them as
    # categorical. is_numeric_dtype is used instead of `dtype == 'object'`
    # so that non-object string/categorical dtypes are also covered.
    if pd.api.types.is_numeric_dtype(feature_df[feature]):
        formula = f'outcome ~ {feature}'
    else:
        formula = f'outcome ~ C({feature})'

    # Fit logistic regression (disp=False silences the optimizer report)
    mdl_outcome = logit(formula, data=feature_df).fit(disp=False)

    # Predict probabilities for exactly the rows the model was fitted on,
    # then threshold at 0.5 to obtain class labels
    y_pred = mdl_outcome.predict(feature_df)
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Compare against the target values of the same rows
    accuracy = accuracy_score(feature_df["outcome"], y_pred_binary)

    # Strict '>' keeps the first feature encountered when accuracies tie
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_feature_final = feature

# Package the winning feature and its accuracy into a one-row summary frame.
best_feature_df = pd.DataFrame(
    data=dict(
        best_feature=[best_feature_final],
        best_accuracy=[best_accuracy],
    )
)

print(best_feature_df)

The best feature predictor (correlation method) is: age
         best_feature  best_accuracy
0  driving_experience         0.7771
