In [1]:
# Import required modules
import pandas as pd
import numpy as np
from statsmodels.formula.api import logit

In [None]:
# Load the car-insurance records from the CSV file into a DataFrame
cars = pd.read_csv(filepath_or_buffer="car_insurance.csv")

In [None]:
# Impute missing values in each column with that column's mean.
# The filled Series is assigned back to `cars` instead of calling
# fillna(..., inplace=True) on a column selection: the in-place form is a
# chained assignment that pandas warns about and that does not modify `cars`
# when Copy-on-Write is enabled.
cars["credit_score"] = cars["credit_score"].fillna(cars["credit_score"].mean())
cars["annual_mileage"] = cars["annual_mileage"].fillna(cars["annual_mileage"].mean())

In [None]:
# Container for the fitted models, one entry per feature
models = list()

In [None]:
# Candidate predictors: every column except the row identifier and the target
features = cars.columns.drop(["id", "outcome"])

In [None]:
# Fit one single-predictor logistic regression of `outcome` per feature.
# The list is built in this cell (rather than appended to a list created in
# another cell) so that re-running the cell cannot leave duplicate models
# behind; models[i] therefore always corresponds to features[i].
models = [logit(f"outcome ~ {col}", data=cars).fit() for col in features]

In [None]:
# Container for the accuracy of each fitted model
accuracies = list()

In [None]:
# Compute the accuracy of every fitted model.
# The list is rebuilt here so that re-running this cell never appends a second
# copy of the results; accuracies[i] always belongs to models[i].
accuracies = []
for model in models:
    # pred_table() is the 2x2 confusion matrix (statsmodels' default
    # threshold of 0.5): row = observed class, column = predicted class,
    # i.e. [[true negatives, false positives], [false negatives, true positives]]
    (tn, fp), (fn, tp) = model.pred_table()
    # Accuracy = correctly classified observations / all observations
    accuracies.append((tn + tp) / (tn + fn + fp + tp))

In [None]:
# Locate the highest accuracy, then look up the feature at the same position
# (the first one wins if several features tie)
top_accuracy = max(accuracies)
top_position = accuracies.index(top_accuracy)
best_feature = features[top_position]

In [None]:
# Summarise the winning feature and its accuracy as a one-row DataFrame
summary_row = {
    "best_feature": best_feature,
    "best_accuracy": max(accuracies),
}
best_feature_df = pd.DataFrame(summary_row, index=[0])
best_feature_df