In [6]:
import pandas as pd
from statsmodels.formula.api import logit

drivers = pd.read_csv('./datasets/Car_Insurance_Claim.csv')
print(drivers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10000 non-null  int64  
 1   AGE                  10000 non-null  object 
 2   GENDER               10000 non-null  object 
 3   RACE                 10000 non-null  object 
 4   DRIVING_EXPERIENCE   10000 non-null  object 
 5   EDUCATION            10000 non-null  object 
 6   INCOME               10000 non-null  object 
 7   CREDIT_SCORE         9018 non-null   float64
 8   VEHICLE_OWNERSHIP    10000 non-null  float64
 9   VEHICLE_YEAR         10000 non-null  object 
 10  MARRIED              10000 non-null  float64
 11  CHILDREN             10000 non-null  float64
 12  POSTAL_CODE          10000 non-null  int64  
 13  ANNUAL_MILEAGE       9043 non-null   float64
 14  VEHICLE_TYPE         10000 non-null  object 
 15  SPEEDING_VIOLATIONS  10000 non-null  

In [None]:
# Fill missing values with Column mean
credit_score_mean = drivers['CREDIT_SCORE'].mean()
mileage_mean = drivers['ANNUAL_MILEAGE'].mean()

drivers['CREDIT_SCORE'] = drivers['CREDIT_SCORE'].fillna(credit_score_mean)
drivers['ANNUAL_MILEAGE'] = drivers['ANNUAL_MILEAGE'].fillna(mileage_mean)

print(drivers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10000 non-null  int64  
 1   AGE                  10000 non-null  object 
 2   GENDER               10000 non-null  object 
 3   RACE                 10000 non-null  object 
 4   DRIVING_EXPERIENCE   10000 non-null  object 
 5   EDUCATION            10000 non-null  object 
 6   INCOME               10000 non-null  object 
 7   CREDIT_SCORE         10000 non-null  float64
 8   VEHICLE_OWNERSHIP    10000 non-null  float64
 9   VEHICLE_YEAR         10000 non-null  object 
 10  MARRIED              10000 non-null  float64
 11  CHILDREN             10000 non-null  float64
 12  POSTAL_CODE          10000 non-null  int64  
 13  ANNUAL_MILEAGE       10000 non-null  float64
 14  VEHICLE_TYPE         10000 non-null  object 
 15  SPEEDING_VIOLATIONS  10000 non-null  

In [16]:
models_list = []
features = drivers.drop(['ID','OUTCOME'], axis=1).columns

for col in features:
    model = logit(f"OUTCOME ~ {col}", data=drivers).fit()
    models_list.append(model)

accuracies = []
for mod in range(0, len(models_list)):
    conf_matrix = models_list[mod].pred_table()
    TN = conf_matrix[0,0]
    TP = conf_matrix[1,1]
    FP = conf_matrix[0,1]
    FN = conf_matrix[1,0]

    accuracy = (TN + TP) / (TN + TP + FN + FP)
    accuracies.append(accuracy)

best_feature = features[accuracies.index(max(accuracies))]
best_feature_df = pd.DataFrame({'Best feature': best_feature,
                                'Best accuracy':max(accuracies)}, index=[0])

print(accuracies)
print(best_feature_df)

Optimization terminated successfully.
         Current function value: 0.506484
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.615951
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621682
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.467092
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.603742
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.531499
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.572557
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.552412
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.572668
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.586659
  