# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Solution

In [7]:
# Rank every candidate feature by the hold-out accuracy of a logistic
# regression trained on that feature alone, then report the best one.
df = pd.read_csv('data/car_insurance.csv')

# `id` is dropped together with the target so it is never used as a predictor.
x = df.drop(columns=['id', 'outcome'])
y = df['outcome']

# Split once, outside the loop: with a fixed random_state the row partition is
# the same for every single-column frame, so re-splitting per feature only
# repeated identical work. Every feature is scored on the same hold-out rows.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

results = []

for col in x.columns:
    # Double brackets keep a one-column DataFrame, which ColumnTransformer needs.
    xi_train = x_train[[col]]
    xi_test = x_test[[col]]

    # Exactly one of these lists is non-empty for a single-column frame; the
    # transformer given the empty list is simply not applied.
    cat_cols = xi_train.select_dtypes(include=['object']).columns.tolist()
    num_cols = xi_train.select_dtypes(exclude=['object']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            # Numeric column: fill missing values with the training mean.
            ('num', SimpleImputer(strategy='mean'), num_cols),
            # Categorical column: fill with the mode, then one-hot encode;
            # categories unseen during fit are encoded as all zeros.
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ]), cat_cols),
        ]
    )

    model = Pipeline([
        ('preprocess', preprocessor),
        ('clf', LogisticRegression(max_iter=1000)),
    ])

    model.fit(xi_train, y_train)
    preds = model.predict(xi_test)

    acc = accuracy_score(y_test, preds)
    results.append((col, acc))

# A stable sort keeps features with identical accuracy in their original column
# order, so the ranking (and the `iloc[0]` winner on a tie) is deterministic.
results_df = pd.DataFrame(results, columns=['feature', 'accuracy']).sort_values(
    by='accuracy', ascending=False, kind='stable'
)

# One-row summary frame holding the top-ranked feature and its accuracy.
top_row = results_df.iloc[0]
best_feature_df = pd.DataFrame({
    'best_feature': [top_row['feature']],
    'best_accuracy': [top_row['accuracy']]
})

print(best_feature_df)
print()
print(results_df)

  best_feature  best_accuracy
0          age          0.783

                feature  accuracy
0                   age    0.7830
2    driving_experience    0.7780
4                income    0.7440
6     vehicle_ownership    0.7280
5          credit_score    0.6990
11       annual_mileage    0.6890
3             education    0.6835
1                gender    0.6835
7          vehicle_year    0.6835
8               married    0.6835
9              children    0.6835
10          postal_code    0.6835
12         vehicle_type    0.6835
13  speeding_violations    0.6835
14                 duis    0.6835
15       past_accidents    0.6835


# Code accepted by DataCamp


In [12]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import logit


# Rank each candidate feature by the in-sample accuracy of a single-predictor
# logistic regression (statsmodels formula API) and report the best one.
df = pd.read_csv("data/car_insurance.csv")

# Cast the response to int before it is used on the left-hand side of the formula.
df["outcome"] = df["outcome"].astype(int)

features = [col for col in df.columns if col not in ["id", "outcome"]]

results = []

for feature in features:
    formula = f"outcome ~ {feature}"

    try:
        model = logit(formula, data=df).fit(disp=0)
        # NOTE(review): rows with a missing feature value appear to be excluded
        # from the fit and to come back from predict() as NaN -- confirm against
        # the installed statsmodels version. Dropping them here prevents
        # `NaN >= 0.5` (always False) from silently counting those rows as a
        # predicted 0.
        probs = model.predict(df).dropna()
    except Exception as exc:
        # Still best-effort (one bad feature must not abort the ranking), but
        # the skip is reported instead of swallowed, and KeyboardInterrupt /
        # SystemExit are no longer caught.
        print(f"Skipping {feature!r}: {type(exc).__name__}: {exc}")
        continue

    # Threshold the predicted probabilities at 0.5 and score only the rows that
    # actually received a prediction, aligned by index.
    preds = (probs >= 0.5).astype(int)
    accuracy = np.mean(preds == df.loc[probs.index, "outcome"])

    results.append((feature, accuracy))

# Fail with a clear message rather than max()'s "empty sequence" error.
if not results:
    raise RuntimeError("No single-feature logit model could be fitted.")

best_feature, best_accuracy = max(results, key=lambda item: item[1])

best_feature_df = pd.DataFrame({
    "best_feature": [best_feature],
    "best_accuracy": [best_accuracy]
})

best_feature_df


Unnamed: 0,best_feature,best_accuracy
0,driving_experience,0.7771
