# HW2 of DAML
- Dataset: insurance dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv('./Insurance_dataset.csv')

In [3]:
print("Train columns:", train_df.columns)

Train columns: Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')
Test columns: Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage'],
      dtype='object')


In [4]:
train_set, val_set = train_test_split(train_df, test_size=0.2, random_state=42)

X_train = train_set.drop(columns=['Response'])
y_train = train_set['Response']
X_val = val_set.drop(columns=['Response'])
y_val = val_set['Response']

In [5]:
X_train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
20293,380337,Male,30,1,28.0,1,< 1 Year,No,60954.0,152.0,127
33179,282196,Male,26,1,6.0,1,< 1 Year,No,24532.0,152.0,216
250682,60095,Male,40,1,0.0,0,1-2 Year,Yes,2630.0,47.0,220
323143,124730,Male,25,1,8.0,1,< 1 Year,No,44259.0,152.0,223
371317,474060,Female,26,1,28.0,1,< 1 Year,No,33615.0,152.0,194
...,...,...,...,...,...,...,...,...,...,...,...
259178,290561,Male,21,1,28.0,1,< 1 Year,No,43013.0,152.0,92
365838,279902,Female,27,1,33.0,1,< 1 Year,No,37057.0,152.0,268
131932,201369,Male,70,1,33.0,0,1-2 Year,Yes,32811.0,26.0,183
146867,110405,Male,41,1,41.0,0,1-2 Year,Yes,30833.0,124.0,14


In [7]:
# one-hot encoding
X_train = pd.get_dummies(X_train, columns=['Gender', 'Vehicle_Age', 'Vehicle_Damage'])
X_val = pd.get_dummies(X_val, columns=['Gender', 'Vehicle_Age', 'Vehicle_Damage'])

# make sure that train and val have the same column
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# get rid of the special character
X_train.columns = X_train.columns.str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('<', '', regex=False).str.replace(' ', '_', regex=False)
X_val.columns = X_val.columns.str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('<', '', regex=False).str.replace(' ', '_', regex=False)

# print the column name after encoding
print("Encoded feature columns:", X_train.columns)


Encoded feature columns: Index(['id', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Gender_Female',
       'Gender_Male', 'Vehicle_Age_1-2_Year', 'Vehicle_Age__1_Year',
       'Vehicle_Age_>_2_Years', 'Vehicle_Damage_No', 'Vehicle_Damage_Yes'],
      dtype='object')


In [8]:
X_train

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Female,Gender_Male,Vehicle_Age_1-2_Year,Vehicle_Age__1_Year,Vehicle_Age_>_2_Years,Vehicle_Damage_No,Vehicle_Damage_Yes
20293,380337,30,1,28.0,1,60954.0,152.0,127,False,True,False,True,False,True,False
33179,282196,26,1,6.0,1,24532.0,152.0,216,False,True,False,True,False,True,False
250682,60095,40,1,0.0,0,2630.0,47.0,220,False,True,True,False,False,False,True
323143,124730,25,1,8.0,1,44259.0,152.0,223,False,True,False,True,False,True,False
371317,474060,26,1,28.0,1,33615.0,152.0,194,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,290561,21,1,28.0,1,43013.0,152.0,92,False,True,False,True,False,True,False
365838,279902,27,1,33.0,1,37057.0,152.0,268,True,False,False,True,False,True,False
131932,201369,70,1,33.0,0,32811.0,26.0,183,False,True,True,False,False,False,True
146867,110405,41,1,41.0,0,30833.0,124.0,14,False,True,True,False,False,False,True


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
import joblib

# initialize
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'SVC': SVC(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=10000),
    'Gaussian NB': GaussianNB(),
    'Multinomial NB': MultinomialNB()
}

results = {}
best_model_name = None
best_accuracy = 0

In [11]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    # evaluation matrix
    cm = confusion_matrix(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    
    # print the result
    print(f"Model: {name}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(classification_report(y_val, y_pred))
    print("="*60)
    
    results[name] = {
        'Confusion Matrix': cm,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Classification Report': classification_report(y_val, y_pred, output_dict=True)
    }
    
    # check the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_model = model

Model: Decision Tree
Confusion Matrix:
[[57907  5882]
 [ 5380  7262]]
Accuracy: 0.8527
Precision: 0.5525
Recall: 0.5744
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     63789
           1       0.55      0.57      0.56     12642

    accuracy                           0.85     76431
   macro avg       0.73      0.74      0.74     76431
weighted avg       0.86      0.85      0.85     76431

Model: Random Forest
Confusion Matrix:
[[62833   956]
 [ 7241  5401]]
Accuracy: 0.8928
Precision: 0.8496
Recall: 0.4272
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     63789
           1       0.85      0.43      0.57     12642

    accuracy                           0.89     76431
   macro avg       0.87      0.71      0.75     76431
weighted avg       0.89      0.89      0.88     76431

Model: XGBoost
Confusion Matrix:
[[63505   284]
 [ 7581  5061]]
Accuracy: 0.8971
Precision: 0.9469
Recall:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: SVC
Confusion Matrix:
[[63789     0]
 [12642     0]]
Accuracy: 0.8346
Precision: 0.0000
Recall: 0.0000
              precision    recall  f1-score   support

           0       0.83      1.00      0.91     63789
           1       0.00      0.00      0.00     12642

    accuracy                           0.83     76431
   macro avg       0.42      0.50      0.45     76431
weighted avg       0.70      0.83      0.76     76431

Model: KNN
Confusion Matrix:
[[60916  2873]
 [11069  1573]]
Accuracy: 0.8176
Precision: 0.3538
Recall: 0.1244
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     63789
           1       0.35      0.12      0.18     12642

    accuracy                           0.82     76431
   macro avg       0.60      0.54      0.54     76431
weighted avg       0.76      0.82      0.78     76431

Model: Logistic Regression
Confusion Matrix:
[[63202   587]
 [12284   358]]
Accuracy: 0.8316
Precision: 0.3788
Recall: 0.0283


In [12]:
# save the result into txt file
with open('model_evaluation_results.txt', 'w') as f:
    for name, result in results.items():
        f.write(f"Model: {name}\n")
        f.write(f"Confusion Matrix:\n{result['Confusion Matrix']}\n")
        f.write(f"Accuracy: {result['Accuracy']:.4f}\n")
        f.write(f"Precision: {result['Precision']:.4f}\n")
        f.write(f"Recall: {result['Recall']:.4f}\n")
        f.write(f"Classification Report:\n{result['Classification Report']}\n")
        f.write("="*60 + "\n")

# save the best model
joblib.dump(best_model, 'best_model.joblib')

print(f"Best model: {best_model_name} with accuracy: {best_accuracy:.4f}")
print("Model evaluation results saved to 'model_evaluation_results.txt'")
print("Best model saved to 'best_model.joblib'")

Best model: XGBoost with accuracy: 0.8971
Model evaluation results saved to 'model_evaluation_results.txt'
Best model saved to 'best_model.joblib'


## Testing

In [19]:
test_df = pd.read_csv('./Insurance_validation.csv')
print("Test columns:", test_df.columns)

X_test = pd.get_dummies(test_df, columns=['Gender', 'Vehicle_Age', 'Vehicle_Damage'])

X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_test.columns = X_test.columns.str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('<', '', regex=False).str.replace(' ', '_', regex=False)

In [20]:
best_model = joblib.load('best_model.joblib')

test_df['Response'] = best_model.predict(X_test)

test_df[['id', 'Response']].to_csv('insurance_predictions.csv', index=False)

print("Predictions saved to 'insurance_predictions.csv'")


Predictions saved to 'insurance_predictions.csv'
