In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the claimants dataset from the CSV file next to the notebook.
claims_csv = 'claimants.csv'
df = pd.read_csv(claims_csv)

In [3]:
# Preview the first five rows (five is also the pandas default).
df.head(n=5)

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


In [4]:
# Summarise the loaded frame: per-column dtype and non-null count, to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ATTORNEY  1340 non-null   int64  
 1   CLMSEX    1328 non-null   float64
 2   CLMINSUR  1299 non-null   float64
 3   SEATBELT  1292 non-null   float64
 4   CLMAGE    1151 non-null   float64
 5   LOSS      1340 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 62.9 KB


In [5]:
# Discard every row that has a missing value in any column (the pandas defaults, spelled out).
df = df.dropna(axis=0, how='any')

In [20]:
# Re-inspect the frame after the rows with missing values were dropped.
# NOTE(review): this cell's execution count (20) is out of sequence with its neighbours —
# re-run the notebook top to bottom to confirm the displayed output is current.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1096 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ATTORNEY  1096 non-null   int64  
 1   CLMSEX    1096 non-null   float64
 2   CLMINSUR  1096 non-null   float64
 3   SEATBELT  1096 non-null   float64
 4   CLMAGE    1096 non-null   float64
 5   LOSS      1096 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 59.9 KB


In [6]:
# Pairwise Pearson correlation between all columns (Pearson is the pandas default).
df.corr(method='pearson')

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
ATTORNEY,1.0,0.097475,0.084609,-0.060795,0.001132,-0.225769
CLMSEX,0.097475,1.0,0.058164,-0.017814,-0.017942,-0.029742
CLMINSUR,0.084609,0.058164,1.0,0.021217,0.048166,0.019502
SEATBELT,-0.060795,-0.017814,0.021217,1.0,-0.028343,0.131182
CLMAGE,0.001132,-0.017942,0.048166,-0.028343,1.0,0.065513
LOSS,-0.225769,-0.029742,0.019502,0.131182,0.065513,1.0


In [7]:
# ATTORNEY is the prediction target; every remaining column is used as a feature.
y = df['ATTORNEY']
x = df.drop(columns=['ATTORNEY'])

In [8]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test partitions. The seed is fixed so the
# partition is reproducible; the split proportions are left at the library default.
splits = train_test_split(x, y, random_state=123456)
x_train, x_test, y_train, y_test = splits

In [9]:
def train_model_lg():
    """Fit a default-configured LogisticRegressionCV on the training split.

    Reads the notebook-level ``x_train`` and ``y_train``.

    Returns:
        The fitted LogisticRegressionCV estimator.
    """
    from sklearn.linear_model import LogisticRegressionCV

    classifier = LogisticRegressionCV()
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(x_train, y_train)

In [10]:
def train_model_knn():
    """Fit a default-configured KNeighborsClassifier on the training split.

    Reads the notebook-level ``x_train`` and ``y_train``.

    Returns:
        The fitted KNeighborsClassifier estimator.
    """
    from sklearn.neighbors import KNeighborsClassifier

    classifier = KNeighborsClassifier()
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(x_train, y_train)

In [11]:
def train_model_nb():
    """Fit a default-configured GaussianNB on the training split.

    Reads the notebook-level ``x_train`` and ``y_train``.

    Returns:
        The fitted GaussianNB estimator.
    """
    from sklearn.naive_bayes import GaussianNB

    classifier = GaussianNB()
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(x_train, y_train)

In [12]:
def train_model_svm():
    """Fit a support-vector classifier (C=2.0, other settings default) on the training split.

    Reads the notebook-level ``x_train`` and ``y_train``.

    Returns:
        The fitted SVC estimator.
    """
    from sklearn.svm import SVC

    classifier = SVC(C=2.0)
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(x_train, y_train)

In [13]:
def train_model_dt(random_state=123456):
    """Fit a depth-3 decision tree on the training split.

    Reads the notebook-level ``x_train`` and ``y_train``.

    Args:
        random_state: Seed passed to DecisionTreeClassifier. scikit-learn permutes
            the candidate features at every split, so without a fixed seed equally
            good splits can be resolved differently from run to run. The default
            reuses the seed of the train/test split so a full re-run reproduces
            the same tree.

    Returns:
        The fitted DecisionTreeClassifier estimator.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(max_depth=3, random_state=random_state)
    model.fit(x_train, y_train)
    return model

In [14]:
def evaluate_model(model, model_name):
    """Score a fitted classifier on the held-out test split.

    Predicts on the notebook-level ``x_test`` and compares against ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        model_name: Label reported alongside the scores.

    Returns:
        Tuple ``(model_name, accuracy, precision, recall, f1)``.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    predictions = model.predict(x_test)

    # Apply the metrics in the order the result tuple reports them.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [scorer(y_test, predictions) for scorer in scorers]

    return (model_name, *scores)

In [15]:
# Train each classifier once, in a fixed order, and keep the fitted model together with
# its display label. The trailing (row, col) pair is carried along but is not read by the
# evaluation or prediction cells — presumably a subplot grid slot; TODO confirm.
models = [
    (build(), label, slot)
    for build, label, slot in (
        (train_model_lg, 'Logistic Regression', (0, 0)),
        (train_model_nb, 'Naive Bayes', (0, 1)),
        (train_model_knn, 'KNN', (1, 0)),
        (train_model_svm, 'SVM', (1, 1)),
        (train_model_dt, 'Decision Tree', (2, 0)),
    )
]

In [16]:
# One (name, accuracy, precision, recall, f1) tuple per trained model, in training order.
results = [
    evaluate_model(fitted, label)
    for fitted, label, _ in models
]

In [17]:
# Tabulate the per-model scores; column order mirrors the tuples from evaluate_model.
metric_columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
result_df = pd.DataFrame(data=results, columns=metric_columns)
result_df

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.656934,0.621622,0.707692,0.661871
1,Naive Bayes,0.583942,0.541237,0.807692,0.648148
2,KNN,0.620438,0.601562,0.592308,0.596899
3,SVM,0.671533,0.623457,0.776923,0.691781
4,Decision Tree,0.711679,0.725664,0.630769,0.674897


In [18]:
from scipy.stats import mode
def predict_result(CLMAGE,CLMSEX,SEATBELT,CLMINSUR,LOSS):    
    # get the prediction using all the models
    results = []
    for (model, model_name, position) in models:
        print(f"prediction using {model_name} = {model.predict([[CLMAGE,CLMSEX,SEATBELT,CLMINSUR,LOSS]])}")
        results.append(model.predict([[CLMAGE,CLMSEX,SEATBELT,CLMINSUR,LOSS]])[0])

    final_result = mode(results)
    print(f"the prediction for the patient = {final_result.mode}")

In [21]:
predict_result(0,1,0,18,0)

prediction using Logistic Regression = [1]
prediction using Naive Bayes = [1]
prediction using KNN = [1]
prediction using SVM = [1]
prediction using Decision Tree = [1]
the prediction for the patient = 1


