In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the claimants dataset from a CSV file resolved against the working directory.
df=pd.read_csv('claimants.csv')

In [3]:
# Summarise dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ATTORNEY  1340 non-null   int64  
 1   CLMSEX    1328 non-null   float64
 2   CLMINSUR  1299 non-null   float64
 3   SEATBELT  1292 non-null   float64
 4   CLMAGE    1151 non-null   float64
 5   LOSS      1340 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 62.9 KB


In [4]:
# Preview the first rows to see how each column is encoded.
df.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


In [5]:
# Pairwise correlations between all columns, computed before any rows are dropped.
df.corr()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
ATTORNEY,1.0,0.079674,0.079439,-0.057189,0.01068,-0.219715
CLMSEX,0.079674,1.0,0.066413,-0.015024,-0.023195,-0.027611
CLMINSUR,0.079439,0.066413,1.0,0.021154,0.049493,0.005821
SEATBELT,-0.057189,-0.015024,0.021154,1.0,-0.027992,0.111189
CLMAGE,0.01068,-0.023195,0.049493,-0.027992,1.0,0.064612
LOSS,-0.219715,-0.027611,0.005821,0.111189,0.064612,1.0


In [6]:
# Keep only the rows that have a value in every column.
df = df.dropna(axis=0)

In [7]:
# Re-check the frame after dropping rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1096 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ATTORNEY  1096 non-null   int64  
 1   CLMSEX    1096 non-null   float64
 2   CLMINSUR  1096 non-null   float64
 3   SEATBELT  1096 non-null   float64
 4   CLMAGE    1096 non-null   float64
 5   LOSS      1096 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 59.9 KB


In [8]:
# Target: the ATTORNEY column. Features: every remaining column.
x = df.drop(columns=['ATTORNEY'])
y = df['ATTORNEY']

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=123456,
)

In [10]:
# Correlation matrix recomputed on the rows that remain after dropna.
df.corr()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
ATTORNEY,1.0,0.097475,0.084609,-0.060795,0.001132,-0.225769
CLMSEX,0.097475,1.0,0.058164,-0.017814,-0.017942,-0.029742
CLMINSUR,0.084609,0.058164,1.0,0.021217,0.048166,0.019502
SEATBELT,-0.060795,-0.017814,0.021217,1.0,-0.028343,0.131182
CLMAGE,0.001132,-0.017942,0.048166,-0.028343,1.0,0.065513
LOSS,-0.225769,-0.029742,0.019502,0.131182,0.065513,1.0


In [11]:
def train_model_lg():
    """Fit a cross-validated logistic regression on the training split.

    Reads the notebook-level ``x_train`` / ``y_train`` and uses the
    estimator's default settings.

    Returns:
        The fitted ``LogisticRegressionCV`` estimator.
    """
    from sklearn.linear_model import LogisticRegressionCV
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return LogisticRegressionCV().fit(x_train,y_train)

In [12]:
def train_model_KNN():
    """Fit a k-nearest-neighbours classifier on the training split.

    Reads the notebook-level ``x_train`` / ``y_train`` and uses the
    estimator's default settings.

    Returns:
        The fitted ``KNeighborsClassifier`` estimator.
    """
    from sklearn.neighbors import KNeighborsClassifier
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(x_train,y_train)
    return knn_clf

In [13]:
def train_model_svm():
    """Fit a support vector classifier on the training split.

    Reads the notebook-level ``x_train`` / ``y_train`` and uses the
    estimator's default settings.

    Returns:
        The fitted ``SVC`` estimator.
    """
    from sklearn.svm import SVC
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return SVC().fit(x_train,y_train)

In [14]:
def train_model_nb():
    """Fit a Gaussian naive Bayes classifier on the training split.

    Reads the notebook-level ``x_train`` / ``y_train`` and uses the
    estimator's default settings.

    Returns:
        The fitted ``GaussianNB`` estimator.
    """
    from sklearn.naive_bayes import GaussianNB
    nb_clf = GaussianNB()
    nb_clf.fit(x_train,y_train)
    return nb_clf

In [15]:
def train_model_rf(random_state=123456):
    """Fit a random forest classifier on the training split.

    Reads the notebook-level ``x_train`` / ``y_train``. A random forest
    draws bootstrap samples and feature subsets at random, so without a
    seed every run of the notebook produces a different model and
    different metrics.

    Args:
        random_state: Seed forwarded to ``RandomForestClassifier``. The
            default reuses the seed of the notebook's train/test split.
            Pass ``None`` to get the previous unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier`` estimator.
    """
    from sklearn.ensemble import RandomForestClassifier
    model=RandomForestClassifier(random_state=random_state)
    model.fit(x_train,y_train)
    return model

In [16]:
def train_model_dt(random_state=123456):
    """Fit a decision tree classifier on the training split.

    Reads the notebook-level ``x_train`` / ``y_train``. scikit-learn
    permutes the candidate features at each split, so equally good splits
    can be resolved differently from run to run unless a seed is fixed.

    Args:
        random_state: Seed forwarded to ``DecisionTreeClassifier``. The
            default reuses the seed of the notebook's train/test split.
            Pass ``None`` to get the previous unseeded behaviour.

    Returns:
        The fitted ``DecisionTreeClassifier`` estimator.
    """
    from sklearn.tree import DecisionTreeClassifier
    model=DecisionTreeClassifier(random_state=random_state)
    model.fit(x_train,y_train)
    return model

In [17]:
def evaluate_model(model,model_name):
    """Score a fitted classifier on the held-out split.

    Predicts on the notebook-level ``x_test`` and compares the result with
    ``y_test`` using scikit-learn's accuracy, precision, recall and F1
    scorers with their default settings.

    Args:
        model: A fitted estimator exposing ``predict``.
        model_name: Label carried through unchanged as the first element
            of the returned tuple.

    Returns:
        A tuple ``(model_name, accuracy, precision, recall, f1)``.
    """
    from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
    predicted=model.predict(x_test)
    # Order matters: it must line up with the column order of the results table.
    scorers=(accuracy_score,precision_score,recall_score,f1_score)
    return (model_name,*(scorer(y_test,predicted) for scorer in scorers))
    

In [18]:
# Train every candidate classifier once and pair it with a display label.
# The label is what shows up in the results table and in the per-model
# prediction printout, so it is spelled the way it should be read.
models=[
    (train_model_lg(),"Logistic Regression"),
    (train_model_KNN(),"KNN"),
    (train_model_nb(),"Naive Bayes"),
    (train_model_svm(),"Support Vector Machine"),
    (train_model_dt(),"Decision Tree"),
    (train_model_rf(),"Random Forest")
]

In [20]:
# Score every trained model on the held-out split; each entry is the
# (name, accuracy, precision, recall, f1) tuple returned by evaluate_model.
results=[evaluate_model(fitted,label) for fitted,label in models]

In [21]:
# One row per algorithm, one column per value returned by evaluate_model.
result_df=pd.DataFrame(
    data=results,
    columns=['Algorithm','accuracy','precision','recall','f1'],
)

In [22]:
# Show the metric comparison table for all trained models.
result_df

Unnamed: 0,Algorithm,accuracy,precision,recall,f1
0,Logistic Regression,0.665653,0.629412,0.694805,0.660494
1,KNN,0.632219,0.609272,0.597403,0.603279
2,Naive Bays,0.574468,0.529412,0.818182,0.642857
3,Support Vector Machine,0.641337,0.588235,0.779221,0.670391
4,Decision Tree,0.659574,0.62069,0.701299,0.658537
5,Random Forest,0.665653,0.644737,0.636364,0.640523


In [27]:
# List the feature columns, in the order the models were trained on them.
x.columns

Index(['CLMSEX', 'CLMINSUR', 'SEATBELT', 'CLMAGE', 'LOSS'], dtype='object')

In [25]:
# Look at a few rows again for example feature values.
df.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


In [28]:
from scipy.stats import mode
def predict_result(CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS):
    """Print every trained model's prediction for one claimant, then the majority vote.

    Iterates over the notebook-level ``models`` list of
    ``(fitted_model, label)`` pairs. Each model predicts exactly once. The
    claimant is passed as a one-row DataFrame whose column names follow the
    training feature order, so estimators fitted on a DataFrame do not warn
    about missing feature names.

    Args:
        CLMSEX: Value for the ``CLMSEX`` feature.
        CLMINSUR: Value for the ``CLMINSUR`` feature.
        SEATBELT: Value for the ``SEATBELT`` feature.
        CLMAGE: Value for the ``CLMAGE`` feature.
        LOSS: Value for the ``LOSS`` feature.

    Returns:
        None. The individual predictions and the most frequent one are printed.
    """
    # Same order as the feature frame used for training (see ``x.columns``).
    feature_names = ['CLMSEX','CLMINSUR','SEATBELT','CLMAGE','LOSS']
    claimant = pd.DataFrame(
        [[CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS]],
        columns=feature_names,
    )

    results = []
    for (model, model_name) in models:
        # Predict once and reuse the result for both the printout and the vote.
        prediction = model.predict(claimant)
        print(f"prediction using {model_name} = {prediction}")
        results.append(prediction[0])

    final_result = mode(results)
    # Older SciPy returns a length-1 array here and newer SciPy a scalar;
    # ravel()[0] yields the bare label in both cases.
    majority = np.ravel(final_result.mode)[0]
    print(f"the prediction for the ATTORNEY = {majority}")

In [30]:
# Example claimant, spelled out by feature name so the positional order cannot be mixed up.
predict_result(CLMSEX=1,CLMINSUR=1,SEATBELT=0,CLMAGE=45,LOSS=34)

prediction using Logistic Regression = [0]
prediction using KNN = [0]
prediction using Naive Bays = [0]
prediction using Support Vector Machine = [0]
prediction using Decision Tree = [0]
prediction using Random Forest = [0]
the prediction for the ATTORNEY = 0


