# Import all the necessary libraries

In [103]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Import data

In [71]:
df = pd.read_csv('claimants.csv')

In [72]:
df

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,5,0,0.0,1.0,0.0,50.0,34.940
1,3,1,1.0,0.0,0.0,18.0,0.891
2,66,1,0.0,1.0,0.0,5.0,0.330
3,70,0,0.0,1.0,1.0,31.0,0.037
4,96,1,0.0,1.0,0.0,30.0,0.038
...,...,...,...,...,...,...,...
1335,34100,1,0.0,1.0,0.0,,0.576
1336,34110,0,1.0,1.0,0.0,46.0,3.705
1337,34113,1,1.0,1.0,0.0,39.0,0.099
1338,34145,0,1.0,0.0,0.0,8.0,3.177


# Exploratory Data Analysis(EDA)

In [73]:
df.shape

(1340, 7)

In [74]:
df.dtypes

CASENUM       int64
ATTORNEY      int64
CLMSEX      float64
CLMINSUR    float64
SEATBELT    float64
CLMAGE      float64
LOSS        float64
dtype: object

In [75]:
df.isna().sum()

CASENUM       0
ATTORNEY      0
CLMSEX       12
CLMINSUR     41
SEATBELT     48
CLMAGE      189
LOSS          0
dtype: int64

In [76]:
df.columns

Index(['CASENUM', 'ATTORNEY', 'CLMSEX', 'CLMINSUR', 'SEATBELT', 'CLMAGE',
       'LOSS'],
      dtype='object')

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   1340 non-null   int64  
 1   ATTORNEY  1340 non-null   int64  
 2   CLMSEX    1328 non-null   float64
 3   CLMINSUR  1299 non-null   float64
 4   SEATBELT  1292 non-null   float64
 5   CLMAGE    1151 non-null   float64
 6   LOSS      1340 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 73.4 KB


In [78]:
df.describe()

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
count,1340.0,1340.0,1328.0,1299.0,1292.0,1151.0,1340.0
mean,11202.001493,0.488806,0.558735,0.907621,0.017028,28.414422,3.806307
std,9512.750796,0.500061,0.496725,0.289671,0.129425,20.304451,10.636903
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4177.0,0.0,0.0,1.0,0.0,9.0,0.4
50%,8756.5,0.0,1.0,1.0,0.0,30.0,1.0695
75%,15702.5,1.0,1.0,1.0,0.0,43.0,3.7815
max,34153.0,1.0,1.0,1.0,1.0,95.0,173.604


In [104]:
df['CLMSEX'].fillna(1, inplace=True) # 0 -> Female, 1 -> Male

In [105]:
df['SEATBELT'].fillna(1, inplace=True) # 1 -> wearing seatbelt, 0 -> Not wearing seatbelt

In [106]:
df['CLMINSUR'].fillna(1, inplace=True) # 1 -> They have Insurance, 0 -> They have not Insurance

In [107]:
df['CLMAGE'].fillna(df['CLMAGE'].mean(), inplace=True)

In [83]:
df.isna().sum()

CASENUM     0
ATTORNEY    0
CLMSEX      0
CLMINSUR    0
SEATBELT    0
CLMAGE      0
LOSS        0
dtype: int64

In [84]:
x = df.drop(labels=['CASENUM', 'ATTORNEY'], axis=1)

In [85]:
y = df[['ATTORNEY']]

# Model Validation Techniques
1 -> Train-Test-split
2 -> K-fold Cross Validation
3 -> Leave one out Cross Validation

In [151]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [215]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, train_size=0.80, random_state=42)

In [88]:
x_train.shape

(1072, 5)

In [89]:
y_train.shape

(1072, 1)

In [90]:
x_test.shape

(268, 5)

In [91]:
y_test.shape

(268, 1)

In [216]:
x_train

Unnamed: 0,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
1148,1.0,1.0,0.0,7.000000,3.734
807,0.0,1.0,0.0,16.000000,111.405
1287,0.0,0.0,0.0,10.000000,0.040
590,1.0,1.0,0.0,6.000000,0.100
1188,1.0,1.0,0.0,44.000000,13.000
...,...,...,...,...,...
1095,0.0,1.0,0.0,47.000000,0.080
1130,1.0,1.0,0.0,30.000000,5.595
1294,1.0,1.0,0.0,28.414422,1.041
860,1.0,0.0,0.0,55.000000,1.769


In [217]:
model = LogisticRegression()

In [218]:
model.fit(x_train, y_train)

In [219]:
y_pred_train = model.predict(x_train)

In [220]:
y_pred_train

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [221]:
accuracy_score(y_train, y_pred_train)

0.7042910447761194

In [222]:
y_pred_test = model.predict(x_test)

In [223]:
accuracy_score(y_test, y_pred_test)

0.7425373134328358

In [224]:
confusion_matrix(y_test, y_pred_test)

array([[ 85,  43],
       [ 26, 114]], dtype=int64)