Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

DATASET

In [None]:
data = pd.read_csv('Dataset_fraud_detection.csv')
X = data.drop(columns=['fraud_reported'])
y = data['fraud_reported']

In [None]:
print(X)

     months_as_customer  age  policy_number  policy_deductable  \
0                   328   48         521585               1000   
1                   228   42         342868               2000   
2                   134   29         687698               2000   
3                   256   41         227811               2000   
4                   228   44         367455               1000   
..                  ...  ...            ...                ...   
995                   3   38         941851               1000   
996                 285   41         186934               1000   
997                 130   34         918516                500   
998                 458   62         533940               2000   
999                 456   60         556080               1000   

     policy_annual_premium insured_gender insured_education_level  \
0                  1406.91           MALE                      MD   
1                  1197.22           MALE                      MD   


In [None]:
print(y)

0      1
1      1
2      0
3      1
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: fraud_reported, Length: 1000, dtype: int64


Encoding Categorical data

In [None]:
categorical_features = [5, 6, 9, 10, 11, 14, 17,22]
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), categorical_features)],remainder='passthrough')
X_transformed = ct.fit_transform(X)

In [None]:
print(X_transformed)

[[0.000e+00 1.000e+00 0.000e+00 ... 6.510e+03 1.302e+04 5.208e+04]
 [0.000e+00 1.000e+00 0.000e+00 ... 7.800e+02 7.800e+02 3.510e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 7.700e+03 3.850e+03 2.310e+04]
 ...
 [1.000e+00 0.000e+00 0.000e+00 ... 7.500e+03 7.500e+03 5.250e+04]
 [0.000e+00 1.000e+00 1.000e+00 ... 5.220e+03 5.220e+03 3.654e+04]
 [1.000e+00 0.000e+00 1.000e+00 ... 4.600e+02 9.200e+02 3.680e+03]]


Spliting the data into test and train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3)

In [None]:
print(X_train)

[[1.000e+00 0.000e+00 0.000e+00 ... 7.340e+03 1.468e+04 5.872e+04]
 [1.000e+00 0.000e+00 0.000e+00 ... 7.050e+03 1.410e+04 4.935e+04]
 [1.000e+00 0.000e+00 0.000e+00 ... 1.138e+04 1.138e+04 3.983e+04]
 ...
 [0.000e+00 1.000e+00 0.000e+00 ... 8.400e+02 8.400e+02 2.940e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 6.500e+03 3.250e+03 2.600e+04]
 [0.000e+00 1.000e+00 0.000e+00 ... 1.060e+03 5.300e+02 4.240e+03]]


In [None]:
print(X_test)

[[0.000e+00 1.000e+00 0.000e+00 ... 1.238e+04 1.238e+04 4.333e+04]
 [1.000e+00 0.000e+00 0.000e+00 ... 1.176e+04 1.176e+04 4.116e+04]
 [1.000e+00 0.000e+00 0.000e+00 ... 9.900e+03 9.900e+03 4.455e+04]
 ...
 [0.000e+00 1.000e+00 0.000e+00 ... 9.760e+03 9.760e+03 3.904e+04]
 [1.000e+00 0.000e+00 0.000e+00 ... 1.512e+04 7.560e+03 7.560e+04]
 [1.000e+00 0.000e+00 1.000e+00 ... 4.300e+02 8.600e+02 2.580e+03]]


In [None]:
print(y_train)

310    1
768    0
177    0
418    0
190    0
      ..
875    1
185    1
552    1
515    0
790    0
Name: fraud_reported, Length: 700, dtype: int64


In [None]:
print(y_test)

583    0
897    0
714    0
754    0
377    0
      ..
396    0
26     0
879    0
974    1
925    0
Name: fraud_reported, Length: 300, dtype: int64


Train Logistic Regression Model

In [None]:
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 0:22] = sc.fit_transform(X_train[:, 0:22])
X_test[:, 0:22] = sc.transform(X_test[:, 0:22])

Predicting a New Result

In [None]:
custom_input = pd.DataFrame([{
    'months_as_customer': 328,
    'age': 48,
    'policy_number': 521585,
    'policy_deductable': 1000,
    'policy_annual_premium': 14060.91,
    'insured_gender': 'MALE',
    'insured_education_level': 'MD',
    'capital-gains': 53300,
    'capital-loss': 0,
    'collision type': 'Side Collision',
    'incident_severity': 'Major Damage',
    'authorities_contacted': 'Police',
    'incident_hour_of_the_day': 5,
    'number_of_vehicles_involved': 1,
    'property_damage': 'YES',
    'bodily_injuries': 1,
    'witnesses': 2,
    'police_report_available': 'YES',
    'total_claim_amount' : 71610,
    'injury_claim': 6510,
    'property_claim': 13020,
    'vehicle_claim': 52080,
    'auto_make': 'Saab'
}])

custom_input_transformed = ct.transform(custom_input)
prediction = model.predict(custom_input_transformed)
print(prediction)

[0]


Predictions on the test data

In [None]:
y_pred = model.predict(X_test)

Evaluation Of the Model

In [None]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
print(accuracy)

0.7633333333333333


In [None]:
print(conf_matrix)

[[228   4]
 [ 67   1]]
