**Fraud Detection in Insurance Claims Detect fraudulent insurance claims using machine learning. Implement a fraud detection model using decision trees or neural networks. Set up a database to store claim data. Create a web interface to input claim data and display fraud detection results. Use the Insurance Company Benchmark (COIL 2000) Dataset.**

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


####Importing libraries and loading data set

In [2]:
import pandas as pd

# Load the dataset
data_path = '/content/drive/MyDrive/fraud_data/tic_2000_train_data (2).csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataset
print(data.head())



   MOSTYPE  MAANTHUI  MGEMOMV  MGEMLEEF  MOSHOOFD  MGODRK  MGODPR  MGODOV  \
0       33         1        3         2         8       0       5       1   
1       37         1        2         2         8       1       4       1   
2       37         1        2         2         8       0       4       2   
3        9         1        3         3         3       2       3       2   
4       40         1        4         2        10       1       4       1   

   MGODGE  MRELGE  ...  AGEZONG  AWAOREG  ABRAND  AZEILPL  APLEZIER  AFIETS  \
0       3       7  ...        0        0       1        0         0       0   
1       4       6  ...        0        0       1        0         0       0   
2       4       3  ...        0        0       1        0         0       0   
3       4       5  ...        0        0       1        0         0       0   
4       4       7  ...        0        0       1        0         0       0   

   AINBOED  ABYSTAND  CARAVAN  fraud  
0        0         0   

####Data Preprocessing and otlier detection

In [3]:
from sklearn.preprocessing import LabelEncoder

# Check for missing values
print(data.isnull().sum())

# Fill missing values or drop missing values
# Here, we fill missing values with the median of the column
data = data.fillna(data.median())

# Encode categorical variables using LabelEncoder
for column in data.columns:
    if data[column].dtype == 'object':
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])

# Separate features and target variable
X = data.drop('fraud', axis=1)  # 'fraud' is the target variable
y = data['fraud']

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


MOSTYPE     0
MAANTHUI    0
MGEMOMV     0
MGEMLEEF    0
MOSHOOFD    0
           ..
AFIETS      0
AINBOED     0
ABYSTAND    0
CARAVAN     0
fraud       0
Length: 87, dtype: int64
Shape of X: (4000, 86)
Shape of y: (4000,)


Claculating the count of Fraud Class

In [4]:
fraud_class = {0:'Not Fraud', 1:'Fraud'}
print(data.fraud.value_counts().rename(index = fraud_class))

fraud
Not Fraud    3762
Fraud         238
Name: count, dtype: int64


####Selecting top Features

In [None]:
all_features = data[]

In [5]:
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd

# Apply SelectKBest to extract top 20 best features
bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X, y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # Naming the dataframe columns
top_features = featureScores.nlargest(15, 'Score')['Specs'].values

print("Top Features:")
print(top_features)



Top Features:
['PPERSAUT' 'MAUT2' 'PWAOREG' 'MINK123M' 'MOSTYPE' 'MBERBOER' 'PWERKT'
 'PBRAND' 'MAUT1' 'PPERSONG' 'PVRAAUT' 'MBERMIDD' 'PBROM' 'AWERKT'
 'ABRAND']


####Training the Decision tree Classification model

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Select the top features
X_top = X[top_features]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train the Decision Tree classifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

# Make predictions
y_pred = dtree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.8725
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.93       760
           1       0.02      0.03      0.02        40

    accuracy                           0.87       800
   macro avg       0.48      0.47      0.48       800
weighted avg       0.90      0.87      0.89       800



Improving the decisionTreeClassifier Accuracy

In [7]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Predict using the best model
best_dtree = grid_search.best_estimator_
y_pred = best_dtree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found:  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy: 0.9425
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       760
           1       0.00      0.00      0.00        40

    accuracy                           0.94       800
   macro avg       0.47      0.50      0.49       800
weighted avg       0.90      0.94      0.92       800



In [8]:
import joblib

model_path = 'decision_tree_model.pkl'
joblib.dump(best_dtree, model_path)
print("Model saved to", model_path)


Model saved to decision_tree_model.pkl


####Database setup

In [9]:
import sqlite3

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('claims_data.db')
cursor = conn.cursor()

# Create a table to store claims data
cursor.execute('''
CREATE TABLE IF NOT EXISTS claims (
    id INTEGER PRIMARY KEY,
    PPERSAUT INTEGER,
    MAUT2 INTEGER,
    PWAOREG INTEGER,
    MINK123M INTEGER,
    MOSTYPE INTEGER,
    MBERBOER INTEGER,
    PWERKT INTEGER,
    PBRAND INTEGER,
    MAUT1 INTEGER,
    PPERSONG INTEGER,
    PVRAAUT INTEGER,
    MBERMIDD INTEGER,
    PBROM INTEGER,
    AWERKT INTEGER,
    ABRAND INTEGER,
    prediction TEXT
)
''')

conn.commit()
conn.close()



####Implementing Flask Application


In [10]:
!pip install pyngrok



Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [11]:
from pyngrok import ngrok

ngrok.set_auth_token("2hnyVKjmoFzGwqMpib10EwM7Ody_7LTgqoNwiN5VQ47PV4X4n")




In [21]:
from flask import Flask, request, render_template, jsonify
import pandas as pd
import sqlite3
import csv
import joblib
from pyngrok import ngrok

app = Flask(__name__, template_folder='/content/drive/MyDrive/fraud_data/templates')

# Path to your model file in Google Drive
model_path = '/content/drive/MyDrive/fraud_data/decision_tree_model.pkl'

# Load the trained model
model = joblib.load(model_path)

# Function to predict fraud
def predict_fraud(input_data):
    input_df = pd.DataFrame([input_data])
    prediction = model.predict(input_df)
    return 'Fraud' if prediction[0] == 1 else 'Not Fraud'

# Home route to display the form
@app.route('/')
def home():
    return render_template('index.html')

# Route to handle form submission
@app.route('/predict', methods=['POST'])
def predict():
    input_data = request.form.to_dict()

    # Convert all form values to integers
    input_data = {k: int(v) for k, v in input_data.items()}

    prediction = predict_fraud(input_data)

    # Save the input data and prediction to the database
    conn = sqlite3.connect('/content/drive/MyDrive/claims_data (1).db')
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS claims (
            PPERSAUT INTEGER,
            MAUT2 INTEGER,
            PWAOREG INTEGER,
            MINK123M INTEGER,
            MOSTYPE INTEGER,
            MBERBOER INTEGER,
            PWERKT INTEGER,
            PBRAND INTEGER,
            MAUT1 INTEGER,
            PPERSONG INTEGER,
            PVRAAUT INTEGER,
            MBERMIDD INTEGER,
            PBROM INTEGER,
            AWERKT INTEGER,
            ABRAND INTEGER,
            prediction TEXT
        )
    ''')
    c.execute('''
        INSERT INTO claims (PPERSAUT, MAUT2, PWAOREG, MINK123M, MOSTYPE, MBERBOER, PWERKT, PBRAND, MAUT1, PPERSONG, PVRAAUT, MBERMIDD, PBROM, AWERKT, ABRAND, prediction)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (input_data['PPERSAUT'], input_data['MAUT2'], input_data['PWAOREG'], input_data['MINK123M'], input_data['MOSTYPE'], input_data['MBERBOER'], input_data['PWERKT'], input_data['PBRAND'], input_data['MAUT1'], input_data['PPERSONG'], input_data['PVRAAUT'], input_data['MBERMIDD'], input_data['PBROM'], input_data['AWERKT'], input_data['ABRAND'], prediction))
    conn.commit()
    conn.close()

    # Save the input data and prediction to a CSV file
    with open('/content/drive/MyDrive/fraud_data/claims_data.csv', mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_data['PPERSAUT'], input_data['MAUT2'], input_data['PWAOREG'], input_data['MINK123M'], input_data['MOSTYPE'], input_data['MBERBOER'], input_data['PWERKT'], input_data['PBRAND'], input_data['MAUT1'], input_data['PPERSONG'], input_data['PVRAAUT'], input_data['MBERMIDD'], input_data['PBROM'], input_data['AWERKT'], input_data['ABRAND'], prediction])

    return render_template('result.html', prediction=prediction)

if __name__ == "__main__":
    # Start ngrok tunnel
    public_url = ngrok.connect(5000)
    print(" * Tunnel URL:", public_url)

    app.run()



 * Tunnel URL: NgrokTunnel: "https://7c24-34-81-52-75.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [26/Jun/2024 05:29:42] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [26/Jun/2024 05:29:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [26/Jun/2024 05:30:29] "POST /predict HTTP/1.1" 200 -


In [None]:
!python your_flask_app_file.py
