In [1]:
import pandas as pd
df = pd.read_csv("fraud_oracle.csv")

In [2]:
#removing bad age values
df = df[df['Age'] != 0]

#removing policy number and year
df = df.drop(["PolicyNumber","Year"],axis=1)

In [3]:
#setting a number value for each month and day of the week
month_to_number = {
    'January': 1, 'Jan': 1,
    'February': 2, 'Feb': 2,
    'March': 3, 'Mar': 3,
    'April': 4, 'Apr': 4,
    'May': 5,
    'June': 6, 'Jun': 6,
    'July': 7, 'Jul': 7,
    'August': 8, 'Aug': 8,
    'September': 9, 'Sep': 9,
    'October': 10, 'Oct': 10,
    'November': 11, 'Nov': 11,
    'December': 12, 'Dec': 12
}
day_to_number = {
    'Monday': 1, 'Mon': 1,
    'Tuesday': 2, 'Tue': 2,
    'Wednesday': 3, 'Wed': 3,
    'Thursday': 4, 'Thu': 4,
    'Friday': 5, 'Fri': 5,
    'Saturday': 6, 'Sat': 6,
    'Sunday': 7, 'Sun': 7
}

In [4]:
#changing the months and days values to numerical values
df['Month'] = df['Month'].map(month_to_number)
df['MonthClaimed'] = df['MonthClaimed'].map(month_to_number)
df['DayOfWeek'] = df['DayOfWeek'].map(day_to_number)
df['DayOfWeekClaimed'] = df['DayOfWeekClaimed'].map(day_to_number)

In [5]:
from sklearn.preprocessing import LabelEncoder

# List of column names to encode
columns_to_encode = ['AccidentArea','Sex','MaritalStatus','Fault','VehicleCategory','PastNumberOfClaims','PoliceReportFiled','WitnessPresent','AgentType','BasePolicy']

# LabelEncoder for each selected column
label_encoders = {}

# Encode the selected columns
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# columns to be one-hot encoded
columns_to_encode = ['Make', 'PolicyType', 'VehiclePrice','Days_Policy_Accident','Days_Policy_Claim','AgeOfVehicle','AgeOfPolicyHolder','AddressChange_Claim','NumberOfCars','NumberOfSuppliments']

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[['Make', 'PolicyType', 'VehiclePrice','Days_Policy_Accident','Days_Policy_Claim','AgeOfVehicle','AgeOfPolicyHolder','AddressChange_Claim','NumberOfCars','NumberOfSuppliments']])
df_encoded = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(['Make', 'PolicyType', 'VehiclePrice','Days_Policy_Accident','Days_Policy_Claim','AgeOfVehicle','AgeOfPolicyHolder','AddressChange_Claim','NumberOfCars','NumberOfSuppliments']))


In [7]:
df = df.drop(['Make', 'PolicyType', 'VehiclePrice','Days_Policy_Accident','Days_Policy_Claim','AgeOfVehicle','AgeOfPolicyHolder','AddressChange_Claim','NumberOfCars','NumberOfSuppliments'],axis=1)
df = df.reset_index()
df = df.drop('index',axis=1)
df_encoded = df_encoded.reset_index()
df_encoded = df_encoded.drop('index',axis=1)

#concat both data frames
concatenated_df = pd.concat([df, df_encoded], axis=1)

In [35]:
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
# Import and initialize your classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tabulate import tabulate

y = np.array(concatenated_df["FraudFound_P"])
x = concatenated_df.drop("FraudFound_P",axis=1)

random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()
xgboost = xgb.XGBClassifier()
lightgbm = lgb.LGBMClassifier()


In [38]:

# Split the data into training and testing sets (adjust the test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42,sampling_strategy="minority")
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(X_smote, y_smote)
decision_tree.fit(X_smote, y_smote)
xgboost.fit(X_smote, y_smote)
lightgbm.fit(X_smote, y_smote)

sampling_technique = "SMOTE"


[LightGBM] [Info] Number of positive: 11369, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10591
[LightGBM] [Info] Number of data points in the train set: 22738, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [17]:
# Create a RandomOverSampler
random_oversampler = RandomOverSampler(sampling_strategy="minority", random_state=42)

# Resample the training data using RandomOverSampler
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(X_resampled, y_resampled)
decision_tree.fit(X_resampled, y_resampled)
xgboost.fit(X_resampled, y_resampled)
lightgbm.fit(X_resampled, y_resampled)

sampling_technique = "Random oversampler"

[LightGBM] [Info] Number of positive: 11369, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 22738, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [19]:
# Split the data into training and testing sets (adjust the test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create an ADASYN oversampler
adasyn = ADASYN(sampling_strategy="minority", random_state=42)

# Resample the training data using ADASYN
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(X_adasyn, y_adasyn)
decision_tree.fit(X_adasyn, y_adasyn)
xgboost.fit(X_adasyn, y_adasyn)
lightgbm.fit(X_adasyn, y_adasyn)

sampling_technique = "ADASYN"


[LightGBM] [Info] Number of positive: 11193, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006922 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10503
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496100 -> initscore=-0.015602
[LightGBM] [Info] Start training from score -0.015602


In [21]:
# Split the data into training and testing sets (adjust the test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create a BorderlineSMOTE oversampler
borderline_smote = BorderlineSMOTE(sampling_strategy="minority", random_state=42)

# Resample the training data using BorderlineSMOTE
X_borderline_smote, y_borderline_smote = borderline_smote.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(X_borderline_smote, y_borderline_smote)
decision_tree.fit(X_borderline_smote, y_borderline_smote)
xgboost.fit(X_borderline_smote, y_borderline_smote)
lightgbm.fit(X_borderline_smote, y_borderline_smote)

sampling_technique = "borderline smote"


[LightGBM] [Info] Number of positive: 11369, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10621
[LightGBM] [Info] Number of data points in the train set: 22738, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [33]:
# Split the data into training and testing sets (adjust the test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create an ADASYN oversampler
Tomek_Link = TomekLinks(sampling_strategy="minority")

# Resample the training data using ADASYN
X_TomekLink, y_TomekLink = adasyn.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(X_TomekLink, y_TomekLink)
decision_tree.fit(X_TomekLink, y_TomekLink)
xgboost.fit(X_TomekLink, y_TomekLink)
lightgbm.fit(X_TomekLink, y_TomekLink)

sampling_technique = "TomekLink"

[LightGBM] [Info] Number of positive: 11193, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10503
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496100 -> initscore=-0.015602
[LightGBM] [Info] Start training from score -0.015602


In [40]:
# Split the data into training and testing sets (adjust the test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create an ADASYN oversampler
Random_undersampler = RandomUnderSampler(sampling_strategy="minority")

# Resample the training data using ADASYN
x_random_undersampler, y_random_undersampler = adasyn.fit_resample(X_train, y_train)

# Train your models on the training data
random_forest.fit(x_random_undersampler, y_random_undersampler)
decision_tree.fit(x_random_undersampler, y_random_undersampler)
xgboost.fit(x_random_undersampler, y_random_undersampler)
lightgbm.fit(x_random_undersampler, y_random_undersampler)

sampling_technique = "random undersampler"

[LightGBM] [Info] Number of positive: 11193, number of negative: 11369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10503
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496100 -> initscore=-0.015602
[LightGBM] [Info] Start training from score -0.015602


In [41]:
rf_predictions = random_forest.predict(X_test)
dt_predictions = decision_tree.predict(X_test)
xgb_predictions = xgboost.predict(X_test)
lgb_predictions = lightgbm.predict(X_test)

# Calculate F1 score, precision, recall, and accuracy for each model
rf_f1 = f1_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

dt_f1 = f1_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)

xgb_f1 = f1_score(y_test, xgb_predictions)
xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

lgb_f1 = f1_score(y_test, lgb_predictions)
lgb_precision = precision_score(y_test, lgb_predictions)
lgb_recall = recall_score(y_test, lgb_predictions)
lgb_accuracy = accuracy_score(y_test, lgb_predictions)

models = ['Random Forest', 'Decision Tree', 'XGBoost', 'LightGBM']
accuracies = [rf_accuracy, dt_accuracy, xgb_accuracy, lgb_accuracy]
f1_scores = [rf_f1, dt_f1, xgb_f1, lgb_f1]
precisions = [rf_precision, dt_precision, xgb_precision, lgb_precision]
recalls = [rf_recall, dt_recall, xgb_recall, lgb_recall]

# Create a list of lists for tabulate
data = []
for model, accuracy, f1, precision, recall in zip(models, accuracies, f1_scores, precisions, recalls):
    data.append([model, accuracy, f1, precision, recall, sampling_technique])

# Define the headers
headers = ["Model", "Accuracy", "F1 Score", "Precision", "Recall", "sampling technique"]

# Create and print the table
table = tabulate(data, headers, tablefmt="pretty")
print(table)

+---------------+--------------------+---------------------+---------------------+-----------------------+---------------------+
|     Model     |      Accuracy      |      F1 Score       |      Precision      |        Recall         | sampling technique  |
+---------------+--------------------+---------------------+---------------------+-----------------------+---------------------+
| Random Forest | 0.9400662251655629 | 0.01092896174863388 |         0.5         | 0.0055248618784530384 | random undersampler |
| Decision Tree | 0.8887417218543047 | 0.16831683168316833 | 0.15246636771300448 |  0.1878453038674033   | random undersampler |
|    XGBoost    | 0.937748344370861  | 0.12149532710280374 | 0.3939393939393939  |  0.0718232044198895   | random undersampler |
|   LightGBM    | 0.9400662251655629 | 0.08121827411167512 |         0.5         |  0.04419889502762431  | random undersampler |
+---------------+--------------------+---------------------+---------------------+---------------