In [None]:
import pandas as pd
# Read the raw dataset; the path is relative to the current working directory.
csv_path = "fraud_oracle.csv"
df = pd.read_csv(csv_path)

In [None]:
# Discard rows whose Age is recorded as 0 (treated as bad entries).
has_valid_age = df['Age'] != 0
df = df[has_valid_age]

# PolicyNumber and Year are not used as features, so drop both columns.
df = df.drop(columns=["PolicyNumber", "Year"])

In [None]:
# Lookup tables that turn calendar labels into numbers.
# Each tuple lists every accepted spelling (full name and three-letter
# abbreviation) for one month / weekday; its 1-based position in the outer
# tuple is the numeric code. 'May' has a single spelling.
_MONTH_LABELS = (
    ('January', 'Jan'),
    ('February', 'Feb'),
    ('March', 'Mar'),
    ('April', 'Apr'),
    ('May',),
    ('June', 'Jun'),
    ('July', 'Jul'),
    ('August', 'Aug'),
    ('September', 'Sep'),
    ('October', 'Oct'),
    ('November', 'Nov'),
    ('December', 'Dec'),
)
_DAY_LABELS = (
    ('Monday', 'Mon'),
    ('Tuesday', 'Tue'),
    ('Wednesday', 'Wed'),
    ('Thursday', 'Thu'),
    ('Friday', 'Fri'),
    ('Saturday', 'Sat'),
    ('Sunday', 'Sun'),
)

# January -> 1 ... December -> 12
month_to_number = {
    label: number
    for number, labels in enumerate(_MONTH_LABELS, start=1)
    for label in labels
}
# Monday -> 1 ... Sunday -> 7
day_to_number = {
    label: number
    for number, labels in enumerate(_DAY_LABELS, start=1)
    for label in labels
}

In [None]:
# Replace month / weekday labels by their numeric codes, column by column.
# Series.map with a dict yields NaN for any label that is not a key of the
# lookup table, so unexpected spellings surface as missing values.
calendar_columns = (
    ('Month', month_to_number),
    ('MonthClaimed', month_to_number),
    ('DayOfWeek', day_to_number),
    ('DayOfWeekClaimed', day_to_number),
)
for column, lookup in calendar_columns:
    df[column] = df[column].map(lookup)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes
columns_to_encode = [
    'AccidentArea',
    'Sex',
    'MaritalStatus',
    'Fault',
    'VehicleCategory',
    'PastNumberOfClaims',
    'PoliceReportFiled',
    'WitnessPresent',
    'AgentType',
    'BasePolicy',
]

# One independent LabelEncoder per column, kept by column name so the fitted
# encoders stay available after this cell.
label_encoders = {column: LabelEncoder() for column in columns_to_encode}

# Fit each encoder on its own column and overwrite the column with the codes
for column, label_encoder in label_encoders.items():
    df[column] = label_encoder.fit_transform(df[column])

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to be one-hot encoded. This list is the single source of truth for
# the encoder input and for the generated feature names below (it used to be
# repeated verbatim in three places).
columns_to_encode = [
    'Make',
    'PolicyType',
    'VehiclePrice',
    'Days_Policy_Accident',
    'Days_Policy_Claim',
    'AgeOfVehicle',
    'AgeOfPolicyHolder',
    'AddressChange_Claim',
    'NumberOfCars',
    'NumberOfSuppliments',
]

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[columns_to_encode])

# fit_transform returns a sparse matrix; densify it and label the indicator
# columns with the names generated by the encoder. Reusing df's index keeps
# every encoded row aligned with the row it was derived from.
df_encoded = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(columns_to_encode),
    index=df.index,
)


In [None]:
# Remove the raw categorical columns; their one-hot expansion lives in
# df_encoded.
df = df.drop(
    columns=[
        'Make',
        'PolicyType',
        'VehiclePrice',
        'Days_Policy_Accident',
        'Days_Policy_Claim',
        'AgeOfVehicle',
        'AgeOfPolicyHolder',
        'AddressChange_Claim',
        'NumberOfCars',
        'NumberOfSuppliments',
    ]
)

# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position. drop=True discards the old index directly instead of first
# materialising it as an 'index' column and dropping that column afterwards.
df = df.reset_index(drop=True)
df_encoded = df_encoded.reset_index(drop=True)

#concat both data frames
concatenated_df = pd.concat([df, df_encoded], axis=1)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, brier_score_loss
import matplotlib.pyplot as plt
from tabulate import tabulate

# Target vector (FraudFound_P) and feature matrix (every other column)
y = np.array(concatenated_df["FraudFound_P"])
x = concatenated_df.drop("FraudFound_P", axis=1)

# The four classifiers compared in this notebook. Each gets a fixed seed (the
# same value used for the split and the samplers) so that re-running the
# notebook reproduces the same fitted models and scores.
random_forest = RandomForestClassifier(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
xgboost = xgb.XGBClassifier(random_state=42)
lightgbm = lgb.LGBMClassifier(random_state=42)


In [None]:

# Hold out 20% of the rows for testing (adjust test_size / random_state as
# needed). The notebook treats the target as imbalanced (see the resampling
# cells), so stratify on y to keep the same class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:

# Resample the training split with SMOTE; only the minority class is targeted.
smote = SMOTE(sampling_strategy="minority", random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Re-fit the shared estimators in place on the resampled training data; they
# reflect this cell's resampling until another sampling cell is run.
for estimator in (random_forest, decision_tree, xgboost, lightgbm):
    estimator.fit(X_smote, y_smote)

# Label recorded alongside the evaluation results
sampling_technique = "SMOTE"


In [None]:
# Resample the training split with RandomOverSampler; only the minority class
# is targeted.
random_oversampler = RandomOverSampler(random_state=42, sampling_strategy="minority")
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

# Re-fit the shared estimators in place on the resampled training data; they
# reflect this cell's resampling until another sampling cell is run.
for estimator in (random_forest, decision_tree, xgboost, lightgbm):
    estimator.fit(X_resampled, y_resampled)

# Label recorded alongside the evaluation results
sampling_technique = "Random oversampler"

In [None]:

# Resample the training split with ADASYN; only the minority class is targeted.
adasyn = ADASYN(random_state=42, sampling_strategy="minority")
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

# Re-fit the shared estimators in place on the resampled training data; they
# reflect this cell's resampling until another sampling cell is run.
for estimator in (random_forest, decision_tree, xgboost, lightgbm):
    estimator.fit(X_adasyn, y_adasyn)

# Label recorded alongside the evaluation results
sampling_technique = "ADASYN"


In [None]:

# Resample the training split with BorderlineSMOTE; only the minority class is
# targeted.
borderline_smote = BorderlineSMOTE(random_state=42, sampling_strategy="minority")
X_borderline_smote, y_borderline_smote = borderline_smote.fit_resample(X_train, y_train)

# Re-fit the shared estimators in place on the resampled training data; they
# reflect this cell's resampling until another sampling cell is run.
for estimator in (random_forest, decision_tree, xgboost, lightgbm):
    estimator.fit(X_borderline_smote, y_borderline_smote)

# Label recorded alongside the evaluation results
sampling_technique = "borderline smote"


In [None]:
# Create a RandomUnderSampler.
# For an under-sampler, "majority" removes rows of the majority class until it
# matches the minority class. The previous "not majority" setting targets
# every class except the majority one, so (assuming a two-class target) the
# majority class was never reduced and the training data stayed imbalanced.
# random_state is fixed like in the other sampling cells so the selected rows
# are reproducible.
Random_undersampler = RandomUnderSampler(sampling_strategy="majority", random_state=42)

# Resample the training data using RandomUnderSampler
x_random_undersampler, y_random_undersampler = Random_undersampler.fit_resample(X_train, y_train)

# Train the models on the under-sampled training data (re-fits the shared
# estimators in place)
random_forest.fit(x_random_undersampler, y_random_undersampler)
decision_tree.fit(x_random_undersampler, y_random_undersampler)
xgboost.fit(x_random_undersampler, y_random_undersampler)
lightgbm.fit(x_random_undersampler, y_random_undersampler)

# Label recorded alongside the evaluation results
sampling_technique = "random undersampler"

In [None]:
import matplotlib.pyplot as plt

# Evaluate the fitted models on the held-out test split.
# NOTE: the estimators carry whatever the most recently executed sampling
# cell fitted; that cell also set `sampling_technique`, which is stored in
# every row of `data` below.

# Hard class predictions for each model
rf_predictions = random_forest.predict(X_test)
dt_predictions = decision_tree.predict(X_test)
xgb_predictions = xgboost.predict(X_test)
lgb_predictions = lightgbm.predict(X_test)


def _score_model(model, predictions):
    """Score one fitted classifier against the test split.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    predictions : hard class labels already predicted by ``model`` for X_test

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, brier_score)``.
    """
    # The Brier score measures the squared error of predicted probabilities.
    # Feeding it hard 0/1 labels (as was done before) reduces it to
    # 1 - accuracy, so use the probability of the positive class instead.
    # Column 1 assumes the labels are 0/1 with 1 as the positive class, the
    # same positive label the f1/precision/recall calls default to.
    positive_probability = model.predict_proba(X_test)[:, 1]
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions),
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
        brier_score_loss(y_test, positive_probability),
    )


# Accuracy, F1 score, precision, recall and Brier score for each model
rf_accuracy, rf_f1, rf_precision, rf_recall, rf_brier_score = _score_model(
    random_forest, rf_predictions
)
dt_accuracy, dt_f1, dt_precision, dt_recall, dt_brier_score = _score_model(
    decision_tree, dt_predictions
)
xgb_accuracy, xgb_f1, xgb_precision, xgb_recall, xgb_brier_score = _score_model(
    xgboost, xgb_predictions
)
lgb_accuracy, lgb_f1, lgb_precision, lgb_recall, lgb_brier_score = _score_model(
    lightgbm, lgb_predictions
)

# Per-metric lists, all in the same model order as `models`
models = ['Random Forest', 'Decision Tree', 'XGBoost', 'LightGBM']
accuracies = [rf_accuracy, dt_accuracy, xgb_accuracy, lgb_accuracy]
f1_scores = [rf_f1, dt_f1, xgb_f1, lgb_f1]
precisions = [rf_precision, dt_precision, xgb_precision, lgb_precision]
recalls = [rf_recall, dt_recall, xgb_recall, lgb_recall]
brier_scores = [rf_brier_score, dt_brier_score, xgb_brier_score, lgb_brier_score]

# One row per model, in a list-of-lists layout suitable for tabulate:
# [model, accuracy, f1, precision, recall, brier score, sampling technique]
data = [
    list(row) + [sampling_technique]
    for row in zip(models, accuracies, f1_scores, precisions, recalls, brier_scores)
]

import numpy as np

# Grouped bar chart: one group per model, one bar per metric.
# `models` and the per-metric lists are the ones built earlier in this cell;
# the verbatim "example data" re-definition that used to sit here was removed.
metric_series = [
    ('Accuracy', accuracies),
    ('F1 Score', f1_scores),
    ('Precision', precisions),
    ('Recall', recalls),
    ('Brier Score', brier_scores),
]

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Bar positions: one tick per model. The bars of a group share 80% of the
# unit tick spacing so neighbouring groups stay visually separated (five bars
# of width 0.2 filled the whole spacing and made the groups touch).
bar_positions = np.arange(len(models))
bar_width = 0.8 / len(metric_series)
centre_slot = (len(metric_series) - 1) / 2

# Plot each metric for all models; the slot offset centres the group on its
# tick. The five bar containers keep their original names.
bars1, bars2, bars3, bars4, bars5 = [
    ax.bar(
        bar_positions + (slot - centre_slot) * bar_width,
        values,
        bar_width,
        label=label,
    )
    for slot, (label, values) in enumerate(metric_series)
]

# Adding labels and title
ax.set_xlabel('Models')
ax.set_ylabel('Performance Measures')
ax.set_title('Model Performance Comparison')
ax.set_xticks(bar_positions)
ax.set_xticklabels(models)
ax.legend()

# Adding value labels above each bar
def add_value_labels(bars):
    """Write each bar's height, rounded to two decimals, just above the bar.

    The text is drawn on the module-level ``ax``, horizontally centred on the
    bar and shifted 3 points upwards from its top edge.

    Parameters
    ----------
    bars : iterable of bar objects exposing ``get_height``, ``get_x`` and
        ``get_width`` (e.g. the container returned by ``ax.bar``).
    """
    for rect in bars:
        value = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f'{round(value, 2)}',
            xy=(centre_x, value),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Annotate every bar of every metric series with its value
for bar_group in (bars1, bars2, bars3, bars4, bars5):
    add_value_labels(bar_group)

# Tighten the layout, then show the plot
plt.tight_layout()
plt.show()
