In [1]:
# %pip install seaborn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset CSV (path relative to the notebook) into a DataFrame.
DATASET_PATH = "./AIML Dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [4]:
# Peek at the first five rows of the freshly loaded frame.
data.head(n=5)

In [5]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [6]:
# Column labels of the loaded frame.
data.columns

In [7]:
# Class balance of the target: number of rows per isFraud value.
# A notebook cell only renders its last expression, so this is the single
# expression kept in the cell.
data["isFraud"].value_counts()

In [8]:
# Number of rows per isFlaggedFraud value.
# A notebook cell only renders its last expression, so this is the single
# expression kept in the cell.
data["isFlaggedFraud"].value_counts()

In [9]:
# Total number of missing cells in the whole frame: per-column counts, then summed.
missing_mask = data.isnull()
missing_mask.sum().sum()

In [10]:
# Same missing-cell total, computed through isna() instead of isnull().
na_mask = data.isna()
na_mask.sum().sum()

In [11]:
# (rows, columns) of the loaded frame.
data.shape

In [12]:
# Percentage of rows labelled as fraud (isFraud == 1).
# Averaging the boolean mask gives fraud_rows / total_rows directly, without
# indexing value_counts() by the label 1, which raises KeyError when the frame
# contains no fraud rows at all.
fraud_pct = float(data["isFraud"].eq(1).mean() * 100)
print(round(fraud_pct, 2), "%")

In [13]:
# Bar chart of how many rows fall into each transaction type.
type_counts = data["type"].value_counts()
ax = type_counts.plot(kind="bar", title="Transaction Type", color="purple")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.show()

In [14]:
# Mean of isFraud within each transaction type, highest first.
# Assuming isFraud is a 0/1 label, each bar is the share of fraudulent rows
# for that type rather than a count.
fraud_by_type = data.groupby("type")["isFraud"].mean().sort_values(ascending=False)
ax = fraud_by_type.plot(kind="bar", title="Fraud Rate by Type", color="red")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Fraud Rate")
plt.show()

In [15]:
# Summary statistics of the amount column, cast to integers for compact display.
amount_stats = data["amount"].describe()
amount_stats.astype(int)

In [16]:
# Distribution of transaction amounts on a logarithmic scale.
# log1p (log(1 + x)) maps a zero amount to 0, whereas log/log2 would turn it
# into -inf and emit a RuntimeWarning, leaving a non-finite value in the
# histogram input.
ax = sns.histplot(np.log1p(data["amount"]), bins=100, color="green")
ax.set_xlabel("log1p(amount)")
ax.set_title("Transaction Amount Distribution (log scale)")
plt.show()

In [17]:
# Box plot of amount per isFraud value, restricted to rows below a cap
# (presumably so the largest transactions do not squash the boxes).
AMOUNT_CAP = 200000
below_cap = data[data["amount"] < AMOUNT_CAP]

sns.boxplot(data=below_cap, x="isFraud", y="amount")
# The row counts are taken from the pre-filtered frame instead of repeating the
# filter inside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
plt.title(f"Amount vs isFraud ( {len(below_cap)} out of {len(data)} )")
plt.show()

In [18]:
# data[data["amount"] > 181]["amount"].head()

In [19]:
# Data type of every column.
data.dtypes

In [20]:
# Another look at the first five rows before preparing features.
data.head(n=5)

In [21]:
# No columns are removed from `data` itself at this point; just preview the
# first five rows again.
data.head(n=5)

In [22]:
# Separate the target column (Y) from all remaining columns (X).
Y = data["isFraud"]
X = data.drop(columns="isFraud")

# Show the first few target values.
Y.head()

In [23]:
# Column names used as model inputs, grouped by kind.
categorical = [
    "type",
]
numericals = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
# Hold out 30% of the rows for evaluation. stratify=Y keeps the class
# proportions of Y the same in both parts, and random_state pins the shuffle so
# that re-running the notebook reproduces the same split and the same metrics.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)

In [37]:
# Preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones (dropping the first level of each), and discard every
# column that is listed in neither group.
numeric_step = ("num", StandardScaler(), numericals)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

In [39]:
# End-to-end model: the preprocessing step followed by a logistic regression
# with balanced class weights and a raised iteration limit.
classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
pipeline = Pipeline(steps=[("prep", preprocessor), ("clf", classifier)])

In [41]:
# Fit the preprocessing and the classifier on the training part only.
pipeline.fit(X=x_train, y=y_train)

In [44]:
# Predicted labels for the held-out rows.
y_pred = pipeline.predict(X=x_test)

In [48]:
# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [52]:
# Confusion matrix on the held-out rows (rows: true labels, columns: predictions).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

In [53]:
# Score of the fitted pipeline on the held-out rows (mean accuracy for a
# scikit-learn classifier).
# NOTE(review): if fraud rows are rare, accuracy alone can look high while
# fraud recall is poor — read it together with the classification report.
pipeline.score(X=x_test, y=y_test)

In [54]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.

import joblib

MODEL_PATH = "fraud_detection_pipeline.pkl"
joblib.dump(pipeline, MODEL_PATH)