load dataset
check head
check tail
describe the dataset
use aggregation like group by
find relationships using correlation matrix


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv("fraud_dataset.csv")

In [None]:
data.head(10)

In [None]:
data.tail()

In [None]:
data.describe().round()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.isna().sum()

In [None]:
data.groupby("type")["amount"].mean().round()

In [None]:
plt.figure(figsize=(6,6))
sns.barplot(x='type', y='amount', data=data, palette='viridis')
plt.title("Type against amount")
plt.show()

In [None]:
plt.figure(figsize=(6,6))
sns.scatterplot(x='type', y='amount', data=data, s=100, hue='type', size='isFraud')
# plt.title("Type against amount")
# plt.show()

In [None]:
plt.figure(figsize=(6,6))
sns.histplot(data['type'], bins=2, kde=True, color='purple')
plt.title("Type against amount")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
data.head()

In [None]:
data_model = data.drop(["nameOrig", "nameDest", "isFlaggedFraud"], axis=1)

In [None]:
data_model.head()

In [None]:
categorical = ["type"]
numeric = ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]

In [None]:
y = data_model["isFraud"]
X = data_model.drop("isFraud", axis=1)

In [None]:
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size=0.3, stratify=y)

In [None]:
preprocessor = ColumnTransformer(
    transformers= [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first"), categorical)
    ],
    remainder="drop"
)

In [None]:
pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=1000))
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
import joblib

joblib.dump(pipeline, "fraud_detection_pipeline.pkl")

In [None]:
random_forest_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(
        class_weight="balanced", 
        n_estimators=100, 
        random_state=42
    ))
])

In [None]:
random_forest_pipeline.fit(X_train, y_train)

In [None]:
ry_pred = random_forest_pipeline.predict(X_test)

In [None]:
y_proba = random_forest_pipeline.predict_proba(X_test)[:, 1]


In [None]:
joblib.dump(pipeline, "Rfraud_detection_pipeline.pkl")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:
rf_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(
        class_weight="balanced",
        n_estimators=100,
        random_state=42
    ))
])

xgb_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", XGBClassifier(
        scale_pos_weight=10,  # adjust for imbalance
        random_state=42
    ))
])

nn_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", MLPClassifier(
        hidden_layer_sizes=(64, 32),
        max_iter=300,
        random_state=42
    ))
])

In [None]:
stacking_model = StackingClassifier(
    estimators=[
        ("rf", rf_pipeline),
        ("xgb", xgb_pipeline),
        ("nn", nn_pipeline)
    ],
    final_estimator=LogisticRegression(class_weight="balanced", max_iter=500),
    stack_method="predict_proba",  # important for fraud detection
    n_jobs=4
)


In [None]:
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)
y_proba = stacking_model.predict_proba(X_test)[:, 1]


In [6]:
import joblib

model = joblib.load("stacking_model.pkl")

print(model)


ValueError: <class 'numpy.random._mt19937.MT19937'> is not a known BitGenerator module.