In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

: 

In [None]:
import warnings

# Ignore every warning for the rest of the session so they do not clutter
# cell output.
warnings.simplefilter("ignore")
# Apply seaborn's "whitegrid" style to all subsequent plots.
sns.set(style="whitegrid")


In [None]:
# Read the transactions CSV from the working directory into `df`, the frame
# that the later cells read from.
df = pd.read_csv(filepath_or_buffer="AIML Dataset.csv")
# Bare last expression: renders the loaded frame as the cell output.
df

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Print a summary of the frame: column dtypes, non-null counts, memory usage.
df.info()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Class balance: number of rows for each value of the isFraud label.
df["isFraud"].value_counts()

In [None]:
# Number of rows for each value of the isFlaggedFraud column.
df["isFlaggedFraud"].value_counts()

In [None]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed).
df.isna().sum().sum()

In [None]:
# Number of rows (transactions) in the frame.
len(df)

In [None]:
# Percentage of rows labelled as fraud, rounded to two decimals.
# Counting matches with .eq(1).sum() instead of value_counts()[1] avoids the
# KeyError raised when the frame contains no fraud rows (the label 1 would be
# absent from the value_counts index); the result is otherwise unchanged.
round(df["isFraud"].eq(1).sum() / df.shape[0] * 100, 2)

In [None]:
# Bar chart of how many transactions fall into each transaction type.
df["type"].value_counts().plot(kind="bar", title="Transaction Types", color="skyblue")
plt.xlabel("Transaction Types")
plt.ylabel("Counts")
# Render explicitly, like the other plotting cells, so the cell shows only the
# figure rather than also echoing the repr of the last expression.
plt.show()

In [None]:
# Mean of the 0/1 isFraud label per transaction type, i.e. the fraud rate,
# ordered from highest to lowest.
fraud_by_type = (
    df.groupby("type")["isFraud"]
    .mean()
    .sort_values(ascending=False)
)
fraud_by_type.plot.bar(title="Fraud Rate by Type", color="salmon")
plt.ylabel("Fraud Rate")
plt.show()

In [None]:
# Summary statistics of the transaction amount, cast to integers so the
# output is not shown in scientific notation / long decimals.
(
    df["amount"]
    .describe()
    .astype(int)
)

In [None]:
# Histogram (with KDE overlay) of log(1 + amount); log1p keeps zero amounts
# finite while compressing the long right tail.
sns.histplot(
    df["amount"].pipe(np.log1p),
    color="green",
    kde=True,
    bins=100,
)
plt.title("Transaction Amount Distribution (log scaled)")
plt.xlabel("Log(Amount + 1)")
plt.show()


In [None]:
# Compare amount distributions of fraud vs. non-fraud rows, restricted to
# transactions strictly below 50,000 so the boxes stay readable.
sns.boxplot(
    x="isFraud",
    y="amount",
    data=df.loc[df["amount"].lt(50000)],
)
plt.title("Amount vs isFraud (Filtered under 50k)")
plt.show()

In [None]:
# Engineered columns (added to `df` in place):
#   balanceDiffOrig = old - new balance of the originating account
#   balanceDiffDest = new - old balance of the destination account
df["balanceDiffOrig"] = df["oldbalanceOrg"].sub(df["newbalanceOrig"])
df["balanceDiffDest"] = df["newbalanceDest"].sub(df["oldbalanceDest"])

In [None]:
# Number of rows where the originating balance went up (old - new < 0).
df["balanceDiffOrig"].lt(0).sum()

In [None]:
# Number of rows where the destination balance went down (new - old < 0).
df["balanceDiffDest"].lt(0).sum()

In [None]:
# Show the first two rows of the frame.
df.head(n=2)

In [None]:
# Count fraud rows per `step` value and order the counts by step so the line
# is drawn left to right.
frauds_per_step = (
    df.loc[df["isFraud"].eq(1), "step"]
    .value_counts()
    .sort_index()
)
# NOTE: a label is attached to the line, but no legend is drawn in this cell.
plt.plot(
    frauds_per_step.index,
    frauds_per_step.values,
    label="Frauds per step",
)
plt.title("Frauds over time")
plt.xlabel("Step (Time)")
plt.ylabel("Number of Frauds")
plt.grid(True)
plt.show()

In [None]:
# Preview the first five rows again.
df.head(n=5)

In [None]:
# Ten most frequent originating account names with their transaction counts.
top_senders = df["nameOrig"].value_counts().iloc[:10]
top_senders

In [None]:
# Ten most frequent destination account names with their transaction counts.
top_receivers = df["nameDest"].value_counts().iloc[:10]
top_receivers

In [None]:
# Ten originating accounts that appear most often among the fraud rows.
fraud_users = (
    df.loc[df["isFraud"].eq(1), "nameOrig"]
    .value_counts()
    .iloc[:10]
)
fraud_users

In [None]:
# Subset of rows whose type is TRANSFER or CASH_OUT. Despite the name, the
# filter is on type only, so this holds fraudulent and legitimate rows alike.
fraud_types = df.loc[df["type"].isin(["TRANSFER", "CASH_OUT"])]
# Row count per type within the subset.
fraud_types["type"].value_counts()

In [None]:
# Row counts per transaction type in the TRANSFER / CASH_OUT subset, split by
# the isFraud label.
sns.countplot(
    x="type",
    hue="isFraud",
    data=fraud_types,
)
plt.title("Fraud Distribution in Transfer & Cash_Out")
plt.show()

In [None]:
# Pairwise correlations between the amount / balance columns and the label.
corr = df.loc[
    :,
    [
        "amount",
        "oldbalanceOrg",  # spelled "Org" (not "Orig") in this dataset
        "newbalanceOrig",
        "oldbalanceDest",
        "newbalanceDest",
        "isFraud",
    ],
].corr()

print(corr)



In [None]:
# Annotated heatmap of the correlation matrix, values shown to two decimals.
sns.heatmap(
    corr,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# TRANSFER / CASH_OUT rows where the originating account held a positive
# balance before the transaction and exactly zero afterwards.
zero_after_transfer = df.loc[
    df["type"].isin(["TRANSFER", "CASH_OUT"])
    & df["oldbalanceOrg"].gt(0)
    & df["newbalanceOrig"].eq(0)
]

In [None]:
# Number of rows whose originating account was emptied.
zero_after_transfer.shape[0]

In [None]:
# Preview the first five emptied-account rows.
zero_after_transfer.head(n=5)

In [None]:
# Re-display the isFraud class counts ahead of the modelling cells.
df["isFraud"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Preview the frame (now including the engineered balanceDiff* columns).
df.head(n=5)

In [None]:
# Modelling frame: a copy of `df` without the two account-name columns and
# without the isFlaggedFraud column.
df_model = df.drop(columns=["nameOrig", "nameDest", "isFlaggedFraud"])

In [None]:
# Preview the first five rows of the modelling frame.
df_model.head(n=5)

In [None]:
# Column groups handed to the preprocessing transformers.
# NOTE: `step` and the engineered balanceDiff* columns are not listed here.
categorical = ["type"]
numeric = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]

In [None]:
# Features are every column of the modelling frame except the label;
# the target is the isFraud label itself.
x = df_model.drop(columns="isFraud")
y = df_model["isFraud"]

In [None]:
# Hold out 30% of the rows for evaluation, stratified on the label so both
# splits keep the same fraud ratio. The fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel
# restarts (without it each run shuffles differently).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Standardise the numeric columns, one-hot encode the categorical ones
# (dropping the first level), and discard every column not listed in either
# group.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first"), categorical),
    ],
    remainder="drop",
)

In [None]:
# Preprocessing followed by a logistic regression; class_weight="balanced"
# reweights the classes inversely to their frequency, and max_iter is raised
# to 1000 iterations.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=x_train, y=y_train)

In [None]:
# Display the predicted labels for the test split (result is not stored).
pipeline.predict(X=x_test)

In [None]:
# Keep the test-split predictions for the evaluation cells.
y_pred = pipeline.predict(X=x_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Pipeline score on the test split, expressed as a percentage.
# NOTE(review): for a classifier this is presumably plain accuracy, which can
# look high on an imbalanced label; read it alongside the per-class report.
100 * pipeline.score(X=x_test, y=y_test)

In [None]:
import joblib

# Persist the pipeline object to a pickle file in the working directory.
joblib.dump(value=pipeline, filename="fraud_detection_pipeline.pkl")

In [None]:
# Display the stored test-split predictions.
y_pred