In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
import warnings
# Silence every warning category for the rest of the session and switch
# seaborn to its "whitegrid" look for all following plots.
warnings.simplefilter("ignore")
sns.set(style="whitegrid")

In [3]:
# Load the transactions table; the path is relative to the kernel's working directory.
DATA_PATH = "AIML Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Peek at the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Print the column dtypes and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [1]:
# List the column labels. Requires `df` from the load cell; the recorded
# NameError appears to come from running this cell first on a fresh kernel.
df.columns

NameError: name 'df' is not defined

In [None]:
# Class balance of the target: number of rows per isFraud value.
# Columns are selected with square brackets; calling the frame (df(...)) raises TypeError.
df["isFraud"].value_counts()

In [None]:
# Number of rows per isFlaggedFraud value.
# Columns are selected with square brackets; calling the frame (df(...)) raises TypeError.
df["isFlaggedFraud"].value_counts()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# (rows, columns) of the frame. `shape` is an attribute holding a tuple, not a
# method, so it must not be called.
df.shape

In [None]:
# Percentage of transactions labelled as fraud.
# The target column is "isFraud" (not "is_fraud") and is selected with [].
# Averaging the boolean "equals 1" mask gives the fraud share and, unlike
# value_counts()[1], stays defined when no fraud rows are present.
df["isFraud"].eq(1).mean() * 100

In [None]:
# Bar chart of how many transactions fall under each transaction type.
# The column is selected with []; df("type") would raise TypeError.
type_counts = df["type"].value_counts()
ax = type_counts.plot(kind="bar", title="Transaction types", color="skyblue")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Mean of the isFraud flag within each transaction type (the per-type fraud
# rate), ordered from highest to lowest.
fraud_by_type = (
    df.groupby("type")["isFraud"]
    .mean()
    .sort_values(ascending=False)
)
ax = fraud_by_type.plot(kind="bar", title="Fraud Rate by Transaction Type", color="salmon")
ax.set_ylabel("Fraud Rate")
plt.show()

In [None]:
# Summary statistics of the transaction amount, truncated to integers for display.
amount_summary = df["amount"].describe()
amount_summary.astype(int)

In [None]:
# Histogram (with KDE) of log(1 + amount).
log_amount = np.log1p(df["amount"])
ax = sns.histplot(log_amount, bins=50, kde=True, color="purple")
ax.set_title("transaction amount distribution {log scale}")
ax.set_xlabel("Log(Amount + 1)")
plt.show()

In [None]:
# Box plot of amount for fraud vs. non-fraud rows, restricted to amounts under 50k.
# `50,000` is parsed as two values, turning the key into the tuple (mask, 0),
# which is not a valid row filter; 50_000 is the intended integer literal.
under_50k = df[df["amount"] < 50_000]
sns.boxplot(data=under_50k, x="isFraud", y="amount")
plt.title("amount vs isFraud (filtered under 50k)")
plt.show()

In [None]:
# Derived features: amount that left the origin account and amount that
# arrived at the destination account.
# Column names must match the file exactly: "oldbalanceOrg" and "newbalanceOrig"
# (the earlier spellings "oldbalancceOrg" / "newbalanceOrg" raise KeyError).
df["balanceDiffOrig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["balanceDiffDest"] = df["newbalanceDest"] - df["oldbalanceDest"]

In [None]:
# Number of rows where the origin balance difference is negative.
df["balanceDiffOrig"].lt(0).sum()

In [None]:
# Number of rows where the destination balance difference is negative.
df["balanceDiffDest"].lt(0).sum()

In [None]:
# Show the first two rows to confirm the new balance-difference columns exist.
df.head(n=2)

In [None]:
# Number of fraudulent transactions at each time step, ordered by step.
fraud_rows = df[df["isFraud"] == 1]
frauds_per_step = fraud_rows["step"].value_counts().sort_index()

fig, ax = plt.subplots()
ax.plot(frauds_per_step.index, frauds_per_step.values, label="Frauds per step")
ax.set_xlabel("Step (Time)")
ax.set_ylabel("Number of frauds")
ax.set_title("Frauds over time")
ax.legend()  # the line carries a label, so actually render it
ax.grid(True)
plt.show()

In [None]:
# Remove the time-step column for the rest of the notebook.
# Rebinding instead of inplace=True, together with errors="ignore", lets this
# cell be re-run without raising KeyError once "step" is already gone.
df = df.drop(columns="step", errors="ignore")

In [None]:
# Confirm the "step" column is no longer present.
df.head(n=5)

In [None]:
# The five origin accounts that appear in the most transactions.
sender_counts = df["nameOrig"].value_counts()
top_senders = sender_counts.head(5)

In [None]:
# Display the most frequent origin accounts and their transaction counts.
top_senders

In [None]:
# The five destination accounts that appear in the most transactions.
receiver_counts = df["nameDest"].value_counts()
top_recievers = receiver_counts.head(5)

In [None]:
# Display the most frequent destination accounts and their transaction counts.
top_recievers

In [None]:
# The ten origin accounts with the most transactions labelled as fraud.
fraud_senders = df.loc[df["isFraud"] == 1, "nameOrig"]
fraud_users = fraud_senders.value_counts().head(10)

In [None]:
# Display the origin accounts with the most fraud-labelled transactions.
fraud_users

In [None]:
# Subset of the data limited to TRANSFER and CASH_OUT transactions.
is_transfer_or_cash_out = df["type"].isin(["TRANSFER", "CASH_OUT"])
fraud_types = df[is_transfer_or_cash_out]

In [None]:
# Row counts for each of the two transaction types kept in fraud_types.
fraud_types["type"].value_counts()

In [None]:
# Fraud vs. non-fraud counts within TRANSFER and CASH_OUT transactions.
# The title is corrected from the misspelled "Froud of Destribution ...".
ax = sns.countplot(data=fraud_types, x="type", hue="isFraud")
ax.set_title("Fraud Distribution in TRANSFER and CASH_OUT")
plt.show()

In [None]:
# Pairwise correlation between the amount, the four balance columns and the target.
# All column names belong inside ONE list; the earlier `df[["amount"], "..."]`
# form built a tuple key and failed before .corr() was reached.
corr_columns = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
    "isFraud",
]
corr = df[corr_columns].corr()

In [None]:
# Display the correlation matrix as a table.
corr

In [None]:
# Annotated heatmap of the correlation matrix (values shown with two decimals).
# The title is corrected from the misspelled "Corrletion matrix".
ax = sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# TRANSFER / CASH_OUT rows whose origin account held a positive balance
# before the transaction and exactly zero afterwards.
account_emptied = (df["oldbalanceOrg"] > 0) & (df["newbalanceOrig"] == 0)
is_transfer_or_cash_out = df["type"].isin(["TRANSFER", "CASH_OUT"])
zero_after_transfer = df[account_emptied & is_transfer_or_cash_out]

In [None]:
# Number of rows where the origin account was emptied.
zero_after_transfer.shape[0]

In [None]:
# Display the emptied-account rows (the notebook renders the frame's repr).
zero_after_transfer

In [None]:
# Re-check the class balance of the target before modelling.
df["isFraud"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Look at the current columns before selecting model inputs.
df.head(n=5)

In [None]:
# Modelling frame: drop the two account-identifier columns and the
# isFlaggedFraud column, leaving `df` itself untouched.
excluded_columns = ["nameOrig", "nameDest", "isFlaggedFraud"]
df_model = df.drop(columns=excluded_columns)

In [None]:
# Check which columns remain in the modelling frame.
df_model.head(n=5)

In [None]:
# Feature groups passed to the ColumnTransformer: `catagorical` columns are
# one-hot encoded, `numeric` columns are standard-scaled.
catagorical = ["type"]
numeric = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]

In [None]:
# Separate the target (Y) from the feature columns (X) of the modelling frame.
# The frame is named `df_model`; the earlier `df.model` was an attribute lookup
# on `df` and raised AttributeError.
Y = df_model["isFraud"]
X = df_model.drop(columns="isFraud")


In [None]:
# Stratified 70/30 train/test split so both parts keep the same fraud share.
# random_state pins the shuffle so the reported metrics are reproducible
# across kernel restarts. The output names are unchanged because later cells
# refer to `Y__train` and `y_test`.
X_train, X_test, Y__train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)

In [None]:
# Column-wise preprocessing: scale the numeric features, one-hot encode the
# categorical ones (dropping the first level), and discard every other column.
numeric_step = ("num", StandardScaler(), numeric)
categorical_step = ("cat", OneHotEncoder(drop="first"), catagorical)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

In [None]:
Pipeline = Pipeline(
    [
        ("prep", preprocessor),
        ("clf", LogisticRegression(class_weight="balanced", max_iter= 1000))
    ]
)

In [None]:
Pipeline.fit(X_train, Y__train)

In [None]:
y_pred = Pipeline.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
Pipeline.score(X_test, y_test)

In [None]:
import joblib 
joblib.dump(Pipeline, "fraud_detection_pipline.pkl")