<a href="https://colab.research.google.com/github/Addychauhan/Data-Analysis/blob/main/Fraud_Transaction_Prediction_using_Machine_Learning1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing required Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score

In [None]:
# Read the transactions CSV into a DataFrame and display it

DATASET_PATH = '/content/Fraud.csv'
data = pd.read_csv(DATASET_PATH)
data

In [None]:
# Shape of the dataset as a (rows, columns) tuple

data.shape

In [None]:
# Number of missing values in each column

data.isna().sum()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the columns
# that describe() includes by default

data.describe()

In [None]:
# Target variable distribution: number of rows for each isFraud value

data['isFraud'].value_counts()

In [None]:
# Class imbalance: share of each isFraud value as a fraction of all rows

data['isFraud'].value_counts(normalize=True)

In [None]:
# Total number of rows in the dataset
len(data)

In [None]:
# Percentage of rows labelled isFraud == 1, rounded to two decimals
fraud_rows = data['isFraud'].value_counts()[1]
round(fraud_rows / len(data) * 100, 2)

# Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many rows fall in each isFraud class

ax = sns.countplot(data=data, x='isFraud')
ax.set_title("Fraud vs Non-Fraud")
plt.show()

In [None]:
# Histogram of the raw transaction amounts (50 bins)

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data['amount'], bins=50, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()

In [None]:
# Histogram of log1p(amount) with a KDE overlay
log_amount = np.log1p(data['amount'])
ax = sns.histplot(log_amount, bins=100, kde=True, color='green')
ax.set_title("Transaction Amount Distribution (Log Scale)")
ax.set_xlabel("Log(Amount)")
plt.show()

In [None]:
# Box plot of amount for each isFraud class, used to inspect outliers

ax = sns.boxplot(data=data, x='isFraud', y='amount')
ax.set_title("Fraud vs Amount")
plt.show()

In [None]:
# Same box plot, restricted to rows whose amount is below 50000
ax = sns.boxplot(data=data.loc[data['amount'].lt(50000)], x='isFraud', y='amount')
ax.set_title("Fraud vs Amount")
plt.show()

In [None]:
# Log Transformation (Outlier Handling)
#
# Box plot of log1p(amount) per isFraud class. The log transform is applied
# only for plotting, to reduce the skew of the amount axis.
# Author's observation: fraudulent transactions show higher median values
# than non-fraud transactions.

log_amount = np.log1p(data['amount'])
ax = sns.boxplot(data=data, x='isFraud', y=log_amount)
ax.set_title("Fraud vs Amount (Log Scale)")
plt.show()

In [None]:
# Number of transactions of each type, split by the isFraud label

ax = sns.countplot(data=data, x='type', hue='isFraud')
plt.xticks(rotation=45)
ax.set_title("Transaction Type vs Fraud")
plt.show()

In [None]:
# Feature Engineering: balance change columns derived from the raw balances

# oldbalanceOrg minus newbalanceOrig
data['balance_diff_orig'] = data['oldbalanceOrg'].sub(data['newbalanceOrig'])
# newbalanceDest minus oldbalanceDest
data['balance_diff_dest'] = data['newbalanceDest'].sub(data['oldbalanceDest'])

In [None]:
# Transaction type counts split by isFraud (untitled variant of the plot)
sns.countplot(hue='isFraud', x='type', data=data)
plt.xticks(rotation=45)
plt.show()

In [None]:

# isFraud class counts among rows where balance_diff_orig is negative
data.loc[data['balance_diff_orig'] < 0, 'isFraud'].value_counts()

In [None]:
# Number of rows with a negative balance_diff_orig
data['balance_diff_orig'].lt(0).sum()

In [None]:
# Number of rows with a negative balance_diff_dest
data['balance_diff_dest'].lt(0).sum()

In [None]:
# Preview the first two rows of the frame
data.head(2)

In [None]:
# Number of rows at each step, computed separately for isFraud == 1 and isFraud == 0
frauds_per_step = data.loc[data['isFraud'].eq(1)].groupby('step').size()
non_frauds_per_step = data.loc[data['isFraud'].eq(0)].groupby('step').size()

In [None]:
# Display the per-step count of fraud rows
frauds_per_step

In [None]:
# Display the per-step count of non-fraud rows
non_frauds_per_step

In [None]:
# Fraud and non-fraud row counts per step, drawn on one shared axis.
# Labels and a title are set so the figure stands on its own, and plt.show()
# ends the cell so the Legend object's repr is not echoed as cell output.
fig, ax = plt.subplots()
frauds_per_step.plot(ax=ax, label='Fraud')
non_frauds_per_step.plot(ax=ax, label='Non-Fraud')
ax.set_xlabel('Step')
ax.set_ylabel('Number of Transactions')
ax.set_title('Transactions per Step: Fraud vs Non-Fraud')
ax.legend()
plt.show()

In [None]:
# Line plot of the per-step fraud counts on their own axis
frauds_per_step.plot(label='Fraud')

In [None]:
# Line plot of the per-step non-fraud counts on their own axis
non_frauds_per_step.plot(label='Non-Fraud')

In [None]:
# Fraud row counts per step (sorted by step), drawn as a labelled line chart
frauds_per_step = data.loc[data['isFraud'] == 1, 'step'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.plot(frauds_per_step.index, frauds_per_step.values, label='Frauds_per_Step')
ax.set_xlabel('Step (Time)')
ax.set_ylabel('Number of Fraudulent Transactions')
ax.set_title('Number of Fraudulent Transactions per Step')
ax.grid(True)
plt.show()

In [None]:
# Drop Irrelevant Columns
#
# nameOrig, nameDest and isFlaggedFraud are removed before modeling.
# The result is assigned to `data` instead of mutating in place, and
# errors='ignore' makes the cell safe to re-run: without it, a second run
# raises KeyError because the columns are already gone.

data = data.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'], errors='ignore')

In [None]:
# Encode Categorical Variable
#
# One-hot encode 'type', dropping the first level. The guard keeps the cell
# re-runnable: after the first run the 'type' column no longer exists, and an
# unguarded get_dummies(columns=['type']) call would raise KeyError.

if 'type' in data.columns:
    data = pd.get_dummies(data, columns=['type'], drop_first=True)

In [None]:
# Correlation Check: heatmap of the pairwise column correlations

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data.corr(), cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Display the frame as it stands at this point in the notebook
data

In [None]:
# Prepare Data for Modeling: isFraud is the target, every other column is a feature

y = data['isFraud']
X = data.drop(columns='isFraud')

In [None]:
# Train-Test Split (70% train / 30% test, fixed seed for reproducibility)

from sklearn.model_selection import train_test_split

# stratify=y keeps the isFraud class proportions the same in the train and
# test sets. With a heavily imbalanced target, an unstratified random split
# can leave the two sets with noticeably different fraud rates.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Model Building

## Logistic Regression

In [None]:
# Building Machine Learning Models: logistic regression configuration

from sklearn.linear_model import LogisticRegression

# class_weight='balanced' reweights classes inversely to their frequency;
# max_iter raises the solver's iteration cap.
lr_params = {'max_iter': 1000, 'class_weight': 'balanced'}
lr = LogisticRegression(**lr_params)

In [None]:
# Train the logistic regression model on the training split

lr.fit(X_train, y_train)

In [None]:
# Model Predictions on the test split:
# y_prob holds the second column of predict_proba, y_pred the hard class labels

y_prob = lr.predict_proba(X_test)[:, 1]
y_pred = lr.predict(X_test)

# Model Evaluation

In [None]:
# Logistic Regression evaluation: confusion matrix drawn as an annotated heatmap

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Accuracy of the current y_pred against y_test

print(accuracy_score(y_test, y_pred))

In [None]:
# Classification report for the current y_pred against y_test

print(classification_report(y_test, y_pred))

In [None]:
# ROC-AUC score computed from the current y_prob against y_test

# roc_auc_score is also imported in the notebook's first import cell
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_prob)

In [None]:
# Feature Importance: logistic regression coefficients per feature, ascending.
# NOTE(review): no feature scaling step is visible in this notebook, so the
# coefficient magnitudes depend on each feature's units — confirm before
# reading them as relative importance.

importance = pd.Series(lr.coef_[0], index=X.columns)
ax = importance.sort_values().plot(kind='barh', figsize=(8, 6))
ax.set_title("Feature Importance (Logistic Regression)")
plt.show()

## Random Forest Classifier

In [None]:
# Random Forest Classifier configuration

from sklearn.ensemble import RandomForestClassifier

rf_params = dict(
    n_estimators=50,            # number of trees
    max_depth=10,               # depth cap per tree
    max_samples=0.2,            # fraction of rows drawn for each tree
    random_state=42,            # fixed seed
    class_weight='balanced',    # reweight classes inversely to frequency
    n_jobs=-1,                  # use all available cores
)
rf = RandomForestClassifier(**rf_params)

In [None]:
# Train the random forest model on the training split

rf.fit(X_train, y_train)

In [None]:
# Random Forest Evaluation: predictions on the test split.
# These assignments replace the logistic regression values previously held
# in y_pred and y_prob.

y_prob = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

In [None]:
# Accuracy of the current y_pred against y_test

print(accuracy_score(y_test, y_pred))

In [None]:
# Classification report for the current y_pred against y_test
print("Random Forest Report:")
print(classification_report(y_test, y_pred))

In [None]:
# ROC-AUC score computed from the current y_prob against y_test
roc_auc_score(y_test, y_prob)

In [None]:
# Confusion Matrix (Random Forest), drawn as an annotated heatmap

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# Feature Importance: random forest importances indexed by feature, largest first

rf_scores = pd.Series(rf.feature_importances_, index=X.columns)
importance = rf_scores.sort_values(ascending=False)


In [None]:
# Horizontal bar chart of the first ten entries of `importance`
top_features = importance.head(10)
ax = top_features.plot(kind='barh', figsize=(8, 6))
ax.set_title("Top 10 Important Features")
plt.show()

In [None]:
# Final Model Comparison
#
# Each model's probabilities are recomputed here under their own names.
# The shared y_prob variable was overwritten by the Random Forest cell, so
# reusing it for both lines would print the Random Forest AUC twice.

lr_prob = lr.predict_proba(X_test)[:, 1]
rf_prob = rf.predict_proba(X_test)[:, 1]

print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, lr_prob))
print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_prob))
