<a href="https://colab.research.google.com/github/AjeethSuresh/PROJECT-1/blob/main/fraud_detection_in_financial_transaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the transaction table from the notebook session's /content directory.
data = pd.read_csv(filepath_or_buffer='/content/mydataset.csv')

In [None]:
# Show the (rows, columns) tuple of the frame as the cell output.
data.shape

(832020, 11)

In [None]:
## data description
# Plain-text preview of the first and last five rows of the table.
for preview in (data.head(), data.tail()):
    print(preview)

In [None]:
## data description
# NOTE(review): this cell repeats the head/tail preview of the cell directly
# above it; one of the two can probably be dropped.
print(data.head(), data.tail(), sep='\n')

In [None]:
#data reshaping
# Separates predictors from the label, then keeps every fraud row together
# with a fixed-size random sample of the legitimate rows.
LEGIT_SAMPLE_SIZE = 8213  # number of legitimate rows kept in the reduced set
SAMPLE_SEED = 42          # fixed so a Restart & Run All draws the same sample

print(data.shape)
print(data['isFraud'].value_counts())

features = data.drop(columns='isFraud')
target = data['isFraud']
print(features.shape)
print(target.shape)

# Re-attach the label as the last column of the feature table.
final_data = pd.concat([features, target], axis=1)
print(final_data.shape)

legit = data[data.isFraud == 0]
fraud = data[data.isFraud == 1]

# Seeded draw for reproducibility; the size is capped at the number of
# available rows because sampling more rows than exist (without replacement)
# raises a ValueError.
legit = legit.sample(
    n=min(LEGIT_SAMPLE_SIZE, len(legit)),
    random_state=SAMPLE_SEED,
).copy()
print(legit.shape)
print(fraud.shape)


In [None]:
#new data description
# NOTE(review): `data` is only rebound to the reduced (legit sample + fraud)
# frame in the "data merging" cell that follows, so at this point the summary
# still describes the frame as loaded - confirm the intended cell order.
print(data.head())
print(data.tail())
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()
print(data.describe())
print(data['isFraud'].value_counts())
print(data.sample(10))

In [None]:
#data merging
# Stack the sampled legitimate rows on top of the fraud rows and rebind
# `data` to the combined frame.
class_frames = [legit, fraud]
data = pd.concat(class_frames, axis=0)
print(data.shape)

(8685, 11)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
from dash import dcc, html, Input, Output
from dash.dependencies import Input, Output
import dash

In [None]:
# List the column index, then render the first five rows as the cell output.
column_index = data.columns
print(column_index)
data.head(n=5)

In [None]:
# Histogram of the `step` column with matplotlib's default binning.
# Note: plt.style.use() changes the global style, so it also affects figures
# drawn by later cells.
plt.style.use('_mpl-gallery')
fig, ax = plt.subplots(figsize=(4, 3))
ax.hist(data['step'], edgecolor="white")
# A histogram puts the measured variable on x and the bin counts on y.
ax.set_xlabel('step')
ax.set_ylabel('frequency')
ax.set_title("Histogram of numerical Column")
plt.show()

In [None]:
# 20-bin histogram of the `step` column.
# NOTE(review): the labels call `step` a transaction duration in seconds;
# nothing in this notebook establishes that unit - confirm against the
# dataset description.
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(data['step'], bins=20, color='orange', edgecolor='black')
ax.set(
    title='Distribution of Transaction Durations',
    xlabel='Transaction Duration (Seconds)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Bar chart of how many rows fall into each transaction type.
fig, ax = plt.subplots(figsize=(4, 3))
type_counts = data['type'].value_counts()
type_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Transaction Count by Transaction Type')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Bars positioned at `newbalanceOrig` with heights taken from `oldbalanceOrg`.
# NOTE(review): both columns look like continuous balances, and bars of
# width 5 may be barely visible at that scale - a scatter plot may be the
# better fit; confirm the intent.
plt.style.use('_mpl-gallery')
fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(data['newbalanceOrig'], data['oldbalanceOrg'], width=5, edgecolor='red')
# Label the axes with the columns that are actually plotted.
ax.set_xlabel('newbalanceOrig')
ax.set_ylabel('oldbalanceOrg')
ax.set_title("Barchart of oldbalanceOrg by newbalanceOrig")
plt.show()

In [None]:
# Box plot of `amount`, one box per `isFraud` value.
# The axes are created up front and handed to pandas: without `ax=`, a
# grouped boxplot opens its own figure and a separately created plt.figure()
# would be shown as an empty extra figure.
fig, ax = plt.subplots(figsize=(4, 3))
data.boxplot(column='amount', by='isFraud', ax=ax)
ax.set_title('Box Plot of Transaction Amounts by Fraud Status')
ax.set_xlabel('Fraud Status')
ax.set_ylabel('Transaction Amount')
# Blank out the "Boxplot grouped by ..." super-title pandas adds for by=.
fig.suptitle('')
plt.show()

In [None]:
# Box plot of `amount` per `step` with the individual observations overlaid
# as a strip plot on the same axes.
f, ax = plt.subplots(figsize=(4,3))
# whis=[0, 100] asks for whiskers at the 0th and 100th percentiles.
# NOTE(review): `palette` is passed without `hue`; newer seaborn releases
# warn about that combination - confirm against the installed version.
sns.boxplot(
    data,x='step',y='amount',
    whis=[0, 100], width=.4, palette="vlag"
)
# NOTE(review): hue='amount' is given together with a fixed color=".3", and
# `amount` is also the y variable - confirm which coloring is intended.
sns.stripplot(data,x='step',y='amount',hue='amount', size=4, color=".3")
ax.xaxis.grid(True)
# NOTE(review): the x axis carries `step` here; confirm a log scale is wanted.
ax.set_xscale('log')
# Removes the y-axis label.
ax.set(ylabel='')
plt.title('Box for pair values')
sns.despine(trim=True,left=True)

In [None]:
# Pairwise scatter/histogram grid for the amount and the origin balances.
columns_for_pairplot = ['amount', 'oldbalanceOrg', 'newbalanceOrig']
pairplot_frame = data[columns_for_pairplot]
sns.pairplot(pairplot_frame)
# y=1.02 lifts the title slightly above the top row of panels.
plt.suptitle(t='Pair Plot of Transaction Data', y=1.02)
plt.show()

In [None]:
# Pairwise scatter/histogram grid for the amount and the destination balances.
columns_for_pairplot = ['amount', 'oldbalanceDest', 'newbalanceDest']
pairplot_frame = data.loc[:, columns_for_pairplot]

# Draw the grid and place the title just above it.
sns.pairplot(pairplot_frame)
plt.suptitle(t='Pair Plot of Transaction Data', y=1.02)
plt.show()

In [None]:
# Interactive scatter of the origin vs destination balance before the
# transaction, colored by `amount`.
# (assumes `amount` is numeric, so plotly renders a colorbar rather than a
# discrete legend - TODO confirm)
fig = px.scatter(
    data,
    x='oldbalanceOrg',
    y='oldbalanceDest',
    color='amount',
    title='Interactive Scatter Plot of Transaction Data',
    # Only columns that are actually plotted need a display label.
    labels={'amount': 'Amount'},
)

# Update layout for better readability
fig.update_layout(
    xaxis_title='Old Balance Origin',
    yaxis_title='Old balance Destination',
    # Color encodes the amount, so title the colorbar accordingly instead of
    # labelling a legend 'Fraud'.
    coloraxis_colorbar_title_text='Amount',
    showlegend=True
)

# Show the plot
fig.show()

In [None]:
# Interactive scatter of the origin vs destination balance after the
# transaction, colored by `amount`.
# (assumes `amount` is numeric, so plotly renders a colorbar rather than a
# discrete legend - TODO confirm)
fig = px.scatter(
    data,
    x='newbalanceOrig',
    y='newbalanceDest',
    color='amount',
    title='Interactive Scatter Plot of Transaction Data',
    # Only columns that are actually plotted need a display label.
    labels={'amount': 'Amount'},
)

# Update layout for better readability
fig.update_layout(
    xaxis_title='New Balance Origin',
    yaxis_title='New balance Destination',
    # Color encodes the amount, so title the colorbar accordingly instead of
    # labelling a legend 'Fraud'.
    coloraxis_colorbar_title_text='Amount',
    showlegend=True
)

# Show the plot
fig.show()

In [None]:
# Dash application with two initially empty graphs; both figures are filled
# in by callbacks registered on `app`.
app = dash.Dash(__name__)

scatter_panel = html.Div([
    dcc.Graph(id='scatter-plot', figure={}),
])
bar_panel = html.Div([
    dcc.Graph(id='bar-chart', figure={}),
])

# Define the layout of the dashboard: heading, scatter plot, bar chart.
app.layout = html.Div([
    html.H1("Interactive Dashboard for Transaction Data"),
    scatter_panel,
    bar_panel,
])

# Callback to update scatter plot based on user input
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('bar-chart', 'hoverData')])
def update_scatter_plot(hoverData):
    """Redraw the balance scatter, narrowed by the bar currently hovered.

    Parameters
    ----------
    hoverData : dict or None
        Hover payload of the 'bar-chart' graph; None until a bar is hovered.

    Returns
    -------
    A plotly figure of oldbalanceOrg vs newbalanceOrig, colored by isFraud
    and sized by amount. All rows are shown when nothing is hovered.
    """
    # The bar chart in this dashboard puts fraud-status labels on its x axis,
    # so those are translated to `isFraud` flags; comparing them against the
    # `type` column would never match a row.
    fraud_flag_by_label = {'Non-Fraudulent': 0, 'Fraudulent': 1}

    filtered_df = data
    points = (hoverData or {}).get('points') or []
    if points:
        hovered = points[0].get('x')
        if hovered in fraud_flag_by_label:
            filtered_df = data[data['isFraud'] == fraud_flag_by_label[hovered]]
        else:
            # Kept for a bar chart whose x axis carries transaction types.
            filtered_df = data[data['type'] == hovered]

    fig = px.scatter(filtered_df, x='oldbalanceOrg', y='newbalanceOrig', color='isFraud',
                     size='amount', hover_data=['nameOrig', 'nameDest'],
                     title='Interactive Scatter Plot of Transaction Data',
                     labels={'oldbalanceOrg': 'Old Balance', 'newbalanceOrig': 'New Balance', 'amount': 'Amount'})
    fig.update_layout(xaxis_title='Old Balance', yaxis_title='New Balance', showlegend=True)
    return fig

# Callback to update bar chart based on user input
@app.callback(
    Output('bar-chart', 'figure'),
    [Input('scatter-plot', 'hoverData')])
def update_bar_chart(hoverData):
    """Show fraud vs non-fraud counts for the hovered transaction's type.

    Parameters
    ----------
    hoverData : dict or None
        Hover payload of the 'scatter-plot' graph; None until a point is
        hovered.

    Returns
    -------
    A plotly bar figure with one bar per fraud status, or an empty dict when
    there is nothing to show (no hover, or the type cannot be resolved).
    """
    points = (hoverData or {}).get('points') or []
    if not points:
        return {}
    point = points[0]

    # 'hovertext' only exists when the scatter is built with hover_name; the
    # scatter in this dashboard is not, so indexing it directly raises KeyError.
    merchant_category = point.get('hovertext')
    if merchant_category is None:
        # The scatter lists hover_data=['nameOrig', 'nameDest']; plotly express
        # is expected to deliver those through `customdata` in that order
        # (TODO confirm), so the first entry is used to look the row up.
        customdata = point.get('customdata') or []
        if not customdata:
            return {}
        matching_types = data.loc[data['nameOrig'] == customdata[0], 'type']
        if matching_types.empty:
            return {}
        # NOTE(review): if one nameOrig has rows of several types, the first
        # match wins.
        merchant_category = matching_types.iloc[0]

    # value_counts() orders by frequency and omits absent classes; reindexing
    # pins the order to [non-fraud, fraud] and always yields two values, so
    # the counts line up with the fixed x labels below.
    fraud_counts = (
        data.loc[data['type'] == merchant_category, 'isFraud']
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )

    fig = px.bar(x=['Non-Fraudulent', 'Fraudulent'], y=fraud_counts.values,
                 labels={'x': 'Fraud Status', 'y': 'Count'},
                 color=['Non-Fraudulent', 'Fraudulent'], title='Fraudulent vs. Non-Fraudulent Transactions',
                 color_discrete_map={'Non-Fraudulent': 'lightgreen', 'Fraudulent': 'salmon'})
    return fig

# Run the app
if __name__ == '__main__':
    # Newer Dash releases start the server with `app.run` and no longer accept
    # `run_server`; installs that predate `run` only have `run_server`.
    # Prefer `run` and fall back when it is missing.
    launch = getattr(app, 'run', None) or app.run_server
    launch(debug=True)

In [None]:

# Second dashboard: a multi-select dropdown of transaction types driving a
# single scatter plot.
# NOTE(review): this rebinds `app`, replacing the dashboard object created in
# the previous cell.
app = dash.Dash(__name__)
app.layout = html.Div([
    html.H1("Interactive Dashboard for Financial Transactions"),
    html.Div([
        dcc.Dropdown(
            id='type-dropdown',
            options=[{'label': i, 'value': i} for i in data['type'].unique()],
            # A multi-select dropdown carries a list of values; a bare string
            # default would reach the callback as a str, not a list.
            value=['PAYMENT'],
            multi=True
        ),
    ]),
    dcc.Graph(id='scatter-plot'),
])

@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('type-dropdown', 'value')]
)
def update_scatter_plot(selected_types):
    """Redraw the balance scatter for the transaction types picked in the dropdown.

    Parameters
    ----------
    selected_types : list of str, str, or None
        Current value of the 'type-dropdown' component.

    Returns
    -------
    A plotly figure of oldbalanceOrg vs newbalanceOrig, colored by isFraud,
    restricted to rows whose `type` is among the selection.
    """
    # NOTE(review): this reuses the function name of the scatter callback
    # defined for the previous dashboard cell and shadows it in the kernel.

    # Series.isin() only accepts list-likes: normalise a single string (the
    # dropdown's scalar default) and a cleared selection (None).
    if selected_types is None:
        selected_types = []
    elif isinstance(selected_types, str):
        selected_types = [selected_types]

    filtered_data = data[data['type'].isin(selected_types)]
    fig = px.scatter(filtered_data, x='oldbalanceOrg', y='newbalanceOrig', color='isFraud',
                     hover_data=['type', 'amount'], title='Transaction Details',
                     labels={'oldbalanceOrg': 'Old Balance', 'newbalanceOrig': 'New Balance'})

    fig.update_layout(xaxis_title='Old Balance', yaxis_title='New Balance', showlegend=True)
    return fig

# Run the app
if __name__ == '__main__':
    # Newer Dash releases start the server with `app.run` and no longer accept
    # `run_server`; installs that predate `run` only have `run_server`.
    # Prefer `run` and fall back when it is missing.
    launch = getattr(app, 'run', None) or app.run_server
    launch(debug=True)

In [None]:

from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# One-hot encode the transaction type (first level dropped) and append the
# indicator columns to the original frame.
type_new = data['type'].pipe(pd.get_dummies, drop_first=True)
data_new = pd.concat(objs=[data, type_new], axis=1)

In [None]:
# Feature matrix: everything except the label, the raw type string and the
# two account-name columns. Label vector: isFraud.
non_feature_columns = ['isFraud', 'type', 'nameOrig', 'nameDest']
X = data_new.drop(columns=non_feature_columns)
y = data_new['isFraud']
X.shape, y.shape


In [None]:
# Hold out 30% of the rows for validation. Stratifying on the label keeps the
# share of fraud rows the same in both splits, which matters because fraud is
# the minority class here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score as ras
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:


# Candidate classifiers; all of them expose predict_proba (SVC only because
# probability=True is set).
models = [
    LogisticRegression(),
    XGBClassifier(),
    SVC(kernel='rbf', probability=True),
    RandomForestClassifier(n_estimators=7,
                           criterion='entropy',
                           random_state=7),
]

# Fit each model and report ROC-AUC on the training and validation splits.
# `ras` is roc_auc_score, so the values are labelled as ROC-AUC rather than
# accuracy.
for model in models:
    model.fit(X_train, y_train)
    print(f'{model} : ')

    # Column 1 of predict_proba is the score for the positive class.
    train_preds = model.predict_proba(X_train)[:, 1]
    print('Training ROC-AUC : ', ras(y_train, train_preds))

    y_preds = model.predict_proba(X_test)[:, 1]
    print('Validation ROC-AUC : ', ras(y_test, y_preds))
    print()

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score as ras, accuracy_score, precision_recall_curve, auc, average_precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics.pairwise import pairwise_distances

# Convert y_test to numpy array if it's not already
# (positional indexing with the sorted indices below needs an array).
y_test = np.array(y_test)

# Validation scores of every model evaluated so far, one entry per model.
model_val_scores = []

# Fit the models and print a block of metrics plus a confusion matrix for each.
for model in models:
    model.fit(X_train, y_train)
    print(f'{model} : ')

    # Training predictions (column 1 = positive-class score)
    train_preds = model.predict_proba(X_train)[:, 1]
    print('Training ROC-AUC : ', ras(y_train, train_preds))
    print('Training Accuracy : ', accuracy_score(y_train, model.predict(X_train)))

    # Validation predictions
    y_preds = model.predict_proba(X_test)[:, 1]
    print('Validation ROC-AUC : ', ras(y_test, y_preds))
    print('Validation Accuracy : ', accuracy_score(y_test, model.predict(X_test)))

    # Ranking metrics
    precision, recall, _ = precision_recall_curve(y_test, y_preds)
    pr_auc = auc(recall, precision)
    avg_precision = average_precision_score(y_test, y_preds)
    print('Validation Precision-Recall AUC: ', pr_auc)
    print('Validation Average Precision Score: ', avg_precision)

    # Stack predictions for diversity calculation: one column per model,
    # including the current one, so the metric covers the model just scored.
    model_val_scores.append(y_preds)
    preds_stack = np.column_stack(model_val_scores)

    # Diversity metrics: mean pairwise correlation distance (1 - Pearson r)
    # between the models' validation scores; larger means the models disagree
    # more. The distances are taken between models (rows of the transpose),
    # not between test samples, and only over distinct pairs.
    n_scored = preds_stack.shape[1]
    if n_scored > 1:
        distances = pairwise_distances(preds_stack.T, metric='correlation')
        diversity = distances[np.triu_indices(n_scored, k=1)].mean()
        print('Diversity (Correlation-based): ', diversity)

    # Mean 0-based position of the true fraud rows when the test rows are
    # ordered by descending score (lower is better).
    sorted_indices = np.argsort(y_preds)[::-1]
    sorted_true_labels = y_test[sorted_indices]
    avg_rank_true_positives = np.mean(np.where(sorted_true_labels == 1)[0])
    print('Average Rank of True Positives: ', avg_rank_true_positives)

    # Confusion matrix: from_estimator already draws the plot, so calling
    # .plot() on the returned display again would render it a second time.
    disp = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
    plt.show()

    print()
