In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score as ras
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, f1_score
from sklearn.utils import resample

In [None]:
# Load the transaction data from a CSV file in the working directory
# and preview the first rows.
df = pd.read_csv('onlinefraud.csv')
df.head()


In [None]:


# Column names, dtypes and non-null counts of the loaded frame.
df.info()



In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Drop the last column by position (presumably a column that is not needed
# for modelling -- TODO confirm which column this is and drop it by name).
# NOTE(review): `df` is rebound to itself, so every re-run of this cell
# removes one more column; run it exactly once after loading the data.
df = df.iloc[:,:-1]



In [None]:
# Group column names by storage dtype and report how many fall in each group.
column_dtypes = df.dtypes

object_cols = [name for name, kind in column_dtypes.items() if kind == 'object']
int_cols = [name for name, kind in column_dtypes.items() if kind == 'int64']
float_cols = [name for name, kind in column_dtypes.items() if kind == 'float']

print("Categorical variables:", len(object_cols))
print("Integer variables:", len(int_cols))
print("Float variables:", len(float_cols))




In [None]:
# Number of transactions per transaction type.
# seaborn receives the frame through `data=`; `df=` is not a countplot
# parameter, so the column name 'type' could not be resolved.
sns.countplot(x='type', data=df)

In [None]:


# Number of rows per value of the target column `isFraud`.
df['isFraud'].value_counts()



In [None]:


# Aggregated `amount` for each transaction type (seaborn's default
# estimator is used because none is passed).
sns.barplot(data=df, x='type', y='amount')



In [None]:
# Distribution of the `step` column in 50 bins.
# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty canvas behind and
# does not size the plot. The 15x6 size is requested through height/aspect
# instead (width = height * aspect = 6 * 2.5).
sns.displot(df['step'], bins=50, height=6, aspect=2.5)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_data = df.select_dtypes(include=['number'])
corr_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='BrBG', linewidths=2, ax=ax)

In [None]:
# One-hot encode the `type` column (first level dropped) and append the
# indicator columns to the right of the original frame.
type_new = pd.get_dummies(df['type'], drop_first=True)
data_new = pd.concat([df, type_new], axis='columns')
data_new.head()

In [None]:


# Target is the `isFraud` column; the features are everything else except
# the raw `type` column and the two name columns.
y = data_new['isFraud']
X = data_new.drop(columns=['isFraud', 'type', 'nameOrig', 'nameDest'])



In [None]:
# Sanity check: feature matrix and target should have the same number of rows.
X.shape, y.shape

In [None]:


# Hold out 30% of the rows for evaluation with a fixed seed.
# `train_test_split` is already imported in the notebook's import cell, so it
# is not re-imported here. stratify=y keeps the isFraud class ratio the same
# in the train and test partitions, which matters when one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Candidate classifiers. The list order matters: later cells pick a model by
# position (models[1] is the XGBClassifier).
models = [LogisticRegression(), XGBClassifier(),
          RandomForestClassifier(n_estimators=7,
                                 criterion='entropy',
                                 random_state=7)]

# Fit each model and report ROC AUC on the train and test partitions.
# The score is computed from the predicted probability of the positive class
# (column 1 of predict_proba), so it is labelled as ROC AUC rather than
# accuracy, which is a different metric.
for model in models:
    model.fit(X_train, y_train)
    print(f'{model} : ')

    train_preds = model.predict_proba(X_train)[:, 1]
    print('Training ROC AUC : ', ras(y_train, train_preds))

    y_preds = model.predict_proba(X_test)[:, 1]
    print('Validation ROC AUC : ', ras(y_test, y_preds))
    print()

In [None]:
# Hard class predictions of the second model in `models` on the held-out
# rows, summarised as a confusion matrix.
y_pred = models[1].predict(X_test)
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Recall of the hard predictions `y_pred` against the test labels.
recall = recall_score(y_test, y_pred)
recall

In [None]:


# F1 score of the hard predictions `y_pred` against the test labels.
f1 = f1_score(y_test, y_pred)
f1



In [None]:
# ROC AUC ranks samples by a continuous score, so it must be fed the
# predicted probability of the positive class. Passing the hard 0/1 labels
# in `y_pred` collapses the ROC curve to a single operating point and
# misreports the metric.
y_proba = models[1].predict_proba(X_test)[:, 1]
roc_auc = ras(y_test, y_proba)
roc_auc

In [None]:
# Build a class-balanced evaluation set by undersampling the non-fraud rows.
# The rows are taken from the held-out test partition only: sampling from the
# full X/y would mix in rows the models were trained on and inflate the
# scores. It also keeps this cell re-runnable after X and y are rebound.
con_data = pd.concat([X_test, y_test], axis=1)
non_fraud = con_data[con_data['isFraud'] == 0]
fraud = con_data[con_data['isFraud'] == 1]

# Keep as many non-fraud rows as the size of the smaller class.
num_samples = min(len(non_fraud), len(fraud))
non_fraud_undersampled = resample(non_fraud, replace=False,
                                  n_samples=num_samples, random_state=42)

balanced_df = pd.concat([non_fraud_undersampled, fraud])

# Shuffle so the two classes are interleaved.
balanced_df = balanced_df.sample(frac=1, random_state=42)

In [None]:
# Class counts after undersampling.
balanced_df['isFraud'].value_counts()

In [None]:
# Features and target of the balanced set.
# NOTE(review): this rebinds the notebook-wide names X and y, so any earlier
# cell that reads X/y will see the balanced data if re-run after this point.
y = balanced_df['isFraud']
X = balanced_df.drop(columns='isFraud')

In [None]:


# Hard class predictions of the second model in `models` on the current
# X/y, summarised as a confusion matrix.
y_pred = models[1].predict(X)
cm = confusion_matrix(y, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()



In [None]:


# Recall and F1 are threshold metrics and use the hard predictions in y_pred.
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
# ROC AUC needs a continuous score: use the positive-class probability
# instead of the 0/1 labels, which would reduce the curve to one point.
roc_auc = ras(y, models[1].predict_proba(X)[:, 1])
print(f'Recall: {recall}\nf1 score: {f1}\nRoc_auc: {roc_auc}')

