# Online Payments Fraud Detection using Machine Learning

This notebook implements a complete fraud detection system using multiple ML algorithms.

## 1. Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix
import warnings
import pickle

# Suppress every warning for the rest of the session, then apply the
# 'ggplot' matplotlib style to all following figures.
warnings.filterwarnings(action='ignore')
plt.style.use('ggplot')

## 2. Loading the Dataset

In [None]:
df = pd.read_csv('../data/PS_20174392719_1491204439457_log.csv')
print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

In [None]:
df.columns

## 3. Data Preprocessing

In [None]:
df = df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1)
print("Unnecessary columns dropped")
print(f"New shape: {df.shape}")

In [None]:
df.head()

In [None]:
df.tail()

## 4. Checking for Null Values

In [None]:
df.isnull().sum()

In [None]:
df.info()

## 5. Descriptive Analysis

In [None]:
# Summary statistics with the quartiles spelled out (these are the defaults).
df.describe(percentiles=[0.25, 0.5, 0.75])

## 6. Correlation Analysis

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# 'type' still holds string categories at this point (it is only
# label-encoded later in the notebook), and pandas >= 2.0 raises on
# DataFrame.corr() when non-numeric columns are present. Selecting the
# numeric columns explicitly works on every pandas version and matches what
# older versions did implicitly.
correlation = df.select_dtypes(include='number').corr()
correlation

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
plt.figure(figsize=(12, 8))
sns.heatmap(data=correlation, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

## 7. Univariate Analysis

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['step'], kde=True, bins=30)
plt.title('Distribution of Step')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['step'])
plt.title('Boxplot of Step')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x=df['type'])
plt.title('Count of Transaction Types')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['amount'], kde=True, bins=50)
plt.title('Distribution of Amount')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['amount'])
plt.title('Boxplot of Amount')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['oldbalanceOrg'], kde=True, bins=50)
plt.title('Distribution of Old Balance Origin')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['oldbalanceDest'])
plt.title('Boxplot of Old Balance Destination')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['newbalanceDest'])
plt.title('Boxplot of New Balance Destination')
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.countplot(x=df['isFraud'])
plt.title('Count of Fraud vs Non-Fraud Transactions')
plt.xlabel('Is Fraud (0=No, 1=Yes)')
plt.show()

In [None]:
df['isFraud'].value_counts()

In [None]:
df.loc[df['isFraud'] == 0, 'fraud_label'] = 'Not Fraud'
df.loc[df['isFraud'] == 1, 'fraud_label'] = 'Is Fraud'
df['fraud_label'].value_counts()

## 8. Bivariate Analysis

In [None]:
# Joint distribution of 'newbalanceDest' against the target, coloured by class.
# sns.jointplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(...) call would only leave an extra empty figure in
# the output and its figsize would not apply to the joint plot.
sns.jointplot(data=df, x='newbalanceDest', y='isFraud', hue='isFraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='type', hue='isFraud')
plt.title('Transaction Type vs Fraud')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='isFraud', y='step')
plt.title('Step vs Fraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='isFraud', y='amount')
plt.title('Amount vs Fraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='isFraud', y='oldbalanceOrg')
plt.title('Old Balance Origin vs Fraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='isFraud', y='newbalanceOrig')
plt.title('New Balance Origin vs Fraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='isFraud', y='oldbalanceDest')
plt.title('Old Balance Destination vs Fraud')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='isFraud', y='newbalanceDest')
plt.title('New Balance Destination vs Fraud')
plt.show()

## 9. Handling Outliers

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['amount'])
plt.title('Outliers in Amount')
plt.show()

In [None]:
df['amount_log'] = np.log1p(df['amount'])
plt.figure(figsize=(10, 6))
sns.histplot(df['amount_log'], kde=True)
plt.title('Log Transformation of Amount')
plt.show()

## 10. Label Encoding

In [None]:
# Learn the distinct 'type' categories, then overwrite the column with their
# integer codes. `le` is kept so the category order can be inspected.
le = LabelEncoder().fit(df['type'])
df['type'] = le.transform(df['type'])
print("Label Encoding completed")
print(f"Encoded values: {le.classes_}")
df.head(n=5)

## 11. Train-Test Split

In [None]:
# Feature matrix: everything except the target, its text label and the
# exploratory log-transformed amount column.
X = df.drop(columns=['isFraud', 'fraud_label', 'amount_log'])
y = df['isFraud']

# 80/20 split with a fixed seed. stratify=y keeps the proportion of each
# isFraud class identical in the train and test partitions, so a rare
# positive class cannot end up under-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

## 12. Model Building

### 12.1 Random Forest Classifier

In [None]:
def RandomForest(X_train, y_train, X_test, y_test):
    """Fit a 100-tree random forest and print its test-set metrics.

    Trains on (X_train, y_train), predicts X_test, prints accuracy, F1,
    the confusion matrix and the classification report.

    Returns:
        tuple: (fitted RandomForestClassifier, accuracy on the test set).
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Accuracy is both printed and returned, so evaluate it once.
    accuracy = accuracy_score(y_test, predictions)

    print("Random Forest Classifier Results:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return forest, accuracy

rf_model, rf_acc = RandomForest(X_train, y_train, X_test, y_test)

### 12.2 Decision Tree Classifier

In [None]:
def Decisiontree(X_train, y_train, X_test, y_test):
    """Fit a decision tree and print its test-set metrics.

    Trains on (X_train, y_train), predicts X_test, prints accuracy, F1,
    the confusion matrix and the classification report.

    Returns:
        tuple: (fitted DecisionTreeClassifier, accuracy on the test set).
    """
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    # Accuracy is both printed and returned, so evaluate it once.
    accuracy = accuracy_score(y_test, predictions)

    print("Decision Tree Classifier Results:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return tree, accuracy

dt_model, dt_acc = Decisiontree(X_train, y_train, X_test, y_test)

### 12.3 Extra Trees Classifier

In [None]:
def ExtraTree(X_train, y_train, X_test, y_test):
    """Fit a 100-tree extra-trees ensemble and print its test-set metrics.

    Trains on (X_train, y_train), predicts X_test, prints accuracy, F1,
    the confusion matrix and the classification report.

    Returns:
        tuple: (fitted ExtraTreesClassifier, accuracy on the test set).
    """
    ensemble = ExtraTreesClassifier(n_estimators=100, random_state=42)
    ensemble.fit(X_train, y_train)
    predictions = ensemble.predict(X_test)

    # Accuracy is both printed and returned, so evaluate it once.
    accuracy = accuracy_score(y_test, predictions)

    print("Extra Trees Classifier Results:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return ensemble, accuracy

et_model, et_acc = ExtraTree(X_train, y_train, X_test, y_test)

### 12.4 Support Vector Classifier

In [None]:
def SupportVector(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel support vector classifier and print its metrics.

    Trains on (X_train, y_train), predicts X_test, prints accuracy, F1,
    the confusion matrix and the classification report.

    Returns:
        tuple: (fitted SVC, accuracy on the test set).
    """
    # NOTE(review): the features are passed in unscaled and the whole training
    # split is used; confirm the fit time is acceptable for this dataset.
    classifier = SVC(kernel='linear', random_state=42)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Accuracy is both printed and returned, so evaluate it once.
    accuracy = accuracy_score(y_test, predictions)

    print("Support Vector Classifier Results:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return classifier, accuracy

svc_model, svc_acc = SupportVector(X_train, y_train, X_test, y_test)

### 12.5 XGBoost Classifier

In [None]:
def xgboost_classifier(X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier and print its test-set metrics.

    Trains on (X_train, y_train), predicts X_test, prints accuracy, F1,
    the confusion matrix and the classification report.

    Returns:
        tuple: (fitted xgb.XGBClassifier, accuracy on the test set).
    """
    booster = xgb.XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )
    booster.fit(X_train, y_train)
    predictions = booster.predict(X_test)

    # Accuracy is both printed and returned, so evaluate it once.
    accuracy = accuracy_score(y_test, predictions)

    print("XGBoost Classifier Results:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return booster, accuracy

xgb_model, xgb_acc = xgboost_classifier(X_train, y_train, X_test, y_test)

## 13. Model Comparison

In [None]:
def compareModel():
    """Print and plot the test accuracy of the five trained models.

    Reads the module-level ``rf_acc``, ``dt_acc``, ``et_acc``, ``svc_acc``
    and ``xgb_acc`` values, prints one line per model, draws a bar chart of
    the accuracies, and returns the display name of the most accurate model
    (the first one listed wins a tie).
    """
    models = dict(zip(
        ('Random Forest', 'Decision Tree', 'Extra Trees', 'SVC', 'XGBoost'),
        (rf_acc, dt_acc, et_acc, svc_acc, xgb_acc),
    ))
    rule = "=" * 60

    print("\n" + rule)
    print("MODEL COMPARISON")
    print(rule)

    for label in models:
        score = models[label]
        print(f"{label:20s}: {score:.4f} ({score*100:.2f}%)")

    # max() over the items keeps the first entry when several scores tie.
    best_model, best_score = max(models.items(), key=lambda entry: entry[1])
    print("\n" + rule)
    print(f"Best Model: {best_model} with accuracy {best_score:.4f}")
    print(rule)

    bar_colors = ['blue', 'green', 'orange', 'red', 'purple']
    plt.figure(figsize=(12, 6))
    plt.bar(models.keys(), models.values(), color=bar_colors)
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy Comparison')
    plt.xticks(rotation=45)
    plt.ylim([0, 1])
    # Write each accuracy just above its bar.
    for position, score in enumerate(models.values()):
        plt.text(position, score + 0.01, f'{score:.4f}', ha='center')
    plt.tight_layout()
    plt.show()

    return best_model

best_model_name = compareModel()

## 14. Saving the Best Model

In [None]:
# Persist the model that actually won the comparison. The keys below are the
# display names compareModel() returns, so best_model_name selects the
# matching fitted estimator instead of always saving the SVC.
trained_models = {
    'Random Forest': rf_model,
    'Decision Tree': dt_model,
    'Extra Trees': et_model,
    'SVC': svc_model,
    'XGBoost': xgb_model,
}
if best_model_name not in trained_models:
    raise KeyError(f"No trained model registered under {best_model_name!r}")
best_model = trained_models[best_model_name]

with open('../flask/payments.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print("Model saved successfully as 'payments.pkl'")
print(f"Model selected: {best_model_name}")
print(f"Model type: {type(best_model)}")
print("Model saved at: ../flask/payments.pkl")

## 15. Testing the Saved Model

In [None]:
# Round-trip check: reload the pickle written by the previous cell (a file
# this notebook produced itself) and score the first test row with it.
with open('../flask/payments.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# A one-row DataFrame (not a Series), so predict() receives 2-D input.
sample_input = X_test.iloc[:1]
prediction = loaded_model.predict(sample_input)

predicted_value = prediction[0]
actual_value = y_test.iloc[0]
print("Model loaded and tested successfully!")
print(f"Sample prediction: {predicted_value}")
print(f"Actual value: {actual_value}")

## 16. Feature Importance

In [None]:
# Pair each feature column with the random forest's importance score and
# order the table from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(y=feature_importance['feature'], width=feature_importance['importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance (Random Forest)')
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print(feature_importance)

## Conclusion

This notebook successfully implemented a fraud detection system with the following achievements:

1. Loaded and preprocessed the online payments dataset
2. Performed comprehensive EDA with univariate and bivariate analysis
3. Trained 5 different ML models
4. Compared model performances
5. Saved the best performing model for deployment

The model is now ready to be integrated into the Flask web application for real-time fraud detection.