# Data Exploration - UPI Fraud Detection

This notebook explores the transaction dataset and visualizes fraud patterns.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's white grid theme and a wide canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Dataset

In [None]:
# Read the raw transaction CSV into the notebook-wide frame `df`
df = pd.read_csv('../data/raw/upi_fraud_data.csv')

# Quick orientation: dimensions first, then the column names
n_shape, col_names = df.shape, df.columns.tolist()
print(f"Dataset shape: {n_shape}")
print(f"\nColumns: {col_names}")

# Leave the first rows as the cell's rendered output
df.head()

## Basic Statistics

In [None]:
# Schema overview (dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Information:")
df.info()

# Number of missing values in each column
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics; left as the last expression so the notebook renders
# the resulting table as the cell output
print("\nBasic Statistics:")
df.describe()

## Fraud Distribution

In [None]:
# Class balance of the target column (isFraud is indexed as 0 / 1 below).
# value_counts() orders by frequency and omits classes that never occur, so
# the index is pinned to [0, 1]. That keeps counts, colours and tick labels
# aligned with (Legitimate, Fraud) regardless of which class is larger, and
# avoids a KeyError when one class is absent from the data.
fraud_counts = df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
fraud_pct = fraud_counts / fraud_counts.sum() * 100

print("Fraud Distribution:")
print(f"Legitimate: {fraud_counts.loc[0]:,} ({fraud_pct.loc[0]:.2f}%)")
print(f"Fraud: {fraud_counts.loc[1]:,} ({fraud_pct.loc[1]:.2f}%)")

# Side-by-side view: absolute counts (left) and class shares (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot: bars follow the pinned [0, 1] order, matching the labels below
fraud_counts.plot(kind='bar', ax=ax1, color=['green', 'red'])
ax1.set_title('Transaction Count by Type')
ax1.set_xlabel('Transaction Type (0=Legitimate, 1=Fraud)')
ax1.set_ylabel('Count')
ax1.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)

# Pie chart: wedges use the same pinned order
ax2.pie(fraud_counts, labels=['Legitimate', 'Fraud'], autopct='%1.2f%%', colors=['green', 'red'])
ax2.set_title('Fraud Distribution')

plt.tight_layout()
plt.show()

## Transaction Types

In [None]:
# How many transactions of each type are in the data
print("Transaction Types:")
print(df['type'].value_counts())

# Counts of legitimate (0) vs fraudulent (1) transactions per type.
# The columns are pinned to [0, 1] so the rate computation below cannot raise
# a KeyError when one of the classes does not occur in the data.
fraud_by_type = pd.crosstab(df['type'], df['isFraud']).reindex(columns=[0, 1], fill_value=0)
fraud_by_type['fraud_rate'] = fraud_by_type[1] / (fraud_by_type[0] + fraud_by_type[1]) * 100

print("\nFraud Rate by Transaction Type:")
print(fraud_by_type)

# Bar chart of the fraud rate per type, drawn on an explicit Axes so the
# titles and labels are attached to this figure rather than pyplot's
# implicit "current" one
fig, ax = plt.subplots(figsize=(10, 5))
fraud_by_type['fraud_rate'].plot(kind='bar', color='red', ax=ax)
ax.set_title('Fraud Rate by Transaction Type')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Fraud Rate (%)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Amount Analysis

In [None]:
# Headline statistics for the transaction amount
print("Amount Statistics:")
print(f"Mean: {df['amount'].mean():.2f}")
print(f"Median: {df['amount'].median():.2f}")
print(f"Max: {df['amount'].max():.2f}")

# Same statistics split by fraud label
print("\nAmount by Fraud Status:")
print(df.groupby('isFraud')['amount'].describe())

# Two panels: box plot per class (left), overlaid histograms (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Box plot. DataFrame.boxplot(by=...) also puts an automatic
# "Boxplot grouped by isFraud" suptitle on the whole figure; clear it so it
# does not sit on top of the two panel titles.
df.boxplot(column='amount', by='isFraud', ax=ax1)
fig.suptitle('')
ax1.set_title('Amount Distribution by Fraud Status')
ax1.set_xlabel('Fraud Status (0=Legitimate, 1=Fraud)')
ax1.set_ylabel('Amount')

# Histogram. Both classes share one set of bin edges computed from the full
# amount column; with bins=50 passed separately each series would get its own
# edges and the overlaid bars would not be comparable. NaNs are dropped for
# the edge computation because Series.hist ignores them as well.
amount_bins = np.histogram_bin_edges(df['amount'].dropna(), bins=50)
df.loc[df['isFraud'] == 0, 'amount'].hist(bins=amount_bins, alpha=0.5, label='Legitimate', ax=ax2, color='green')
df.loc[df['isFraud'] == 1, 'amount'].hist(bins=amount_bins, alpha=0.5, label='Fraud', ax=ax2, color='red')
# Log-scaled counts keep the much smaller fraud class visible next to the
# legitimate one
ax2.set_yscale('log')
ax2.set_title('Amount Distribution')
ax2.set_xlabel('Amount')
ax2.set_ylabel('Frequency (log scale)')
ax2.legend()

plt.tight_layout()
plt.show()

## Balance Analysis

In [None]:
# Net balance movement for the sending and receiving account of each
# transaction (new balance minus old balance). The derived columns are stored
# on df itself, so any later cell that selects df's numeric columns will
# include them. Re-running this cell recomputes the same values.
df['balance_change_orig'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['balance_change_dest'] = df['newbalanceDest'] - df['oldbalanceDest']

print("Balance Change Statistics:")
print(df.groupby('isFraud')[['balance_change_orig', 'balance_change_dest']].describe())

# One box plot per side of the transaction, split by fraud label
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Sender balance change
df.boxplot(column='balance_change_orig', by='isFraud', ax=ax1)
ax1.set_title('Sender Balance Change')
ax1.set_xlabel('Fraud Status')
ax1.set_ylabel('Balance Change')

# Receiver balance change
df.boxplot(column='balance_change_dest', by='isFraud', ax=ax2)
ax2.set_title('Receiver Balance Change')
ax2.set_xlabel('Fraud Status')
ax2.set_ylabel('Balance Change')

# Each boxplot(by=...) call sets an automatic "Boxplot grouped by isFraud"
# suptitle on the figure; clear it after the last call so it does not overlap
# the two panel titles.
fig.suptitle('')

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Restrict the frame to its numeric columns and correlate them pairwise
# (DataFrame.corr() with its default method)
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap on an explicit Axes, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

# Rank every numeric column by its correlation with the fraud label,
# strongest positive first
fraud_corr = corr['isFraud'].sort_values(ascending=False)
print("\nCorrelation with Fraud:")
print(fraud_corr)

## Key Insights

1. **Extreme Imbalance**: Fraud cases make up less than 1% of all transactions
2. **Transaction Types**: Fraud rates vary by transaction type, with some types showing higher rates than others
3. **Amount Patterns**: Fraudulent and legitimate transactions have different amount distributions
4. **Balance Changes**: Unusual sender and receiver balance changes can indicate fraud

These insights guide our feature engineering and model selection.