# Exploratory Data Analysis - Credit Card Fraud Data

This notebook performs EDA on the creditcard.csv dataset to understand:
- Data structure and quality
- PCA-transformed feature distributions (V1-V28)
- Class imbalance analysis
- Time and amount patterns

**Author**: Adey Innovations Inc. Data Science Team  
**Date**: December 2025


## 1. Setup and Data Loading


In [None]:
# Notebook setup: imports, display/plot configuration, and project-local helpers.
# Imports are grouped standard library -> third-party -> local so the notebook's
# dependencies are visible in one place.
import sys
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence warnings to keep the rendered output readable.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')

# Set display options: show every column, print floats with 4 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Add parent directory to path for imports. The membership guard keeps this
# cell idempotent: re-running it no longer appends duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')
from src.data_loader import load_creditcard_data, get_class_distribution
from src.visualization import plot_class_distribution

print("Libraries imported successfully!")


In [None]:
# Read the raw credit card CSV through the project loader and report its size
raw_csv_path = '../data/raw/creditcard.csv'
cc_df = load_creditcard_data(raw_csv_path)
print("Dataset loaded successfully!")
print(f"Shape: {cc_df.shape}")


## 2. Data Inspection


In [None]:
# Preview the top of the frame; the trailing bare expression is what the
# notebook renders as a table.
print("First 5 rows of the dataset:")
preview = cc_df.head(n=5)
preview


In [None]:
# Column names, dtypes and non-null counts as reported by DataFrame.info()
print("\nDataset Info:", "=" * 50, sep="\n")
cc_df.info()


In [None]:
# Descriptive statistics table; shown via the cell's trailing expression
print("\nStatistical Summary:")
summary_stats = cc_df.describe()
summary_stats


In [None]:
# Data-quality check: total number of missing cells across the whole frame,
# and number of rows that duplicate an earlier row.
missing_total = cc_df.isnull().sum().sum()
duplicate_total = cc_df.duplicated().sum()

print("Missing Values:")
print(missing_total)

print(f"\nDuplicate rows: {duplicate_total}")


## 3. Class Distribution Analysis


In [None]:
# Summarise the target balance using the project helper, then report the
# per-class counts/percentages and the imbalance ratio it returns.
class_dist = get_class_distribution(cc_df, target_col='Class')
counts = class_dist['counts']
percentages = class_dist['percentages']
ratio = class_dist['imbalance_ratio']

print("Class Distribution:")
print("=" * 50)
print(f"Legitimate transactions (0): {counts[0]:,} ({percentages[0]:.4f}%)")
print(f"Fraudulent transactions (1): {counts[1]:,} ({percentages[1]:.4f}%)")
print(f"\nImbalance Ratio: {ratio:.1f}:1")
print(f"\n⚠️ EXTREME IMBALANCE: Only {percentages[1]:.3f}% are fraud cases!")


In [None]:
# Draw the class balance with the project's plotting helper
target = cc_df['Class']
fig = plot_class_distribution(
    target,
    title='Credit Card Fraud - Class Distribution',
)
plt.show()


## 4. Feature Analysis

### 4.1 Time and Amount Features


In [None]:
# Time and Amount overview on a 2x2 grid: Time on the top row, Amount on the
# bottom row; overall distribution on the left, split by class on the right.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Time distribution
axes[0, 0].hist(cc_df['Time'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_xlabel('Time (seconds)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Time')

# Time by class. density=True normalises each class separately so the small
# fraud class is comparable with the much larger legitimate class.
for label, color in [(0, '#2ecc71'), (1, '#e74c3c')]:
    subset = cc_df.loc[cc_df['Class'] == label, 'Time']
    axes[0, 1].hist(subset, bins=50, alpha=0.6, label=f'Class {label}', color=color, density=True)
axes[0, 1].set_xlabel('Time (seconds)')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Time Distribution by Class')
axes[0, 1].legend()

# Amount distribution (log-scaled counts so the sparse high-amount bins stay visible)
axes[1, 0].hist(cc_df['Amount'], bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Amount ($)')
axes[1, 0].set_ylabel('Frequency (log scale)')
axes[1, 0].set_title('Distribution of Amount')
axes[1, 0].set_yscale('log')

# Amount by class (log scale). The scale was previously described but never
# applied, leaving the boxes squashed by the outliers. 'symlog' is used rather
# than 'log' because a plain log axis cannot represent zero amounts.
cc_df.boxplot(column='Amount', by='Class', ax=axes[1, 1])
axes[1, 1].set_yscale('symlog')
axes[1, 1].set_xlabel('Class')
axes[1, 1].set_ylabel('Amount ($, symlog scale)')
axes[1, 1].set_title('Amount by Class')
# Clear the figure-level title that pandas adds for grouped boxplots
fig.suptitle('')

plt.tight_layout()
plt.show()

# Statistics: trailing expression so the table gets the notebook's rich display
print("Amount Statistics by Class:")
cc_df.groupby('Class')['Amount'].describe()


### 4.2 PCA Features (V1-V28)


In [None]:
# Rank the V1..V28 columns by how far apart the two class means are
# (absolute difference of per-class means, largest first).
v_cols = ['V' + str(i) for i in range(1, 29)]

# Per-class means, selected with boolean masks on the target column
is_fraud = cc_df['Class'] == 1
is_legit = cc_df['Class'] == 0
fraud_mean = cc_df.loc[is_fraud, v_cols].mean()
legit_mean = cc_df.loc[is_legit, v_cols].mean()

abs_gap = (fraud_mean - legit_mean).abs()
mean_diff = abs_gap.sort_values(ascending=False)

print("Top 10 V-features with highest class separation:")
print(mean_diff.head(10))


In [None]:
# Overlayed per-class density histograms for the six features with the largest
# mean gap, one feature per panel of a 2x3 grid.
top_features = list(mean_diff.head(6).index)
class_colors = [(0, '#2ecc71'), (1, '#e74c3c')]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for panel, feature in zip(axes, top_features):
    for label, color in class_colors:
        values = cc_df.loc[cc_df['Class'] == label, feature]
        panel.hist(values, bins=50, alpha=0.6, label=f'Class {label}', color=color, density=True)
    panel.set_xlabel(feature)
    panel.set_ylabel('Density')
    panel.set_title(f'Distribution of {feature}')
    panel.legend()

plt.tight_layout()
plt.show()


## 5. Correlation Analysis


In [None]:
# Correlation with target. Only the feature-vs-Class correlations are needed,
# so compute them directly with corrwith() instead of building the full
# feature-by-feature matrix and discarding all but one column.
# rename('Class') keeps the printed Series name the same as before.
corr_with_target = (
    cc_df.drop(columns='Class')
    .corrwith(cc_df['Class'])
    .rename('Class')
    .sort_values(key=abs, ascending=False)
)

print("Top 15 Features Correlated with Fraud (Class):")
print("="*50)
print(corr_with_target.head(15))

# Visualize: red bars are negative correlations, green bars are positive
fig, ax = plt.subplots(figsize=(10, 8))
top_corr = corr_with_target.head(15)
colors = ['#e74c3c' if x < 0 else '#2ecc71' for x in top_corr.values]
ax.barh(top_corr.index, top_corr.values, color=colors)
# barh draws the first entry at the bottom; flip the axis so the strongest
# correlation is at the top, matching the printed ranking.
ax.invert_yaxis()
ax.set_xlabel('Correlation with Class')
ax.set_title('Top 15 Features Correlated with Fraud')
ax.axvline(0, color='black', linewidth=0.5)
plt.tight_layout()
plt.show()


## 6. Key Findings Summary


In [None]:
# Final summary. The data-dependent facts (top features, missing/duplicate
# counts, median amounts) are derived from the frame and from the mean_diff
# ranking computed in section 4.2 instead of being hard-coded, so the text
# cannot drift away from what the analysis actually found.
top_discriminating = ', '.join(mean_diff.head(5).index)
top_for_modeling = ', '.join(mean_diff.head(3).index)
n_missing = int(cc_df.isnull().sum().sum())
n_duplicates = int(cc_df.duplicated().sum())
median_amount = cc_df.groupby('Class')['Amount'].median()
fraud_median = median_amount.loc[1]
legit_median = median_amount.loc[0]

print("="*70)
print("EDA KEY FINDINGS - CREDIT CARD FRAUD DATA")
print("="*70)

print(f"""
1. EXTREME CLASS IMBALANCE:
   - Fraud rate: {cc_df['Class'].mean()*100:.4f}%
   - Imbalance ratio: ~{class_dist['imbalance_ratio']:.0f}:1
   - This requires aggressive resampling techniques (SMOTE)

2. PCA FEATURES:
   - V1-V28 are PCA-transformed (original features hidden for privacy)
   - Several V-features show strong discrimination between classes
   - Top discriminating features: {top_discriminating}

3. AMOUNT ANALYSIS:
   - Median amount: ${fraud_median:,.2f} (fraud) vs ${legit_median:,.2f} (legitimate)
   - High variability in both classes
   - Amount may need normalization

4. TIME PATTERNS:
   - Time represents seconds since first transaction
   - Both classes distributed across time
   - No strong time-based pattern visible

5. DATA QUALITY:
   - Missing values: {n_missing:,}
   - Duplicate rows: {n_duplicates:,} (may be legitimate repeat transactions)
   - Clean dataset ready for modeling

MODELING RECOMMENDATIONS:
   - Use SMOTE or class weights to handle imbalance
   - Focus on precision-recall metrics, not accuracy
   - {top_for_modeling} are likely important features
   - Consider normalizing Amount feature
""")

print("="*70)
