In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
import pandas as pd

# Load the dataset.
# The path is relative, so it is resolved against the kernel's current working directory.
dataset_path = '../../data/creditcard.csv'

# Fail fast with an actionable message before handing the path to pandas.
# os.path.isfile (rather than os.path.exists) also rejects a directory sitting
# at this path, which would otherwise pass the guard and only fail inside
# read_csv with a less helpful error.
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(
        f"Dataset not found at {dataset_path}. Please run download_data.py first."
    )

df = pd.read_csv(dataset_path)
print(f"Successfully loaded dataset with {len(df)} rows")

In [None]:
# Overview of the dataset: structural summary followed by a sample of rows.
print("Dataset Info:")
df.info()  # info() prints its summary itself, so it is not wrapped in print()
print("\nFirst 5 Rows:")
print(df.head())

# Checking for missing values: count nulls per column and report only the
# columns that actually contain some.
print("\nMissing Values:")
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
if columns_with_missing.empty:
    # An empty Series would print as "Series([], dtype: int64)", which is easy
    # to misread, so state the result explicitly instead.
    print("No missing values found.")
else:
    print(columns_with_missing)

# Summary statistics of numerical features
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# Bar chart of the number of rows per value of the 'Class' column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Class', data=df, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class (0: Not Fraud, 1: Fraud)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the DataFrame's columns.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(15, 10))
# annot=False: cell values are not written onto the heatmap.
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()

In [None]:
# Histogram (50 bins) of the 'Amount' column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Amount'], bins=50, kde=True, ax=ax)
ax.set_title('Transaction Amount Distribution')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()

# Box plot of 'Amount' grouped by 'Class', used to spot outlying amounts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
ax.set_title('Boxplot of Transaction Amount by Class')
ax.set_xlabel('Class (0: Not Fraud, 1: Fraud)')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Scatter of 'Amount' against 'Time' to look for potential patterns.
fig, ax = plt.subplots(figsize=(10, 6))
# alpha=0.5 makes markers semi-transparent so overlapping points stay visible.
ax.scatter(df['Time'], df['Amount'], alpha=0.5)
ax.set_title('Transaction Time vs Amount')
ax.set_xlabel('Time (seconds since first transaction)')
ax.set_ylabel('Transaction Amount')
plt.show()