In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Configure visualization styles

In [None]:
# Global plotting defaults: seaborn's white-grid theme plus a 12x8 default
# figure size for every figure that does not request its own size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully.")

Load the dataset

In [None]:
# Path to the raw transaction log, relative to the notebook's working
# directory. Make sure it points to your raw data file.
RAW_DATA_PATH = '../data/raw/PS_20174392719_1491204439457_log.csv'

try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    print("Error: Dataset file not found. Make sure it's in the 'ml/data/raw/' directory.")
    # Re-raise after the hint: swallowing the error would leave `df` undefined
    # and make every later cell fail with an unrelated NameError.
    raise
else:
    # Only reached when the CSV was read successfully.
    print("Dataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")

Initial Data Overview

In [None]:
# --- Sample rows: a quick look at the first records ---
print("First 5 rows of the dataset:")
head_rows = df.head()
display(head_rows)

# --- Schema: column dtypes and non-null counts ---
# DataFrame.info() writes its report directly to stdout.
print("\nDataset Info:")
df.info()

# --- Descriptive statistics for the numerical columns ---
print("\nSummary Statistics:")
summary_stats = df.describe()
display(summary_stats)


Check for missing values in each column

In [None]:
# Count null entries column by column.
missing_values = df.isna().sum()
print("Missing values per column:")
print(missing_values)

# Summarise across all columns: a non-zero total means some cleaning is needed.
total_missing = missing_values.sum()
if total_missing != 0:
    print("\nWarning: Missing values detected. These will need to be handled.")
else:
    print("\nGreat! No missing values found in the dataset.")

Analyze Class Distribution (Fraud vs. Non-Fraud)

In [None]:
# Absolute and relative frequency of each value in the 'isFraud' column.
fraud_counts = df['isFraud'].value_counts()
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100

print("Fraud vs. Non-Fraud Distribution:")
print(fraud_counts)

# Look the positive class up with a default: if the frame contains no rows with
# isFraud == 1 (e.g. a filtered or sampled subset), label 1 is absent from the
# index and plain `fraud_percentage[1]` would raise KeyError.
fraud_share = fraud_percentage.get(1, 0.0)
print(f"\nPercentage of Fraudulent Transactions: {fraud_share:.4f}%")

# Visualize the distribution on an explicit figure/axes pair so the labels are
# attached to this plot rather than to whichever axes pyplot considers current.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='isFraud', data=df, ax=ax)
ax.set_title('Distribution of Fraudulent vs. Non-Fraudulent Transactions')
ax.set_xlabel('Is Fraud (0: No, 1: Yes)')
ax.set_ylabel('Number of Transactions')
plt.show()


Explore Transaction Types 

In [None]:
# Tally the transactions per type once; the same tally drives both the printed
# table and the bar order in the chart below.
type_counts = df['type'].value_counts()
print("Transaction Types Distribution:")
print(type_counts)

# Bar chart of transaction types, bars ordered as in the tally above.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='type', order=type_counts.index, ax=ax)
ax.set_title('Distribution of Transaction Types')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.show()

# Sum the 'isFraud' column within each transaction type.
# NOTE(review): this equals the number of frauds only if isFraud is a 0/1 flag — confirm.
fraud_by_type = df.groupby('type')['isFraud'].sum()
print("\nNumber of Fraudulent Transactions by Type:")
print(fraud_by_type)

Explore Numerical Features

In [None]:
# Two side-by-side views of the 'amount' column: its overall distribution and
# its spread per fraud class.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: histogram (with KDE overlay) of transaction amounts.
sns.histplot(df['amount'], bins=50, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Transaction Amount')
ax_hist.set_xlabel('Amount')
ax_hist.set_ylabel('Frequency')
ax_hist.set_yscale('log')  # log scale because the distribution is highly skewed

# Right panel: boxplot of amounts split by the fraud label.
sns.boxplot(x='isFraud', y='amount', data=df, ax=ax_box)
ax_box.set_title('Transaction Amount vs. Fraud')
ax_box.set_xlabel('Is Fraud')
ax_box.set_ylabel('Amount')
ax_box.set_yscale('log')

fig.tight_layout()
plt.show()

# Per-class summary statistics of 'amount' to accompany the plots.
print("Observations on 'amount':")
amount_by_class = df[['amount', 'isFraud']].groupby('isFraud')
print(amount_by_class.describe())



 Correlation Analysis

In [None]:
# Correlation is only defined for numbers, so keep just the numeric columns.
numerical_df = df.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns.
corr_matrix = numerical_df.corr()

# Annotated heatmap of the correlation matrix on a diverging colour map.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()
