In [None]:
# notebooks/02_fraud_data_eda.ipynb

import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook-wide defaults: seaborn's "whitegrid" look for every figure, and no
# cap on how many DataFrame columns pandas prints.
sns.set(style="whitegrid")
pd.options.display.max_columns = None

# =====================
# 1. Load merged dataset
# =====================
# The CSV location can be overridden with the FRAUD_DATA_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original hard-coded location is used.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    "D:/PYTHON PROJECTS/Dual-Channel-Fraud-Detection-System-for-E-Commerce-Banking/Data/merged_data.csv",
)
df = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) followed by the first five rows.
print("✅ Data loaded:", df.shape)
print(df.head())

# =====================
# 2. Basic info
# =====================
print("\nInfo:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Number of null entries in each column.
print("\nMissing values:")
print(df.isnull().sum())

# normalize=True reports each class as a fraction of all rows, not a raw count.
print("\nFraud distribution:")
print(df['class'].value_counts(normalize=True))

# Bar chart of the raw row count for each value of 'class'.
plt.figure(figsize=(6,4))
sns.countplot(x='class', data=df, palette='viridis')
plt.title("Fraud (1) vs Non-Fraud (0)")
plt.show()

# =====================
# 3. Fraud by transaction_country
# =====================
# Long-form tally: one row per (transaction_country, class) pair with the
# number of rows in that pair stored in 'count'.
transaction_country_counts = (
    df.groupby(['transaction_country', 'class'])
    .size()
    .reset_index(name='count')
)

# Grouped bars, one group per country, coloured by class.
plt.figure(figsize=(14, 6))
sns.barplot(
    data=transaction_country_counts,
    x='transaction_country',
    y='count',
    hue='class',
)
plt.xticks(rotation=90)  # vertical tick labels
plt.title("Fraud vs Non-Fraud by transaction_country")
plt.show()

# =====================
# 4. Fraud by purchase hour
# =====================
# Parse the timestamp column and derive the hour of day from it.
df['purchase_time'] = pd.to_datetime(df['purchase_time'])
df['purchase_hour'] = df['purchase_time'].dt.hour

plt.figure(figsize=(12,5))
# purchase_hour holds integer hours, so use discrete=True: it gives unit-width
# bins centred on each hour. A fixed bins=24 instead splits [min, max] into 24
# slightly-narrower bins, which shifts bars off the integer ticks and depends
# on which hours happen to be present in the data.
sns.histplot(df, x='purchase_hour', hue='class', multiple='stack', discrete=True)
plt.title("Fraud by Purchase Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Transaction Count")
plt.show()

# =====================
# 5. Fraud by signup-purchase gap
# =====================
# Parse signup timestamps, then store the signup -> purchase delay in hours.
# 'purchase_time' must already be datetime-typed when this runs.
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['time_diff'] = (
    (df['purchase_time'] - df['signup_time'])
    .dt.total_seconds()
    .div(3600)
)

# Per-class box plot of the delay; the y-axis is limited to 0-48 hours
# (the first two days).
plt.figure(figsize=(10, 5))
sns.boxplot(data=df, x='class', y='time_diff')
plt.ylim(0, 48)
plt.title("Time Gap (Signup → Purchase) by Class")
plt.show()

# =====================
# 6. Fraud by purchase amount
# =====================
# Per-class box plot of 'purchase_value'; the y-axis is limited to 0-200 so
# the view is zoomed into that range.
plt.figure(figsize=(10, 5))
sns.boxplot(data=df, x='class', y='purchase_value')
plt.ylim(0, 200)
plt.title("Purchase Amount Distribution by Fraud Status")
plt.show()


: 