# Task 1: Exploratory Data Analysis - Fraud Data
This notebook explores the e-commerce fraud dataset, focusing on univariate and bivariate analysis, geolocation insights, and class imbalance.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
# Make the project root importable so the `src` package resolves. '..' is
# resolved against the current working directory, so this relies on the kernel
# being started from the notebook's own folder.
sys.path.append(os.path.abspath('..'))
from src.data_preprocessing import load_data, clean_data, feature_engineer_fraud
# NOTE(review): the saved output of this cell is
#   ImportError: cannot import name 'save_stats' from 'src.utils'
# so the notebook currently stops here on a fresh kernel. Neither `save_stats`
# nor `save_plot` is called in the cells visible in this notebook — confirm
# whether src/utils.py should export them or whether this import is stale.
from src.utils import save_stats, save_plot

ImportError: cannot import name 'save_stats' from 'src.utils' (/home/karanos/kiam/week5-6/prod/notebooks/../src/utils.py)

In [None]:
# Raw CSV inputs, given relative to the notebook's working directory.
fraud_path, ip_path, credit_path = (
    '../data/raw/Fraud_Data.csv',
    '../data/raw/IpAddress_to_Country.csv',
    '../data/raw/creditcard.csv',
)

# The project loader's result is unpacked into three objects; only the fraud
# frame's shape is reported in this cell.
fraud_df, ip_df, credit_df = load_data(
    fraud_path,
    ip_path,
    credit_path,
)
print(f"Initial data shape: {fraud_df.shape}")

## 1. Data Cleaning
We handle missing values and duplicates.

In [None]:
# Rebind `fraud_df` to the output of the project's cleaning helper, then report
# the resulting shape for comparison with the "Initial data shape" line above it
# in the run.
fraud_df = fraud_df.pipe(clean_data)
print(f"Cleaned data shape: {fraud_df.shape}")

## 2. Univariate Analysis
Analyzing the distribution of key variables.

In [None]:
# Class balance: bar chart of row counts per `class` value, followed by the
# share of each class as a fraction of all rows.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue` — confirm the installed version before relying on this call.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='class', data=fraud_df, palette='viridis', ax=ax)
ax.set_title('Fraud Class Distribution')
plt.show()

class_shares = fraud_df['class'].value_counts(normalize=True)
print(class_shares)

In [None]:
# Distribution of the `age` column: 30-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
age_values = fraud_df['age']
sns.histplot(age_values, bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

## 3. Geolocation Analysis
Mapping IP addresses to countries and analyzing fraud rates.

In [None]:
# Minimum number of transactions a country needs before its fraud rate is
# ranked. A mean over a handful of rows is mostly noise: a country with a single
# fraudulent transaction would otherwise top the list at a 100% "rate".
# NOTE(review): 50 is a judgement call — tune it to the dataset's size.
MIN_COUNTRY_TRANSACTIONS = 50

# Enrich the frame with the project helper. The `country` column used below is
# expected to be produced by this step (IP-to-country mapping) — confirm against
# src.data_preprocessing.feature_engineer_fraud.
fraud_df = feature_engineer_fraud(fraud_df, ip_df)

# Per-country view of the 0/1 `class` label: its mean is the fraud rate and its
# count is the number of transactions that rate is based on.
class_by_country = fraud_df.groupby('country')['class']
country_txn_counts = class_by_country.count()
well_sampled = country_txn_counts[country_txn_counts >= MIN_COUNTRY_TRANSACTIONS].index

# Rank only adequately sampled countries; the result keeps the same form as
# before (Series named 'class', indexed by country, at most 10 rows).
country_fraud = (
    class_by_country.mean()
    .loc[well_sampled]
    .sort_values(ascending=False)
    .head(10)
)
print("Top 10 Fraudulent Countries (by rate):")
print(f"(restricted to countries with at least {MIN_COUNTRY_TRANSACTIONS} transactions)")
print(country_fraud)

In [None]:
# Bar chart of the ranked per-country fraud rates held in `country_fraud`.
fig, ax = plt.subplots(figsize=(12, 6))
country_fraud.plot.bar(ax=ax)
ax.set_title('Top 10 Countries by Fraud Rate')
ax.set_ylabel('Fraud Rate')
plt.show()

## 4. Bivariate Analysis
Relationship between features and fraud.

In [None]:
# Compare the spread of `purchase_value` between the two `class` groups.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=fraud_df, x='class', y='purchase_value', ax=ax)
ax.set_title('Purchase Value by Class')
plt.show()

In [None]:
# Compare the spread of `time_since_signup` between the two `class` groups.
# The column is not created in any cell shown here; presumably it is added by
# the feature-engineering step run earlier — verify before running standalone.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=fraud_df, x='class', y='time_since_signup', ax=ax)
ax.set_title('Time Since Signup by Class')
plt.show()