# EDA: Fraud_Data

Objectives:
- Load Fraud_Data and the IP-to-country mapping table, then clean Fraud_Data.
- Inspect class imbalance and key feature distributions.
- Enrich with geolocation and basic time/device features.
- Save a processed dataset for downstream modeling.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Notebook-wide setup: warning policy, plotting defaults, and data locations.

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — confirm
# that is intended for an exploratory notebook.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
# Use seaborn's 'whitegrid' theme for all subsequent plots.
sns.set_style('whitegrid')

# Both paths are relative, so they resolve against the kernel's current
# working directory rather than the notebook file's location.
RAW_PATH = 'data/raw/'
PROCESSED_PATH = 'data/processed/'
# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(PROCESSED_PATH, exist_ok=True)

## 1. Load data

In [2]:
# Load the raw transaction table and the IP-range-to-country lookup table.
# Paths are built with os.path.join so they stay valid whether or not
# RAW_PATH ends with a path separator.
fraud_path = os.path.join(RAW_PATH, 'Fraud_Data.csv')
ip_country_path = os.path.join(RAW_PATH, 'IpAddress_to_Country.csv')

# Parse both timestamp columns at load time so they arrive as datetimes.
fraud_df = pd.read_csv(fraud_path, parse_dates=['signup_time', 'purchase_time'])
ip_country = pd.read_csv(ip_country_path)

print('Fraud_Data shape:', fraud_df.shape)
print('IpAddress_to_Country shape:', ip_country.shape)
# Last expression of the cell: displays the first rows in the notebook output.
fraud_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/Fraud_Data.csv'

## 2. Clean data
- Report missing values and drop duplicate rows
- Fix dtypes (cast `ip_address` and `class` to integers)

In [None]:
# Basic cleaning pass: report nulls, drop exact duplicate rows, fix dtypes.

# Per-column null counts, shown before any rows are removed.
print('Missing values in Fraud_Data:')
print(fraud_df.isnull().sum())

# Number of rows that exactly repeat an earlier row, then remove them.
print(f'Duplicates: {fraud_df.duplicated().sum()}')
fraud_df = fraud_df.drop_duplicates()

# Cast both columns to integer dtypes in one call.
fraud_df = fraud_df.astype({'ip_address': 'int64', 'class': int})

print(f'After cleaning shape: {fraud_df.shape}')

## 3. Class imbalance

In [None]:
# Bar chart of the class balance, as a percentage of all rows.
counts = fraud_df['class'].value_counts(normalize=True) * 100
# value_counts orders its result by frequency, not by class label, so pin the
# order to class 0 then class 1 before pairing with the hard-coded bar labels.
# A class that is absent from the data gets 0% instead of causing a length
# mismatch in plt.bar.
# NOTE(review): assumes 'class' is encoded 0 = legitimate, 1 = fraud — confirm.
counts = counts.reindex([0, 1], fill_value=0.0)

plt.figure(figsize=(6, 5))
plt.bar(['Legitimate', 'Fraud'], counts.values, color=['skyblue', 'salmon'])
plt.title('Fraud_Data Class Distribution')
plt.ylabel('Percentage')
# Print each bar's percentage just above the top of the bar.
for i, v in enumerate(counts.values):
    plt.text(i, v + 0.5, f'{v:.2f}%', ha='center')
plt.show()

print(f