# 📊 IEEE-CIS Fraud Detection — EDA
**Kaggle Notebook** · Dataset: `muhakabartay/yourallmodelsdata`

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from datetime import datetime
import os, gc, warnings
warnings.filterwarnings('ignore')

# Matplotlib style — prefer the 'seaborn-v0_8-*' name and fall back to the
# older 'seaborn-whitegrid' name when the former is not registered.
_preferred_style = 'seaborn-v0_8-whitegrid'
if _preferred_style in plt.style.available:
    plt.style.use(_preferred_style)
else:
    plt.style.use('seaborn-whitegrid')

# Default colour cycle and figure geometry for every plot below.
sns.set_palette('husl')
plt.rc('figure', figsize=[14, 6], dpi=110)

# ── AUTO-DETECT DATA PATH ─────────────────────────
# Known mount points for the dataset, tried in order; the first one that
# exists as a directory wins.
CANDIDATES = [
    '/kaggle/input/yourallmodelsdata',
    '/kaggle/input/datasets/muhakabartay/yourallmodelsdata',
]
_KAGGLE_ROOT = '/kaggle/input'

INPUT = next((p for p in CANDIDATES if os.path.isdir(p)), None)

if INPUT is None and os.path.isdir(_KAGGLE_ROOT):
    # Last resort: scan /kaggle/input for a directory containing the CSV.
    # The isdir guard lets a missing /kaggle/input fall through to the
    # explanatory error below instead of failing inside os.listdir.
    # Sorting makes the pick deterministic when several directories match.
    for d in sorted(os.listdir(_KAGGLE_ROOT)):
        full = f'{_KAGGLE_ROOT}/{d}'
        if os.path.isdir(full) and os.path.exists(f'{full}/train_transaction.csv'):
            INPUT = full
            break

if INPUT is None:
    # Raised explicitly (not via assert) so the check survives `python -O`.
    raise FileNotFoundError(
        'Could not find train_transaction.csv. '
        'Make sure the IEEE-CIS dataset is added via Add Data.'
    )

# Directory the figures produced below are written to.
OUT = '/kaggle/working'

# Show what is inside the resolved data directory: one line per entry,
# with the size in MB for regular files and a '[dir]' tag for anything else.
print(f'Data path: {INPUT}')
print('Available files:')
for name in sorted(os.listdir(INPUT)):
    path = f'{INPUT}/{name}'
    if not os.path.isfile(path):
        print(f'  {name:40s}      [dir]')
        continue
    size_mb = os.path.getsize(path) / 1e6
    print(f'  {name:40s} {size_mb:>8.1f} MB')

---
## 1 · Load Data

In [None]:
%%time
# Read the transaction and identity tables, then left-join identity onto
# transactions by TransactionID so every transaction row is kept.
txn_frame = pd.read_csv(f'{INPUT}/train_transaction.csv')
id_frame = pd.read_csv(f'{INPUT}/train_identity.csv')
df = pd.merge(txn_frame, id_frame, on='TransactionID', how='left')

# The two source frames are no longer needed once merged; release them.
del txn_frame
del id_frame
gc.collect()

total_bytes = df.memory_usage(deep=True).sum()
print(f'Shape       : {df.shape}')
print(f'Fraud rate  : {df["isFraud"].mean():.4%}')
print(f'Memory      : {total_bytes/1e9:.2f} GB')

In [None]:
# Preview the leading rows of the merged frame to eyeball columns and values.
df.head()

In [None]:
# Summary statistics of the merged frame, as produced by DataFrame.describe().
df.describe()

---
## 2 · Target Distribution

In [None]:
# Class counts ordered by label (0 = legit, 1 = fraud) and the overall
# fraud share; both are reused by later cells.
fc = df['isFraud'].value_counts().sort_index()
fr = df['isFraud'].mean()
C = ['#2ecc71', '#e74c3c']  # legit colour, fraud colour

fig, ax = plt.subplots(1, 3, figsize=(18, 5))
counts_ax, share_ax, amount_ax = ax

# Panel 1: absolute counts per class, annotated with the exact number.
class_counts = [fc[0], fc[1]]
counts_ax.bar(['Legit', 'Fraud'], class_counts, color=C)
for pos, count in enumerate(class_counts):
    counts_ax.text(pos, count + 800, f'{count:,}', ha='center', fontweight='bold')
counts_ax.set_title('Transaction Counts', fontweight='bold')

# Panel 2: class shares as a pie, with the fraud slice pulled out.
share_ax.pie(
    [1 - fr, fr],
    labels=['Legit', 'Fraud'],
    autopct='%1.2f%%',
    colors=C,
    explode=[0, 0.1],
    shadow=True,
)
share_ax.set_title('Class Distribution', fontweight='bold')

# Panel 3: density-normalised histograms of log1p(amount), one per class.
for label, colour in enumerate(C):
    in_class = df['isFraud'] == label
    amount_ax.hist(
        np.log1p(df.loc[in_class, 'TransactionAmt']),
        bins=80,
        alpha=0.65,
        color=colour,
        density=True,
        label='Fraud' if label else 'Legit',
    )
amount_ax.set_title('Log(Amount) by Class', fontweight='bold')
amount_ax.legend()

plt.tight_layout()
plt.savefig(f'{OUT}/eda_overview.png', dpi=200, bbox_inches='tight')
plt.show()
print(f'Imbalance ratio: 1:{int(fc[0] / fc[1])}')

---
## 3 · Amount Statistics

In [None]:
# describe() of TransactionAmt within each isFraud class, rounded to 2 decimals.
amt_by_class = df.groupby('isFraud')['TransactionAmt']
print('Amount Stats by Class:')
print(amt_by_class.describe().round(2).to_string())

---
## 4 · Temporal Patterns

In [None]:
# TransactionDT is interpreted as a number of seconds added to `ref`.
# NOTE(review): the reference date is an assumption baked into this cell —
# confirm it before reading too much into the weekday labels.
ref = datetime(2017, 12, 1)

# Convert once and reuse for both groupings. Grouping by the derived series
# directly avoids writing temporary columns into df (which would be left
# behind if the cell failed before the cleanup step).
ts = pd.to_datetime(df['TransactionDT'], unit='s', origin=ref)

fig, ax = plt.subplots(1, 2, figsize=(16, 5))

# Fraud rate per hour of day, with the overall rate as a dashed reference.
hf = df['isFraud'].groupby(ts.dt.hour).mean()
ax[0].bar(hf.index, hf.values, color='#3498db')
ax[0].axhline(fr, color='red', ls='--', label=f'Overall {fr:.2%}')
ax[0].set_title('Fraud Rate by Hour', fontweight='bold')
ax[0].set_xlabel('Hour of Day')
ax[0].legend()

# Fraud rate per weekday. dayofweek is 0 = Monday … 6 = Sunday; reindexing
# to all seven values keeps the bars aligned with the labels even if some
# weekday has no rows (that bar is then NaN and simply not drawn).
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dw = df['isFraud'].groupby(ts.dt.dayofweek).mean().reindex(range(7))
ax[1].bar(day_names, dw.values, color='#9b59b6')
ax[1].axhline(fr, color='red', ls='--')
ax[1].set_title('Fraud Rate by Day', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{OUT}/temporal.png', dpi=200, bbox_inches='tight')
plt.show()

---
## 5 · Categorical Features

In [None]:
# For each categorical column that is present, print the mean of isFraud
# per category level, highest first. Absent columns are skipped silently.
for col in ('ProductCD', 'card4', 'card6', 'DeviceType'):
    if col not in df.columns:
        continue
    print(f'\n{col} fraud rates:')
    rate_by_level = df.groupby(col)['isFraud'].mean()
    print(rate_by_level.sort_values(ascending=False).to_string())

---
## 6 · Missing Values

In [None]:
# Percentage of null entries per column, largest first, keeping only the
# columns that have at least one null. `miss` is reused by a later cell.
null_pct = df.isna().sum().div(len(df)).mul(100)
miss = null_pct.sort_values(ascending=False)
miss = miss.loc[miss.gt(0)]

print(f'Columns with missing: {len(miss)} / {df.shape[1]}')
print('\nTop 15:')
print(miss.head(15).to_string())

---
## 7 · V-Columns Correlation

In [None]:
# Absolute correlation of every 'V*' column with isFraud, strongest first.
# `v_cols` and `corr` are reused by a later cell.
v_cols = [c for c in df.columns if c.startswith('V')]
if v_cols:
    # corrwith correlates each column against isFraud only. Building the
    # full pairwise matrix with DataFrame.corr() and then keeping a single
    # column does quadratically more work in the number of V-columns.
    corr = (
        df[v_cols]
        .corrwith(df['isFraud'], numeric_only=True)
        .abs()
        .sort_values(ascending=False)
    )
    print(f'V-columns: {len(v_cols)}\n\nTop 10 correlated with fraud:')
    print(corr.head(10).to_string())
else:
    print('No V-columns found.')
    # Empty placeholder so later cells can test len(corr) safely.
    corr = pd.Series(dtype=float)

---
## 8 · Key Insights

In [None]:
# Recap of the headline numbers. Relies on `fr`, `fc`, `miss`, `corr` and
# `v_cols` left behind by the earlier cells, so those must have run first.
rule = '=' * 60
imbalance = int(fc[0] / fc[1])
legit_mean = df.loc[df.isFraud == 0, 'TransactionAmt'].mean()
fraud_mean = df.loc[df.isFraud == 1, 'TransactionAmt'].mean()

print(rule)
print('  KEY INSIGHTS')
print(rule)
# The multiplication sign was previously emitted as mis-decoded bytes.
print(f'Dataset : {len(df):,} txns × {df.shape[1]} features')
print(f'Fraud   : {fr:.2%}  (ratio 1:{imbalance})')
print(f'Amount  : Legit mean=${legit_mean:.0f}')
print(f'          Fraud mean=${fraud_mean:.0f}')
print(f'Missing : {len(miss)} columns have nulls')
if len(corr) > 0:
    # corr is sorted descending, so position 0 is the strongest V-column.
    print(f'V-cols  : {len(v_cols)} (top: {corr.index[0]}={corr.iloc[0]:.3f})')
print(rule)