In [None]:
# 導入必要的庫
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Visualization style.
# plt.style.use('default') resets rcParams -- including the font lists -- to
# Matplotlib's stock values, whose sans-serif list is headed by DejaVu Sans,
# a font without CJK glyphs. The figures in this notebook use Chinese titles
# and labels, so CJK-capable families are placed in front of the stock list.
# Families that are not installed are skipped by Matplotlib's font lookup,
# so the stock fonts remain the fallback.
plt.style.use('default')
CJK_FONT_CANDIDATES = [
    'Microsoft JhengHei',   # Windows
    'PingFang TC',          # macOS
    'Heiti TC',             # macOS (older releases)
    'Noto Sans CJK TC',     # Linux
    'Noto Sans CJK JP',     # Linux (family name often reported for the Noto .ttc)
    'WenQuanYi Zen Hei',    # Linux
    'Arial Unicode MS',
]
plt.rcParams['font.sans-serif'] = CJK_FONT_CANDIDATES + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack the Unicode minus sign (U+2212); draw negative tick
# labels with the ASCII hyphen instead so they do not render as boxes.
plt.rcParams['axes.unicode_minus'] = False
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 8)

# Display settings: cap how many columns / rows pandas prints for a frame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

print("✅ 所有庫已成功導入！")


In [None]:
# Load the transaction table
print("🔄 載入交易數據...")
train_transaction = pd.read_csv('train_transaction.csv')
print(f"✅ 交易數據載入完成: {train_transaction.shape}")

# Load the identity table
print("\n🔄 載入身份數據...")
train_identity = pd.read_csv('train_identity.csv')
print(f"✅ 身份數據載入完成: {train_identity.shape}")

# Left-join identity onto transactions: every transaction row is kept and
# identity columns are NaN where no identity record matches.
# validate='many_to_one' makes pandas raise MergeError if a TransactionID
# occurs more than once in train_identity. Without it such duplicates would
# silently duplicate transaction rows and distort every count and the fraud
# rate computed below.
print("\n🔄 合併數據...")
df = train_transaction.merge(
    train_identity,
    on='TransactionID',
    how='left',
    validate='many_to_one',
)
print(f"✅ 合併完成: {df.shape}")

# Basic target statistics.
# isFraud is summed to count frauds, i.e. it is treated as a numeric flag
# (assumes 0 = normal, 1 = fraud -- TODO confirm against the data dictionary).
fraud_count = df['isFraud'].sum()
total_count = len(df)
fraud_rate = fraud_count / total_count

print(f"\n🎯 目標變數分析:")
print(f"總交易數: {total_count:,}")
print(f"詐騙交易: {fraud_count:,}")
print(f"正常交易: {total_count - fraud_count:,}")
print(f"詐騙比例: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)")

# Preview the first rows (rendered as the cell output)
print(f"\n📋 數據預覽:")
df.head()


In [None]:
# Visualize the target distribution: bar chart (left) and pie chart (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Bar chart.
# value_counts() orders classes by frequency (most frequent first), while the
# tick labels and colours below are applied by position. Pinning the order to
# [0, 1] keeps "normal" first regardless of which class is larger, and
# fill_value=0 keeps two bars even when one class is absent, so the two
# hard-coded tick labels always match the number of ticks.
# Assumes isFraud is coded 0 = normal, 1 = fraud -- the coding implied by
# summing the column to count frauds; TODO confirm.
class_counts = df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', ax=ax1, color=['skyblue', 'orange'])
ax1.set_title('詐騙 vs 正常交易分佈', fontsize=14, fontweight='bold')
ax1.set_xlabel('交易類型')
ax1.set_ylabel('交易數量')
ax1.set_xticklabels(['正常交易', '詐騙交易'], rotation=0)

# Pie chart, built from the totals computed in the loading cell
labels = ['正常交易', '詐騙交易']
sizes = [total_count - fraud_count, fraud_count]
colors = ['skyblue', 'orange']
ax2.pie(sizes, labels=labels, colors=colors, autopct='%1.2f%%', startangle=90)
ax2.set_title('交易類型比例', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"💡 觀察: 這是一個高度不平衡的數據集，詐騙交易只佔 {fraud_rate*100:.2f}%")


In [None]:
# Missing-value analysis: per-column null count and null percentage
print("🔍 缺失值分析:")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

# Keep only the columns that have at least one null, worst first
missing_table = (
    pd.DataFrame({'缺失數量': missing_data, '缺失百分比': missing_percent})
    .loc[lambda table: table['缺失數量'] > 0]
    .sort_values('缺失數量', ascending=False)
)

# Summary counts: all columns, columns with any null, columns that are 100% null
n_features = df.shape[1]
n_with_missing = len(missing_table)
n_fully_missing = len(missing_table[missing_table['缺失百分比'] == 100])
print(f"總特徵數: {n_features}")
print(f"有缺失值的特徵數: {n_with_missing}")
print(f"完全缺失的特徵數: {n_fully_missing}")

# Show the 20 columns with the most missing values (rendered as the cell output)
print(f"\n📊 缺失值最多的前20個特徵:")
missing_table.head(20)


In [None]:
# Transaction amount (TransactionAmt): overall summary statistics
print("💰 交易金額 (TransactionAmt) 分析:")
print(f"最小金額: ${df['TransactionAmt'].min():.2f}")
print(f"最大金額: ${df['TransactionAmt'].max():,.2f}")
print(f"平均金額: ${df['TransactionAmt'].mean():.2f}")
print(f"中位數金額: ${df['TransactionAmt'].median():.2f}")

# Amount statistics split by the fraud flag.
# Rows and columns are relabelled by name rather than by position: assigning a
# two-item list to .index raises when only one class is present and depends on
# the group order, whereas rename() maps each label explicitly.
# Assumes isFraud is coded 0 = normal, 1 = fraud -- TODO confirm.
print(f"\n📊 按詐騙類型的交易金額分析:")
fraud_amount_stats = (
    df.groupby('isFraud')['TransactionAmt']
    .agg(['count', 'mean', 'median', 'std'])
    .rename(index={0: '正常交易', 1: '詐騙交易'})
    .rename(columns={
        'count': '交易數量',
        'mean': '平均金額',
        'median': '中位數金額',
        'std': '標準差',
    })
)
# Drop the 'isFraud' index name so the displayed table has an unnamed index
fraud_amount_stats.index.name = None
fraud_amount_stats


In [None]:
# 導入必要的庫
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization style.
# plt.style.use('default') resets rcParams -- including the font lists -- to
# Matplotlib's stock values, whose sans-serif list is headed by DejaVu Sans,
# a font without CJK glyphs. The figures in this notebook use Chinese titles
# and labels, so CJK-capable families are placed in front of the stock list.
# Families that are not installed are skipped by Matplotlib's font lookup,
# so the stock fonts remain the fallback.
plt.style.use('default')
CJK_FONT_CANDIDATES = [
    'Microsoft JhengHei',   # Windows
    'PingFang TC',          # macOS
    'Heiti TC',             # macOS (older releases)
    'Noto Sans CJK TC',     # Linux
    'Noto Sans CJK JP',     # Linux (family name often reported for the Noto .ttc)
    'WenQuanYi Zen Hei',    # Linux
    'Arial Unicode MS',
]
plt.rcParams['font.sans-serif'] = CJK_FONT_CANDIDATES + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack the Unicode minus sign (U+2212); draw negative tick
# labels with the ASCII hyphen instead so they do not render as boxes.
plt.rcParams['axes.unicode_minus'] = False
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 8)

# Display settings: cap how many columns / rows pandas prints for a frame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

print("✅ 所有庫已成功導入！")


In [None]:
# Read the training transaction table from the working directory
print("🔄 載入交易數據...")
train_transaction = pd.read_csv('train_transaction.csv')
print(f"✅ 交易數據載入完成: {train_transaction.shape}")

# Read the training identity table from the working directory
print("\n🔄 載入身份數據...")
train_identity = pd.read_csv('train_identity.csv')
print(f"✅ 身份數據載入完成: {train_identity.shape}")

# Overview: row and column counts of both tables
txn_rows, txn_cols = train_transaction.shape
id_rows, id_cols = train_identity.shape
print("\n📊 數據概覽:")
print(f"交易數據: {txn_rows:,} 筆交易, {txn_cols} 個特徵")
print(f"身份數據: {id_rows:,} 筆記錄, {id_cols} 個特徵")


In [None]:
# Left-join identity onto transactions: every transaction row is kept and
# identity columns are NaN where no identity record matches.
# validate='many_to_one' makes pandas raise MergeError if a TransactionID
# occurs more than once in train_identity. Without it such duplicates would
# silently duplicate transaction rows and distort every count and the fraud
# rate computed below.
print("🔄 合併數據...")
train_df = train_transaction.merge(
    train_identity,
    on='TransactionID',
    how='left',
    validate='many_to_one',
)
print(f"✅ 合併完成: {train_df.shape}")

# Target check.
# isFraud is summed to count frauds, i.e. it is treated as a numeric flag
# (assumes 0 = normal, 1 = fraud -- TODO confirm against the data dictionary).
fraud_count = train_df['isFraud'].sum()
total_count = len(train_df)
fraud_rate = fraud_count / total_count

print(f"\n🎯 目標變數分析:")
print(f"總交易數: {total_count:,}")
print(f"詐騙交易: {fraud_count:,}")
print(f"正常交易: {total_count - fraud_count:,}")
print(f"詐騙比例: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)")

# Preview the first rows with the notebook's rich display
print(f"\n📋 數據預覽:")
display(train_df.head())


In [None]:
# Visualize the target distribution: bar chart (left) and pie chart (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Bar chart.
# value_counts() orders classes by frequency (most frequent first), while the
# tick labels and colours below are applied by position. Pinning the order to
# [0, 1] keeps "normal" first regardless of which class is larger, and
# fill_value=0 keeps two bars even when one class is absent, so the two
# hard-coded tick labels always match the number of ticks.
# Assumes isFraud is coded 0 = normal, 1 = fraud -- the coding implied by
# summing the column to count frauds; TODO confirm.
class_counts = train_df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', ax=ax1, color=['skyblue', 'orange'])
ax1.set_title('詐騙 vs 正常交易分佈', fontsize=14, fontweight='bold')
ax1.set_xlabel('交易類型')
ax1.set_ylabel('交易數量')
ax1.set_xticklabels(['正常交易', '詐騙交易'], rotation=0)

# Pie chart, built from the totals computed in the merge cell
labels = ['正常交易', '詐騙交易']
sizes = [total_count - fraud_count, fraud_count]
colors = ['skyblue', 'orange']
ax2.pie(sizes, labels=labels, colors=colors, autopct='%1.2f%%', startangle=90)
ax2.set_title('交易類型比例', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"💡 觀察: 這是一個高度不平衡的數據集，詐騙交易只佔 {fraud_rate*100:.2f}%")


In [None]:
# Missing-value analysis: per-column null count and null percentage
print("🔍 缺失值分析:")
missing_data = train_df.isnull().sum()
missing_percent = (missing_data / len(train_df)) * 100

# Keep only the columns that have at least one null, worst first
missing_table = pd.DataFrame({'缺失數量': missing_data, '缺失百分比': missing_percent})
has_missing = missing_table['缺失數量'] > 0
missing_table = missing_table[has_missing].sort_values('缺失數量', ascending=False)

# Summary counts: all columns, columns with any null, columns that are 100% null
fully_missing = missing_table[missing_table['缺失百分比'] == 100]
print(f"總特徵數: {train_df.shape[1]}")
print(f"有缺失值的特徵數: {len(missing_table)}")
print(f"完全缺失的特徵數: {len(fully_missing)}")

# Show the 20 columns with the most missing values
print(f"\n📊 缺失值最多的前20個特徵:")
display(missing_table.head(20))
