In [None]:
import pandas as pd 
import numpy as np

pd.set_option('display.max_columns', None)

pd.set_option('display.max_rows', None)

In [None]:
# Load the two raw training tables; they are joined on TransactionID further down.
identity = pd.read_csv('data/train_identity.csv')
transaction = pd.read_csv('data/train_transaction.csv')

In [None]:
# Schema overview (dtypes, non-null counts, memory) of the identity table.
identity.info()

In [None]:
# Schema overview (dtypes, non-null counts, memory) of the transaction table.
transaction.info()

In [None]:
# Cardinality of every column in the transaction table.
# The original branched on dtype, but both branches printed exactly the same
# line, so the branch carried no meaning and is removed.
for col, n_unique in transaction.nunique().items():
    print(f"{col}: {n_unique} unique values")

# Merge the transaction and identity tables (train and test)

In [None]:
# Attach the (optional) identity record to every training transaction.
# how="left" keeps transactions without an identity row; validate="one_to_one"
# makes pandas raise MergeError instead of silently duplicating rows should
# TransactionID ever repeat in either table.
train = transaction.merge(identity, on="TransactionID", how="left", validate="one_to_one")

# Save merged train
train.to_csv("data/train_merged.csv", index=False)

# Load and merge test the same way
test_identity = pd.read_csv("data/test_identity.csv")
test_transaction = pd.read_csv("data/test_transaction.csv")
test = test_transaction.merge(test_identity, on="TransactionID", how="left", validate="one_to_one")

# Save merged test
test.to_csv("data/test_merged.csv", index=False)

print("Train merged shape:", train.shape)
print("Test merged shape:", test.shape)
print("Files saved as data/train_merged.csv and data/test_merged.csv")

# EDA 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

pd.set_option('display.max_rows', None)

In [None]:
# Reload the merged training table written by the merge step; `df` is the working frame for all EDA below.
df = pd.read_csv("data/train_merged.csv") 

In [None]:
# Schema overview of the merged frame.
df.info() 

# Overview

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Per-column cardinality plus up to five example non-null values.
# The original if/else on dtype had two identical branches, so it is collapsed.
for col in df.columns:
    distinct_values = df[col].dropna().unique()
    print(f"#{col}: {df[col].nunique()} unique values")
    print(f'sample values: {distinct_values[:5]}')

# 📊 Feature Explanation – IEEE-CIS Fraud Detection

## 1. Transaction Features
- **TransactionID**: Unique identifier for each transaction. Used to merge with the identity table, not directly useful for modeling.  
- **TransactionDT**: Time in seconds from a reference point (not a real timestamp).
- **TransactionAmt**: Transaction amount. Fraud often involves:  
  - Very small amounts (testing stolen cards).  
  - Very large amounts (cash-out quickly).  
- **ProductCD**: Product category (e.g., W, C, H, R, S).  

---

## 2. Card Features (card1–card6)
- **card1**: User/account identifier (anonymized).
- **card2**: Issuing bank (anonymized).  
- **card3**: Country of issuing bank.  
- **card4**: Card type (Visa, MasterCard, Amex, etc.).  
- **card5**: Card series number.  
- **card6**: Card category (credit/debit).    

---

## 3. Address Features
- **addr1**: Billing region.  
- **addr2**: Billing country.  

---

## 4. Email Features
- **P_emaildomain**: Purchaser’s email domain.  
- **R_emaildomain**: Recipient’s email domain.  

---

## 5. Counting Features (C1–C14)
- Pre-engineered **count statistics** (anonymized).  
- Example meaning: number of transactions linked to a card or user.  

---

## 6. Time Delta Features (D1–D15)
- Pre-engineered **time-related deltas**.  
- Examples:  
  - `D1` ≈ days since first transaction of user.  
  - `D10` ≈ days since last billing.  

---

## 7. Matching Features (M1–M9)
- Boolean flags (Yes/No/NaN).  
- Indicate whether information matches across sources (e.g., billing vs. shipping address, email vs. card info).  

---

## 8. Engineered Features (V1–V339)
- Large set of anonymized engineered features.  
- Likely derived from C/D/M variables via normalization, statistical transformations, or PCA.  


---

## 9. Identity Features
- **DeviceType**: Desktop or mobile.  
- **DeviceInfo**: OS/browser/device information.  
- **id_01–id_38**: Digital identity flags (proxy usage, cookies, authentication methods, risk scores).  

---

## ✅ Summary
- **Transaction-related features**: core info (time, amount, product).  
- **Card/Address/Email**: user identifiers, useful for grouping & mismatch detection.  
- **C, D, M features**: pre-computed stats on frequency, time deltas, matches.  
- **V features**: anonymized engineered variables.  
- **Identity features**: digital fingerprint of users.  

Together, these 400+ features provide rich signals for modeling fraud detection.

# Data quality check 

In [None]:
# Missing-value report: null count and null share per column, worst first.
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)
missing_data = (
    pd.concat(
        [missing_values, missing_percentage],
        axis=1,
        keys=['Missing Values', 'Percentage'],
    )
    .loc[lambda report: report['Missing Values'] > 0]
    .sort_values(by='Missing Values', ascending=False)
)
print(missing_data)

# EDA on each group feature

## Transaction Features 

### TransactionDT 

In [None]:


# TransactionDT is a second counter, so integer division yields an
# (approximate) day index and an hour-of-day bucket.
elapsed_seconds = df["TransactionDT"]
df["TransactionDay"] = elapsed_seconds.floordiv(24 * 60 * 60)
df["TransactionHour"] = elapsed_seconds.floordiv(3600).mod(24)

# One histogram per derived feature.
for column, n_bins, title in [
    ("TransactionDay", 50, "Distribution of Transaction Days"),
    ("TransactionHour", 24, "Distribution of Transaction Hours"),
]:
    fig, ax = plt.subplots(figsize=(12, 5))
    df[column].plot.hist(bins=n_bins, title=title, ax=ax)
    plt.show()


The histogram shows a clear cyclical pattern in transaction time. Instead of being uniformly distributed, transactions are concentrated in specific time ranges, creating two main peaks. This indicates that transactions tend to occur more frequently during certain periods, possibly reflecting daily user behavior (example: daytime vs nighttime activity). Such patterns could be relevant in detecting anomalies, since fraudulent transactions may not follow the same temporal distribution as legitimate ones.

### Amount

In [None]:
# Histogram of the raw transaction amounts.
fig, ax = plt.subplots(figsize=(12, 5))
df["TransactionAmt"].plot.hist(bins=50, ax=ax)
ax.set_title("Distribution of Transaction Amounts")
plt.show()


The histogram shows that most transactions involve very small amounts, with the frequency rapidly decreasing as the transaction amount increases. This results in a highly skewed distribution with a long tail to the right. Only a small fraction of transactions involve large amounts, but these extreme values could be important when analyzing fraud, since abnormal or unusually high amounts might be associated with fraudulent behavior.

### Amount vs isFraud

In [None]:
# Transaction Features Analysis

# Class balance of the target variable.
fraud_counts = df['isFraud'].value_counts()
print("Target Variable Distribution:")
print(fraud_counts)
print(f"Fraud Rate: {df['isFraud'].mean():.4f}")

fig, ax = plt.subplots(figsize=(8, 5))
fraud_counts.plot.bar(title='Distribution of Fraud vs Non-Fraud', ax=ax)
plt.xticks([0, 1], ['Non-Fraud', 'Fraud'], rotation=0)
plt.show()

# Log transform for better visualization; the column is reused by later cells.
df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])

# Left panel: raw amounts (log-scaled density axis). Right panel: log1p amounts.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    (ax_raw, 'TransactionAmt', 'Transaction Amount',
     'Transaction Amount Distribution by Fraud Status'),
    (ax_log, 'TransactionAmt_log', 'Log(Transaction Amount + 1)',
     'Log Transaction Amount Distribution by Fraud Status'),
]
for ax, column, xlabel, title in panels:
    for flag, name in ((0, 'Non-Fraud'), (1, 'Fraud')):
        df.loc[df['isFraud'] == flag, column].hist(
            bins=50, alpha=0.7, label=name, density=True, ax=ax
        )
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    ax.legend()
    ax.set_title(title)
ax_raw.set_yscale('log')

fig.tight_layout()
plt.show()

# Statistical summary by fraud status
print("\nTransaction Amount Statistics by Fraud Status:")
print(df.groupby('isFraud')['TransactionAmt'].describe())

In [None]:
import seaborn as sns

# Overlaid histograms with KDE of the log amount, coloured by the fraud label.
sns.histplot(
    df,
    x="TransactionAmt_log",
    hue="isFraud",
    kde=True,
    log_scale=False,
)


- Log transformation is effective for reducing skewness and making the data more suitable for modeling.

- Transaction amount alone is not a strong discriminator between Fraud and Non-Fraud, since their distributions are quite similar after transformation.

- Very high-value transactions are mostly Non-Fraud, which could serve as a minor signal.

- To detect fraud effectively, this feature should be combined with other behavioral or contextual features (time, frequency, location, sender/receiver, etc.).

### ProductCD

In [None]:
# ProductCD Analysis: volume, fraud rate and fraud/non-fraud counts per product.
# Every plot receives an explicit `ax`. DataFrame.plot (unlike Series.plot)
# opens a brand-new figure when `ax` is omitted, which previously left the
# third panel of this grid empty and drew the stacked bars in a separate figure.
fig, (ax_volume, ax_rate, ax_counts) = plt.subplots(1, 3, figsize=(15, 5))

df['ProductCD'].value_counts().plot(
    kind='bar', title='ProductCD Distribution', rot=45, ax=ax_volume
)

fraud_by_product = df.groupby('ProductCD')['isFraud'].mean()
fraud_by_product.plot(kind='bar', title='Fraud Rate by ProductCD', rot=45, ax=ax_rate)
ax_rate.set_ylabel('Fraud Rate')

# Rows: product, columns: isFraud (0/1), values: number of transactions.
product_fraud_counts = df.groupby(['ProductCD', 'isFraud']).size().unstack(fill_value=0)
product_fraud_counts.plot(
    kind='bar', stacked=True, title='Fraud Count by ProductCD', rot=45, ax=ax_counts
)
ax_counts.legend(['Non-Fraud', 'Fraud'])

fig.tight_layout()
plt.show()

print("\nFraud Rate by ProductCD:")
print(fraud_by_product.sort_values(ascending=False))

In [None]:
# Time-based Analysis: transaction volume and fraud rate by hour bucket,
# and fraud rate by position in a 7-day cycle.
fig, (ax_volume, ax_hour_rate, ax_day_rate) = plt.subplots(1, 3, figsize=(15, 5))

df['TransactionHour'].value_counts().sort_index().plot(
    kind='bar', title='Transactions by Hour', ax=ax_volume
)
ax_volume.set_xlabel('Hour of Day')

hourly_fraud_rate = df.groupby('TransactionHour')['isFraud'].mean()
hourly_fraud_rate.plot(kind='bar', title='Fraud Rate by Hour', ax=ax_hour_rate)
ax_hour_rate.set_ylabel('Fraud Rate')
ax_hour_rate.set_xlabel('Hour of Day')

# Day of week analysis (approximate): TransactionDay % 7 gives a weekly cycle,
# but TransactionDT counts from an unknown reference point, so index 0 cannot
# be tied to a named weekday. The axis is labelled as a plain index instead of
# the previous, unverifiable "0=Monday".
df['DayOfWeek'] = df['TransactionDay'] % 7
daily_fraud_rate = df.groupby('DayOfWeek')['isFraud'].mean()
daily_fraud_rate.plot(kind='bar', title='Fraud Rate by Day of Week', ax=ax_day_rate)
ax_day_rate.set_ylabel('Fraud Rate')
ax_day_rate.set_xlabel('Day index in 7-day cycle (TransactionDay % 7)')

fig.tight_layout()
plt.show()

print("\nFraud Rate by Hour:")
print(hourly_fraud_rate.sort_values(ascending=False))

`1. Transactions by Hour (left)`

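Transaction volume is lowest during the early morning (3 AM – 6 AM) and gradually increases during the day. (Note: `TransactionHour` is derived from `TransactionDT`, which counts seconds from an undisclosed reference point, so the clock times quoted in this section are hour offsets that are assumed — not confirmed — to align with local time.)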

There is a peak in the evening (around 6 PM – 9 PM), when the number of transactions is the highest.

This pattern likely reflects typical human activity: fewer transactions overnight and more during active business and leisure hours.

`2. Fraud Rate by Hour (middle)`

The fraud rate is highest in the early morning (around 6–9 AM), peaking sharply around 7 AM at more than 10%.

After 10 AM, the fraud rate drops significantly and remains relatively stable throughout the day.

Interestingly, fraud tends to occur disproportionately when transaction volume is low, which may indicate fraudsters exploit periods of lower monitoring or lower user activity.

`3. Fraud Rate by Day of Week (right)`

Fraud rates are relatively consistent across the week, but:

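Day 0 and day 2 show the highest fraud rates (~3.7%).

Day 4 shows the lowest fraud rate (~3.1%).

Note: the day index is `TransactionDay % 7`. Because the reference date of `TransactionDT` is unknown, mapping these indices to named weekdays (e.g., 0 = Monday, 2 = Wednesday, 4 = Friday) is an assumption, not something the data confirms.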

This suggests some weekly behavioral patterns, but the variation is not as strong as the hourly effect.

`4. Key Takeaways`

Fraudulent activity is time-sensitive: fraud risk is elevated during early-morning hours, despite fewer transactions overall.

Monitoring systems may need extra vigilance in off-peak hours, when fraudsters seem more active.

Fraud risk is slightly higher at the start and middle of the week, though differences across days are moderate.

Combining transaction timing (hour, day) with other features could improve fraud detection models.

In [None]:
# Deep analysis of Product, Hours, Amount vs Fraud (3x3 panel figure).
# Requires df['TransactionHour'] and df['TransactionAmt_log'] from earlier cells.
import seaborn as sns

# Fixed seed so the sampled panels (6 and 9) look the same on every re-run.
RANDOM_STATE = 42

# Non-null product codes, computed once and reused by panels 4, 5 and 7.
products = [p for p in df['ProductCD'].unique() if pd.notna(p)]

plt.figure(figsize=(20, 15))

# 1. Fraud Rate by Product and Hour (Heatmap)
# Product/hour cells without any transaction are shown as 0 (fill_value).
plt.subplot(3, 3, 1)
fraud_heatmap = df.groupby(['ProductCD', 'TransactionHour'])['isFraud'].mean().unstack(fill_value=0)
sns.heatmap(fraud_heatmap, annot=True, fmt='.3f', cmap='Reds',
            cbar_kws={'label': 'Fraud Rate'})
plt.title('Fraud Rate: Product vs Hour')
plt.ylabel('Product')
plt.xlabel('Hour')

# 2. Transaction Count by Product and Hour
plt.subplot(3, 3, 2)
count_heatmap = df.groupby(['ProductCD', 'TransactionHour']).size().unstack(fill_value=0)
sns.heatmap(count_heatmap, annot=True, fmt='d', cmap='Blues',
            cbar_kws={'label': 'Transaction Count'})
plt.title('Transaction Count: Product vs Hour')
plt.ylabel('Product')
plt.xlabel('Hour')

# 3. Average Amount by Product and Hour
plt.subplot(3, 3, 3)
amount_heatmap = df.groupby(['ProductCD', 'TransactionHour'])['TransactionAmt'].mean().unstack(fill_value=0)
sns.heatmap(amount_heatmap, annot=True, fmt='.0f', cmap='Greens',
            cbar_kws={'label': 'Avg Amount'})
plt.title('Average Amount: Product vs Hour')
plt.ylabel('Product')
plt.xlabel('Hour')

# 4. Fraud Rate vs Transaction Amount by Product
# Each product gets its own 10 equal-width amount bins, so the x positions are
# bin ranks (low to high), not comparable amounts across products.
plt.subplot(3, 3, 4)
for product in products:
    product_data = df[df['ProductCD'] == product]
    amount_bins = pd.cut(product_data['TransactionAmt'], bins=10)
    # observed=False keeps empty bins (as NaN) so every line has 10 positions;
    # stating it explicitly avoids depending on the pandas default.
    fraud_by_amount = product_data.groupby(amount_bins, observed=False)['isFraud'].mean()
    plt.plot(range(len(fraud_by_amount)), fraud_by_amount.values,
             marker='o', label=f'Product {product}')
plt.title('Fraud Rate vs Amount by Product')
plt.xlabel('Amount Bins (Low to High)')
plt.ylabel('Fraud Rate')
plt.legend()

# 5. Fraud Rate vs Hour by Product
plt.subplot(3, 3, 5)
for product in products:
    product_data = df[df['ProductCD'] == product]
    hourly_fraud = product_data.groupby('TransactionHour')['isFraud'].mean()
    plt.plot(hourly_fraud.index, hourly_fraud.values,
             marker='o', label=f'Product {product}', linewidth=2)
plt.title('Fraud Rate by Hour for Each Product')
plt.xlabel('Hour')
plt.ylabel('Fraud Rate')
plt.legend()
plt.grid(True, alpha=0.3)

# 6. Box plot: Amount distribution by Product and Fraud
plt.subplot(3, 3, 6)
# Sample for better visualization; capped at the frame size so the cell also
# runs on a reduced dataset, and seeded for reproducibility.
df_sample = df.sample(min(10000, len(df)), random_state=RANDOM_STATE)
sns.boxplot(data=df_sample, x='ProductCD', y='TransactionAmt_log', hue='isFraud')
plt.title('Amount Distribution by Product and Fraud')
plt.ylabel('Log(Transaction Amount)')
plt.yscale('linear')

# 7. Fraud rate for every product at hours 6-8 (the hours singled out as high-risk)
plt.subplot(3, 3, 7)
risk_combinations = []
fraud_rates = []

for product in products:
    for hour in [6, 7, 8]:  # Focus on high-risk hours
        subset = df[(df['ProductCD'] == product) & (df['TransactionHour'] == hour)]
        if len(subset) > 10:  # Only if enough samples
            risk_combinations.append(f'{product}-H{hour}')
            fraud_rates.append(subset['isFraud'].mean())

plt.bar(range(len(fraud_rates)), fraud_rates)
plt.xticks(range(len(fraud_rates)), risk_combinations, rotation=45)
plt.title('Fraud Rate for Product-Hour Combinations')
plt.ylabel('Fraud Rate')

# 8. Amount patterns for high-risk scenarios
plt.subplot(3, 3, 8)
high_risk = df[(df['ProductCD'] == 'C') & (df['TransactionHour'] == 7)]
# NOTE(review): this baseline is "neither product C nor hour 7", i.e. it also
# drops product C at other hours and other products at hour 7. It is not the
# complement of `high_risk`; confirm that this is the intended comparison.
normal_risk = df[(df['ProductCD'] != 'C') & (df['TransactionHour'] != 7)]

plt.hist([high_risk['TransactionAmt_log'], normal_risk['TransactionAmt_log']],
         bins=30, alpha=0.7, density=True,
         label=['High Risk (C+H7)', 'Normal Risk'])
plt.title('Amount Distribution: High Risk vs Normal')
plt.xlabel('Log(Transaction Amount)')
plt.ylabel('Density')
plt.legend()

# 9. 3D visualization concept (simplified as scatter)
plt.subplot(3, 3, 9)
fraud_rows = df[df['isFraud'] == 1]
normal_rows = df[df['isFraud'] == 0]
# At most 1000 points per class, seeded so the scatter is reproducible.
fraud_data = fraud_rows.sample(min(1000, len(fraud_rows)), random_state=RANDOM_STATE)
normal_data = normal_rows.sample(min(1000, len(normal_rows)), random_state=RANDOM_STATE)

plt.scatter(fraud_data['TransactionHour'], fraud_data['TransactionAmt_log'],
            c='red', alpha=0.6, s=20, label='Fraud')
plt.scatter(normal_data['TransactionHour'], normal_data['TransactionAmt_log'],
            c='blue', alpha=0.3, s=10, label='Normal')
plt.title('Hour vs Amount: Fraud vs Normal')
plt.xlabel('Transaction Hour')
plt.ylabel('Log(Transaction Amount)')
plt.legend()

plt.tight_layout()
plt.show()

## Card feature 

In [None]:
# Cardinality and example values for card1..card6 (reports any absent column).
cols = [f'card{i}' for i in range(1, 7)]

for col in cols:
    if col not in df.columns:
        print(f"{col} not in dataframe") 
        continue
    print(f"{col}: {df[col].nunique()} unique values")
    print(f'sample values: {df[col].dropna().unique()[:5]}')

### card1

In [None]:
# Transaction count per card1 value (descending); reused by the next cell.
card1_counts = df['card1'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
card1_counts.head(20).plot.bar(color='steelblue', ax=ax)
ax.set_title("Top 20 card1 with Most Transactions")
ax.set_xlabel("card1 ID")
ax.set_ylabel("Number of Transactions")
plt.show()

After the top 2, the counts drop noticeably, showing a long-tail effect where most card IDs have far fewer transactions. 

7919 and 9500 stand out as extreme outliers with unusually high transaction activity. 

**Note:** These IDs could represent high-frequency users, automated transactions, or even anomalies worth deeper investigation (e.g., by checking their fraud ratio).
 
`Such concentration can bias models: if fraud is concentrated in these high-activity IDs, they will disproportionately influence training.`

`Important next step: check the fraud rate per card1, especially for IDs with extreme transaction volumes.`

In [None]:
import numpy as np

# Legit (0) / fraud (1) counts for the four busiest card1 values.
# Relies on `card1_counts` from the previous cell.
top4_ids = card1_counts.index[:4]
top4_data = (
    df.groupby(['card1', 'isFraud'])
      .size()
      .unstack(fill_value=0)
      .loc[top4_ids]
)

# Fraud share in percent, two decimals.
fraud_ratio = (top4_data[1] / (top4_data[0] + top4_data[1]) * 100).round(2)

ax = top4_data.plot(kind='bar', stacked=True, figsize=(8, 6), color=['steelblue', 'tomato'])

# Count label in the middle of every bar segment.
for container in ax.containers:
    ax.bar_label(container, label_type='center', fontsize=9, color='white', fontweight='bold')

# Fraud percentage printed just above each stacked bar (+200 is a fixed y offset).
bar_totals = top4_data.sum(axis=1)
for idx, val in enumerate(fraud_ratio):
    total = bar_totals.iloc[idx]
    ax.text(idx, total + 200, f"{val}%", ha='center', va='bottom',
            fontsize=10, fontweight='bold', color='black')

plt.title("Top 4 card1 with Most Transactions")
plt.xlabel("card1 ID")
plt.ylabel("Number of Transactions")
plt.legend(['Legit', 'Fraud'])
plt.show()


In [None]:
# Fraud rate for the K busiest card1 values: ranked bar chart + volume-vs-rate scatter.
K = 20
topk_ids = df['card1'].value_counts().head(K).index

g = (
    df.loc[df['card1'].isin(topk_ids)]
      .groupby('card1')['isFraud']
      .agg(total='size', fraud='sum')
)
g['fraud_rate'] = g['fraud'] / g['total']

g_bar = g.sort_values('fraud_rate', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(g_bar.index.astype(str), (g_bar['fraud_rate'] * 100).values)
ax.set_title(f"Fraud rate (%) - Top {K} card1 by transactions")
ax.set_xlabel("card1")
ax.set_ylabel("Fraud rate (%)")
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(g['total'], g['fraud_rate'] * 100)
ax.set_title(f"Total transactions vs Fraud rate (%) - Top {K} card1")
ax.set_xlabel("Total transactions")
ax.set_ylabel("Fraud rate (%)")

# Tag every point with its card1 value, nudged 4pt up and to the right.
for rid, row in g.iterrows():
    ax.annotate(str(rid), (row['total'], row['fraud_rate'] * 100),
                textcoords="offset points", xytext=(4, 4), fontsize=8)
fig.tight_layout()
plt.show()

- Among the top 20 card1 IDs by transaction volume, ID 9633 stands out with the highest level of fraud.

### card2

In [None]:
# The 20 most frequent card2 values.
card2_counts = df['card2'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 5))
card2_counts.plot.bar(color='teal', ax=ax)
ax.set_title("Top 20 card2 (Bank IDs)")
ax.set_xlabel("card2")
ax.set_ylabel("Number of Transactions")
plt.show()


In [None]:

# Fraud rate per card2 value, showing the TOP_N_CARD2 highest rates.
# The title is now built from the same constant as the selection; previously it
# said "Top 20" while 50 bars were plotted.
# Caveat: rates are not filtered by sample size, so values with very few
# transactions can dominate this ranking; check the `total` column.
TOP_N_CARD2 = 50

card2_stats = df.groupby('card2')['isFraud'].agg(total='count', fraud='sum')
card2_stats['fraud_rate'] = card2_stats['fraud'] / card2_stats['total']

card2_sorted = card2_stats.sort_values('fraud_rate', ascending=False).head(TOP_N_CARD2)

plt.figure(figsize=(12,6))
plt.bar(card2_sorted.index.astype(str), card2_sorted['fraud_rate']*100)
plt.title(f"Top {TOP_N_CARD2} card2 with Highest Fraud Rate")
plt.xlabel("card2 (Bank ID)")
plt.ylabel("Fraud Rate (%)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


The highest-risk banks (e.g., 289.0, 405.0) show fraud rates above 40%, which is extremely high compared to the baseline.

In [None]:
# Fraud statistics for every (card1, card2) pair; `fraud_pairs` is reused below.
pair_groups = df.groupby(['card1', 'card2'])['isFraud']
fraud_pairs = pair_groups.agg(total='count', fraud='sum')
fraud_pairs['fraud_rate'] = fraud_pairs['fraud'].div(fraud_pairs['total'])
fraud_pairs = fraud_pairs.reset_index()

# Restrict to the 10 card1 values with the most fraud cases overall.
fraud_per_card1 = fraud_pairs.groupby('card1')['fraud'].sum()
top_card1 = fraud_per_card1.sort_values(ascending=False).head(10).index
subset = fraud_pairs.loc[fraud_pairs['card1'].isin(top_card1)]

# Pivot for the heatmap (card1 x card2); pairs that never occur are filled with 0.
pivot = subset.pivot_table(values='fraud_rate', index='card1', columns='card2', fill_value=0)

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(pivot * 100, cmap="Reds", cbar_kws={'label': 'Fraud Rate (%)'}, ax=ax)
ax.set_title("Fraud Rate (%) by card1 & card2")
ax.set_xlabel("card2 (Bank ID)")
ax.set_ylabel("card1 (User ID)")
plt.show()

In [None]:
# Volume vs. fraud rate for every (card1, card2) pair, coloured by card2.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=fraud_pairs,
    x='total',
    y='fraud_rate',
    hue='card2',
    alpha=0.7,
    legend=False,
    ax=ax,
)
ax.set_title("Fraud Rate vs Total Transactions (by card1-card2)")
ax.set_xlabel("Total Transactions")
ax.set_ylabel("Fraud Rate")
plt.show()


In [None]:
# Preview the frame again, now including the columns derived in earlier cells.
df.head()

### card3

In [None]:
# The 20 most frequent card3 values.
card3_counts = df['card3'].value_counts().head(20) 

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=card3_counts.index.astype(str),
    y=card3_counts.values,
    color='steelblue',
    ax=ax,
)
ax.set_title("Top 20 card3 with Most Transactions")
ax.set_xlabel("card3")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Fraud rate per card3 value, showing the TOP_N_CARD3 highest rates.
# The title is derived from the same constant as the selection; previously the
# title (and comment) said "Top 20" while 60 bars were plotted.
# Caveat: rates are not filtered by sample size, so rare values can show
# extreme rates; check the `total` column.
TOP_N_CARD3 = 60

card3_stats = df.groupby('card3')['isFraud'].agg(total='count', fraud='sum')
card3_stats['fraud_rate'] = card3_stats['fraud'] / card3_stats['total']

# Take the TOP_N_CARD3 values with the highest fraud rate.
card3_sorted = card3_stats.sort_values('fraud_rate', ascending=False).head(TOP_N_CARD3)

plt.figure(figsize=(12,6))
sns.barplot(x=card3_sorted.index.astype(str), y=card3_sorted['fraud_rate']*100, color='tomato')
plt.title(f"Top {TOP_N_CARD3} card3 with Highest Fraud Rate")
plt.xlabel("card3")
plt.ylabel("Fraud Rate (%)")
plt.xticks(rotation=45, ha='right')
plt.show()

There are many types of card3 that show a 100% fraud rate, but these categories have only a very small number of transactions. This means the high fraud rate is likely due to low sample size rather than a genuine risk pattern, and should be interpreted with caution.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean fraud rate for every card2 x card3 combination; unseen combinations are 0.
rate_by_pair = df.groupby(['card2', 'card3'])['isFraud'].mean()
fraud_c23 = rate_by_pair.unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(fraud_c23 * 100, cmap="Reds", cbar_kws={'label': 'Fraud Rate (%)'}, ax=ax)
ax.set_title("Fraud Rate (%) by card2 (Bank) and card3 (Region)")
ax.set_xlabel("card3")
ax.set_ylabel("card2")
plt.show()


### card4

In [None]:

# Transaction volume per card4 value, most frequent first.
brand_order = df['card4'].value_counts().index
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='card4', data=df, order=brand_order, palette="Set2", ax=ax)
ax.set_title("Distribution of Transactions by card4 (Card Brand)")
ax.set_xlabel("card4 (Brand)")
ax.set_ylabel("Number of Transactions")
plt.show()


# Fraud rate per card4 value.
card4_stats = df.groupby('card4')['isFraud'].agg(total='count', fraud='sum')
card4_stats['fraud_rate'] = card4_stats['fraud'].div(card4_stats['total'])

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=card4_stats.index, y=card4_stats['fraud_rate']*100, palette="Set1", ax=ax)
ax.set_title("Fraud Rate (%) by card4 (Card Brand)")
ax.set_xlabel("card4 (Brand)")
ax.set_ylabel("Fraud Rate (%)")
plt.show()

While Visa has the highest number of transactions, the Discover card shows the highest risk with the highest fraud rate. This indicates that although Visa dominates in volume, fraud detection efforts should pay particular attention to Discover, as it is disproportionately associated with fraudulent activity relative to its transaction count.

### card6

In [None]:

# Transaction volume per card6 value, most frequent first.
type_order = df['card6'].value_counts().index
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='card6', data=df, order=type_order, palette="Set2", ax=ax)
ax.set_title("Distribution of Transactions by card6 (Card Type)")
ax.set_xlabel("card6 (Type)")
ax.set_ylabel("Number of Transactions")
plt.show()


# Fraud rate per card6 value.
card6_stats = df.groupby('card6')['isFraud'].agg(total='count', fraud='sum')
card6_stats['fraud_rate'] = card6_stats['fraud'].div(card6_stats['total'])

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=card6_stats.index, y=card6_stats['fraud_rate']*100, palette="Set1", ax=ax)
ax.set_title("Fraud Rate (%) by card6 (Card Type)")
ax.set_xlabel("card6 (Type)")
ax.set_ylabel("Fraud Rate (%)")
plt.show()



## Email features 

### P_emaildomain 

In [None]:
# Fraud vs. non-fraud transaction counts for the most common purchaser domains.
# The original call was a box plot with only a categorical variable
# (P_emaildomain) and no numeric axis, which a box plot cannot draw; a count
# plot is the categorical equivalent of "distribution by domain, split by isFraud".
# Limited to the 10 most frequent domains to keep the axis readable.
top_p_domains = df['P_emaildomain'].value_counts().head(10).index

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=df, x='P_emaildomain', hue='isFraud',
              order=top_p_domains, palette="Set3", ax=ax)
ax.set_title("Transactions by Purchaser Email Domain and Fraud Status (Top 10)")
ax.set_xlabel("P_emaildomain")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Most frequent purchaser email domains.
# The title now reports the number of domains actually plotted (it previously
# said "Top 10" while 50 were shown).
TOP_N_P_DOMAINS = 50
p_email_counts = df['P_emaildomain'].value_counts().head(TOP_N_P_DOMAINS)

plt.figure(figsize=(10,5))
sns.barplot(x=p_email_counts.index, y=p_email_counts.values, color='steelblue')
plt.title(f"Top {TOP_N_P_DOMAINS} Purchaser Email Domains (P_emaildomain)")
plt.xticks(rotation=45, ha='right')
plt.ylabel("Number of Transactions")
plt.show()

In [None]:
# Per-domain totals, fraud counts and fraud rate; display the 10 highest rates.
p_email_stats = (
    df.groupby('P_emaildomain')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
)
p_email_stats.sort_values('fraud_rate', ascending=False).head(10)


In [None]:


# Volume (bars, left axis) and fraud rate (line, right axis) per purchaser
# domain, ordered by volume and capped at the 200 busiest domains.
p_email_stats = (
    df.groupby('P_emaildomain')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
      .sort_values('total', ascending=False)
      .head(200)
)

fig, ax1 = plt.subplots(figsize=(20, 10))

sns.barplot(x=p_email_stats.index, y=p_email_stats['total'], color='steelblue', ax=ax1)
ax1.set_xlabel("Purchaser Email Domain")
ax1.set_ylabel("Number of Transactions", color="steelblue")
ax1.set_xticklabels(p_email_stats.index, rotation=45, ha='right')

# Second y-axis sharing the same x positions for the rate curve.
ax2 = ax1.twinx()
sns.lineplot(
    x=p_email_stats.index,
    y=p_email_stats['fraud_rate'] * 100,
    marker='o',
    sort=False,
    color='tomato',
    ax=ax2,
)
ax2.set_ylabel("Fraud Rate (%)", color="tomato")

plt.title("Purchaser Email Domains: Transaction Volume & Fraud Rate")
plt.tight_layout()
plt.show()


The figure shows that protonmail.com has the highest observed fraud rate (around 40%), but this is based on a very small number of transactions, which may introduce bias. Similarly, domains such as mail.com, outlook.es, and aim.com record fraud rates above 15%, yet their sample sizes are also limited, so these results should be interpreted with caution.

In contrast, major “big tech” domains like gmail.com, yahoo.com, and hotmail.com account for the majority of transactions and exhibit fraud rates below 10%. This suggests that while fraud does occur across all email providers, rare or niche domains tend to show disproportionately higher fraud risk, although their impact on the overall dataset is smaller due to low transaction volume.

In [None]:

# The five most frequent purchaser domains; categorize_email() reads this module-level list.
top5_domains = df['P_emaildomain'].value_counts().head(5).index.tolist()

def categorize_email(domain, top_domains=None):
    """Bucket an email domain into "missing", "BigTech_Top5" or "Others".

    Parameters
    ----------
    domain : str or missing value
        The email domain to classify. Null values (None/NaN) map to "missing".
    top_domains : iterable of str, optional
        Domains that count as "BigTech_Top5". Defaults to the notebook-level
        ``top5_domains`` list, so ``Series.apply(categorize_email)`` keeps
        working unchanged.

    Returns
    -------
    str
        One of "missing", "BigTech_Top5", "Others".
    """
    if top_domains is None:
        top_domains = top5_domains
    if pd.isnull(domain):
        return "missing"
    # Lower-case both sides: the original lower-cased only `domain`, so a
    # reference entry containing upper-case letters could never match.
    reference = {str(item).lower() for item in top_domains}
    return "BigTech_Top5" if domain.lower() in reference else "Others"

# Collapse purchaser domains into three buckets (missing / top 5 / others).
df['email_group2'] = df['P_emaildomain'].map(categorize_email)

# Volume, fraud count and fraud rate per bucket.
email_stats2 = (
    df.groupby('email_group2')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
)

print(email_stats2)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=email_stats2.index, y=email_stats2['fraud_rate']*100, palette="Set2", ax=ax)
ax.set_title("Fraud Rate (%) by Email Group (Top 5 vs Others)")
ax.set_xlabel("Email Group")
ax.set_ylabel("Fraud Rate (%)")
plt.show()


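**Takeaway:** this grouping can be used as an engineered feature. Collapsing rare domains into a few buckets reduces the noise caused by small sample sizes and makes the pattern easier for a model to learn.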

In [None]:

# Legit / fraud counts per purchaser email bucket, drawn as stacked bars.
email_group_stats2 = (
    df.groupby(['email_group2', 'isFraud'])
      .size()
      .unstack(fill_value=0)
)
# Columns arrive as isFraud 0, 1; give them readable names.
email_group_stats2.columns = ['Legit', 'Fraud']

ax = email_group_stats2.plot(
    kind='bar',
    stacked=True,
    figsize=(8, 6),
    color=['steelblue', 'tomato'],
)

# Count label centred in every bar segment.
for container in ax.containers:
    ax.bar_label(container, label_type='center', color='white', fontsize=9, fontweight='bold')

ax.set_title("Transaction Distribution by Email Group (Top 5 vs Others)")
ax.set_xlabel("Email Group")
ax.set_ylabel("Number of Transactions")
ax.legend(["Legit", "Fraud"])
plt.xticks(rotation=0)
plt.show()


### R_emaildomain

In [None]:
# Most frequent recipient email domains.
# Fixes two copy-paste slips from the purchaser cell: the title said
# "Top 10 Purchaser" although 50 recipient domains are plotted, and the result
# overwrote `p_email_counts` instead of getting its own name.
TOP_N_R_DOMAINS = 50
r_email_counts = df['R_emaildomain'].value_counts().head(TOP_N_R_DOMAINS)

plt.figure(figsize=(10,5))
sns.barplot(x=r_email_counts.index, y=r_email_counts.values, color='steelblue')
plt.title(f"Top {TOP_N_R_DOMAINS} Recipient Email Domains (R_emaildomain)")
plt.xticks(rotation=45, ha='right')
plt.ylabel("Number of Transactions")
plt.show()

In [None]:
# Per-domain totals, fraud counts and fraud rate; display the 10 highest rates.
r_email_stats = (
    df.groupby('R_emaildomain')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
)
r_email_stats.sort_values('fraud_rate', ascending=False).head(10)

In [None]:


# Volume (bars, left axis) and fraud rate (line, right axis) per recipient
# domain, ordered by volume and capped at the 200 busiest domains.
r_email_stats = (
    df.groupby('R_emaildomain')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
      .sort_values('total', ascending=False)
      .head(200)
)

fig, ax1 = plt.subplots(figsize=(20, 10))

sns.barplot(x=r_email_stats.index, y=r_email_stats['total'], color='steelblue', ax=ax1)
ax1.set_xlabel("Receiver Email Domain")
ax1.set_ylabel("Number of Transactions", color="steelblue")
ax1.set_xticklabels(r_email_stats.index, rotation=45, ha='right')

# Second y-axis sharing the same x positions for the rate curve.
ax2 = ax1.twinx()
sns.lineplot(
    x=r_email_stats.index,
    y=r_email_stats['fraud_rate'] * 100,
    marker='o',
    sort=False,
    color='tomato',
    ax=ax2,
)
ax2.set_ylabel("Fraud Rate (%)", color="tomato")

plt.title("Receiver Email Domains: Transaction Volume & Fraud Rate")
plt.tight_layout()
plt.show()


In [None]:


# The five most frequent recipient domains; categorize_email_r() reads this module-level list.
top5_domains_r = df['R_emaildomain'].value_counts().head(5).index.tolist()

def categorize_email_r(domain, top_domains=None):
    """Bucket a recipient email domain into "missing", "BigTech_Top5" or "Others".

    Parameters
    ----------
    domain : str or missing value
        The email domain to classify. Null values (None/NaN) map to "missing".
    top_domains : iterable of str, optional
        Domains that count as "BigTech_Top5". Defaults to the notebook-level
        ``top5_domains_r`` list, so ``Series.apply(categorize_email_r)`` keeps
        working unchanged.

    Returns
    -------
    str
        One of "missing", "BigTech_Top5", "Others".
    """
    if top_domains is None:
        top_domains = top5_domains_r
    if pd.isnull(domain):
        return "missing"
    # Lower-case both sides: the original lower-cased only `domain`, so a
    # reference entry containing upper-case letters could never match.
    reference = {str(item).lower() for item in top_domains}
    return "BigTech_Top5" if domain.lower() in reference else "Others"

# Collapse recipient domains into three buckets (missing / top 5 / others).
df['email_group2_R'] = df['R_emaildomain'].map(categorize_email_r)

# Volume, fraud count and fraud rate per bucket.
email_stats2_r = (
    df.groupby('email_group2_R')['isFraud']
      .agg(total='count', fraud='sum')
      .assign(fraud_rate=lambda stats: stats['fraud'] / stats['total'])
)

print(email_stats2_r)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=email_stats2_r.index, y=email_stats2_r['fraud_rate']*100, palette="Set2", ax=ax)
ax.set_title("Fraud Rate (%) by R_emaildomain Group (Top 5 vs Others)")
ax.set_xlabel("Email Group (Recipient)")
ax.set_ylabel("Fraud Rate (%)")
plt.show()

In [None]:
# Legit / fraud counts per recipient email bucket, drawn as stacked bars.
email_group_stats2_r = (
    df.groupby(['email_group2_R', 'isFraud'])
      .size()
      .unstack(fill_value=0)
)
# Columns arrive as isFraud 0, 1; give them readable names.
email_group_stats2_r.columns = ['Legit', 'Fraud']

ax = email_group_stats2_r.plot(
    kind='bar',
    stacked=True,
    figsize=(8, 6),
    color=['steelblue', 'tomato'],
)

# Count label centred in every bar segment.
for container in ax.containers:
    ax.bar_label(container, label_type='center', color='white', fontsize=9, fontweight='bold')

ax.set_title("Transaction Distribution by R_emaildomain Group (Top 5 vs Others)")
ax.set_xlabel("Recipient Email Group")
ax.set_ylabel("Number of Transactions")
ax.legend(["Legit", "Fraud"])
plt.xticks(rotation=0)
plt.show()

In [None]:
# 1 when purchaser and recipient domains are identical, else 0.
# NaN == NaN evaluates to False, so any row with a missing domain is also 0
# in this column (kept as-is so the column stays a plain 0/1 integer flag).
df['email_match'] = (df['P_emaildomain'] == df['R_emaildomain']).astype(int)

# Compare match vs. mismatch only where BOTH domains are known. Without this
# filter the "Mismatch" bar is really "mismatch or missing", so the comparison
# mostly measures missingness instead of disagreement between the two domains.
both_known = df['P_emaildomain'].notna() & df['R_emaildomain'].notna()
email_match_stats = (
    df.loc[both_known]
      .groupby('email_match')['isFraud']
      .agg(total='count', fraud='sum')
)
email_match_stats['fraud_rate'] = email_match_stats['fraud'] / email_match_stats['total']
print(email_match_stats)

plt.figure(figsize=(6,4))
sns.barplot(x=email_match_stats.index, y=email_match_stats['fraud_rate']*100, palette="Set1")
plt.xticks([0,1], ["Mismatch","Match"])
plt.title("Fraud Rate by Email Domain Match vs Mismatch (both domains present)")
plt.ylabel("Fraud Rate (%)")
plt.show()

In [None]:
# Recompute the five most frequent purchaser domains (same expression as in the P_emaildomain section).
top5_domains = df['P_emaildomain'].value_counts().head(5).index.tolist()

def categorize_email(domain):
    """Bucket an email domain into a coarse group.

    Parameters
    ----------
    domain : str or missing value
        A single value from ``P_emaildomain`` or ``R_emaildomain``.

    Returns
    -------
    str
        ``"missing"`` when ``domain`` is null, ``"BigTech_Top5"`` when it is
        one of the domains in the notebook-level ``top5_domains`` list, and
        ``"Others"`` otherwise.
    """
    if pd.isnull(domain):
        return "missing"
    # The input is lower-cased, so the reference list must be lower-cased as
    # well; otherwise a mixed-case entry in top5_domains could never match.
    top_domains = {str(d).lower() for d in top5_domains}
    if domain.lower() in top_domains:
        return "BigTech_Top5"
    return "Others"

# Both columns are grouped with the same helper, i.e. recipient domains are
# also matched against the top-5 list derived from P_emaildomain.
df['P_group'] = df['P_emaildomain'].apply(categorize_email)
df['R_group'] = df['R_emaildomain'].apply(categorize_email)

# Mean of isFraud per (purchaser group, recipient group) cell, as a percentage.
# Cells with no transactions are filled with 0, so 0.00 can mean "no data".
cross_tab = pd.crosstab(df['P_group'], df['R_group'], 
                        values=df['isFraud'], aggfunc='mean').fillna(0) * 100

plt.figure(figsize=(6,4))
sns.heatmap(cross_tab, annot=True, fmt=".2f", cmap="Reds")
plt.title("Fraud Rate (%) by P_emaildomain vs R_emaildomain Group")
plt.ylabel("Purchaser Email Group")
plt.xlabel("Recipient Email Group")
plt.show()


In [None]:
# Number of transactions per (purchaser group, recipient group) cell.
cross_tab_counts = pd.crosstab(df['P_group'], df['R_group'])
# Fraud rate (%) per cell; computed here but not plotted in this cell.
cross_tab_rate = pd.crosstab(df['P_group'], df['R_group'], 
                             values=df['isFraud'], aggfunc='mean').fillna(0) * 100


# log(1 + count) keeps the large cells from dominating the colour scale.
cross_tab_log = np.log1p(cross_tab_counts) 
plt.figure(figsize=(6,4))
sns.heatmap(cross_tab_log, annot=True, fmt=".1f", cmap="Blues")
plt.title("Log(Transactions) by P_emaildomain vs R_emaildomain Group")
plt.show()


## Counting feature

In [None]:
# Preview the frame, including the email-group columns added above.
df.head()

In [None]:
# Counting features C1..C14 together with the target column.
counting_feature_names = [f'C{idx}' for idx in range(1, 15)]
counting_df = df[counting_feature_names + ['isFraud']]

In [None]:
# Heatmap of pairwise correlations between C1..C14 and isFraud
# (DataFrame.corr() with its default settings).

plt.figure(figsize=(12,8))
sns.heatmap(counting_df.corr(), annot=True, fmt=".2f", cmap="coolwarm", cbar_kws={'label': 'Correlation'})
plt.title("Correlation Heatmap of C1 to C14 Features")
plt.show()

In [None]:
# Scatter plot of C1 against C2, with points colored by isFraud
sns.relplot(data= counting_df , x= 'C1' , y= 'C2' ,hue= 'isFraud') 

In [None]:
# Scatter plot of C1 against C3, with points colored by isFraud
sns.relplot(data= counting_df , x= 'C1' , y= 'C3' ,hue= 'isFraud') 

### PCA - Visualize data

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare the data: the 14 counting features, standardized column by column.
# The names defined here (X, feature_names, scaler, X_scaled, pca, X_pca,
# components_df) are read by the cells that follow.
X = df[['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']]
feature_names = X.columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA
pca = PCA(n_components=14)  # All components to see full picture
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame to track contributions:
# rows = original features, columns = PC1..PCn, values = component loadings.
components_df = pd.DataFrame(
    pca.components_.T,  # Transpose to have features as rows
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=feature_names
)

print("=== PCA COMPONENT ANALYSIS ===\n")

# Show explained variance ratio
print("Explained Variance Ratio by Component:")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.4f} ({ratio*100:.2f}%)")

print(f"\nCumulative explained variance (first 3 components): {pca.explained_variance_ratio_[:3].sum():.4f}")

# Show top contributing features for each component
print("\n=== TOP CONTRIBUTING FEATURES FOR EACH COMPONENT ===")
for i in range(min(5, pca.n_components_)):  # Show first 5 components
    pc_name = f'PC{i+1}'
    print(f"\n{pc_name} (explains {pca.explained_variance_ratio_[i]*100:.2f}% variance):")

    # Rank by absolute loading so strong negative loadings are not pushed to the bottom
    contributions = components_df[pc_name].abs().sort_values(ascending=False)

    print("Top 5 contributing features:")
    for j, (feature, contribution) in enumerate(contributions.head(5).items()):
        original_value = components_df.loc[feature, pc_name]  # Get original sign
        print(f"  {j+1}. {feature}: {original_value:.4f} (|{contribution:.4f}|)")

In [None]:
# Visualize component contributions in a 2x2 grid.
# Reads `components_df` and `pca` from the PCA cell above.
plt.figure(figsize=(16, 12))

# Plot 1: Heatmap of the loadings of the first 8 components (components as rows)
plt.subplot(2, 2, 1)
sns.heatmap(components_df.iloc[:, :8].T, annot=True, fmt='.3f', cmap='RdBu_r', center=0,
            cbar_kws={'label': 'Component Loading'})
plt.title('PCA Component Loadings (First 8 Components)')
plt.xlabel('Original Features')
plt.ylabel('Principal Components')

# Plot 2: Bar plot for PC1, features ordered by absolute loading
plt.subplot(2, 2, 2)
pc1_contributions = components_df['PC1'].sort_values(key=abs, ascending=False)
# Red = negative loading, blue = non-negative loading
colors = ['red' if x < 0 else 'blue' for x in pc1_contributions]
plt.bar(range(len(pc1_contributions)), pc1_contributions.values, color=colors, alpha=0.7)
plt.xticks(range(len(pc1_contributions)), pc1_contributions.index, rotation=45)
plt.title(f'PC1 Feature Contributions (Explains {pca.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel('Loading')
plt.grid(True, alpha=0.3)

# Plot 3: Bar plot for PC2, same layout as PC1
plt.subplot(2, 2, 3)
pc2_contributions = components_df['PC2'].sort_values(key=abs, ascending=False)
colors = ['red' if x < 0 else 'blue' for x in pc2_contributions]
plt.bar(range(len(pc2_contributions)), pc2_contributions.values, color=colors, alpha=0.7)
plt.xticks(range(len(pc2_contributions)), pc2_contributions.index, rotation=45)
plt.title(f'PC2 Feature Contributions (Explains {pca.explained_variance_ratio_[1]*100:.1f}%)')
plt.ylabel('Loading')
plt.grid(True, alpha=0.3)

# Plot 4: Cumulative explained variance
plt.subplot(2, 2, 4)
cumvar = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumvar)+1), cumvar, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.grid(True, alpha=0.3)
# Add lines for common thresholds
plt.axhline(y=0.8, color='r', linestyle='--', alpha=0.7, label='80%')
plt.axhline(y=0.95, color='g', linestyle='--', alpha=0.7, label='95%')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Per-component report for PC1..PC3: every loading, then the features whose
# loading is beyond +/-0.2.
print("\n=== DETAILED ANALYSIS OF TOP 3 COMPONENTS ===")

separator = '-' * 50
for i, pc_name in enumerate(['PC1', 'PC2', 'PC3']):
    print(f"\n{separator}")
    print(f"{pc_name} - Explains {pca.explained_variance_ratio_[i]*100:.2f}% of variance")
    print(f"{separator}")

    # Loadings of this component, largest magnitude first
    contributions = components_df[pc_name].sort_values(key=abs, ascending=False)

    print("All feature contributions (sorted by absolute value):")
    for feature, contribution in contributions.items():
        print(f"  {feature}: {contribution:+.4f}")

    # Split the strong loadings by sign
    positive_features = [name for name, loading in contributions.items() if loading > 0.2]
    negative_features = [name for name, loading in contributions.items() if loading < -0.2]

    if positive_features:
        print(f"\nStrongly positive features (>0.2): {positive_features}")
    if negative_features:
        print(f"Strongly negative features (<-0.2): {negative_features}")

In [None]:
# Create a summary table for easy reference: for each of the first (up to) 5
# components, the three largest and the three smallest loadings.
summary_table = pd.DataFrame()

for i in range(min(5, pca.n_components_)):
    pc_name = f'PC{i+1}'

    # Loadings sorted from largest to smallest (signed, not absolute)
    contributions = components_df[pc_name].sort_values(ascending=False)

    top_positive = contributions.head(3)
    # tail(3) of a descending sort ends with the most negative loading;
    # reverse it so row '1st' holds the most negative value, matching the
    # ranking used for the positive column.
    top_negative = contributions.tail(3).iloc[::-1]

    # NOTE: "Positive"/"Negative" mean largest/smallest loadings; the values
    # themselves are not checked for sign.
    summary_table[f'{pc_name}_Positive'] = [f"{idx}: {val:.3f}" for idx, val in top_positive.items()]
    summary_table[f'{pc_name}_Negative'] = [f"{idx}: {val:.3f}" for idx, val in top_negative.items()]

summary_table.index = ['1st', '2nd', '3rd']

print("\n=== SUMMARY TABLE: TOP CONTRIBUTORS BY COMPONENT ===")
print(summary_table)

# Keep component information in one dict for later use (in memory only)
components_info = {
    'explained_variance_ratio': pca.explained_variance_ratio_,
    'components_df': components_df,
    'feature_names': feature_names,
    'cumulative_variance': np.cumsum(pca.explained_variance_ratio_)
}

print(f"\n=== KEY INSIGHTS ===")
print(f"• First 2 components explain {pca.explained_variance_ratio_[:2].sum()*100:.1f}% of variance")
print(f"• First 3 components explain {pca.explained_variance_ratio_[:3].sum()*100:.1f}% of variance")
# argmax on the boolean array returns the index of the first True, i.e. the
# first component count whose cumulative variance reaches the threshold.
print(f"• Need {np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.8) + 1} components for 80% variance")
print(f"• Need {np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1} components for 95% variance")

## Time Delta features 

In [None]:
# Cardinality and a few example values for each time-delta column D1..D15.
for col in [f'D{idx}' for idx in range(1, 16)]:
    if col not in df.columns:
        print(f"{col} not in dataframe")
        continue
    print(f"{col}: {df[col].nunique()} unique values")
    print(f'sample values: {df[col].dropna().unique()[:5]}')

In [None]:
# Distribution of D1 to D15 
d_cols = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9' , 'D10', 'D11', 'D12', 'D13', 'D14' , 'D15']
plt.figure(figsize=(20, 15))
# 4x4 grid: 15 panels are used, the last slot stays empty.
for i, col in enumerate(d_cols):
    if col in df.columns:
        plt.subplot(4, 4, i+1)
        # Missing values are dropped before binning.
        sns.histplot(df[col].dropna(), bins=30, kde=True, color='steelblue')
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
plt.tight_layout()
plt.show()


In [None]:
# Boxplot of D1 to D15 by isFraud (one panel per column, 4x4 grid).
# Reads `d_cols` defined in the histogram cell above.
plt.figure(figsize=(20, 15))
for i, col in enumerate(d_cols):
    if col in df.columns:
        plt.subplot(4, 4, i+1)
        sns.boxplot(data=df, x='isFraud', y=col, palette="Set2")
        plt.title(f'{col} by Fraud Status')
        plt.xlabel('isFraud')
        plt.ylabel(col)
        # Explicitly keeps the y axis on a linear scale.
        plt.yscale('linear')
plt.tight_layout()
plt.show()


In [None]:
# Correlation heatmap of D1 to D15 (pairwise DataFrame.corr() over `d_cols`)
plt.figure(figsize=(12,8))
sns.heatmap(df[d_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", cbar_kws={'label': 'Correlation'})
plt.title("Correlation Heatmap of D1 to D15 Features")
plt.show()

In [None]:
# Scatter plot of D1 vs D2, colored by isFraud (blue = 0, red = 1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='D1', y='D2', hue='isFraud',
    alpha=0.5, palette={0: 'blue', 1: 'red'}, ax=ax,
)
ax.set_title("D1 vs D2 Colored by Fraud Status")
ax.set_xlabel("D1")
ax.set_ylabel("D2")
ax.set_yscale('linear')
plt.show()


In [None]:
# Scatter plot of D6 vs D4, colored by isFraud (blue = 0, red = 1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='D6', y='D4', hue='isFraud',
    alpha=0.5, palette={0: 'blue', 1: 'red'}, ax=ax,
)
ax.set_title("D6 vs D4 Colored by Fraud Status")
ax.set_xlabel("D6")
ax.set_ylabel("D4")
ax.set_yscale('linear')
plt.show()


In [None]:
# Scatter plot of D6 vs D12, colored by isFraud (blue = 0, red = 1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='D6', y='D12', hue='isFraud',
    alpha=0.5, palette={0: 'blue', 1: 'red'}, ax=ax,
)
ax.set_title("D6 vs D12 Colored by Fraud Status")
ax.set_xlabel("D6")
ax.set_ylabel("D12")
ax.set_yscale('linear')
plt.show()


In [None]:
# Scatter plot of D12 vs D4, colored by isFraud (blue = 0, red = 1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='D12', y='D4', hue='isFraud',
    alpha=0.5, palette={0: 'blue', 1: 'red'}, ax=ax,
)
ax.set_title("D12 vs D4 Colored by Fraud Status")
ax.set_xlabel("D12")
ax.set_ylabel("D4")
ax.set_yscale('linear')
plt.show()

In [None]:
# Scatter plot of D4 vs D5, colored by isFraud (blue = 0, red = 1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='D4', y='D5', hue='isFraud',
    alpha=0.5, palette={0: 'blue', 1: 'red'}, ax=ax,
)
ax.set_title("D4 vs D5 Colored by Fraud Status")
ax.set_xlabel("D4")
ax.set_ylabel("D5")
ax.set_yscale('linear')
plt.show()


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Prepare Time Delta data
# NOTE: this cell rebinds `d_cols`, `scaler` and `X_scaled`, which earlier
# cells also define (`scaler` / `X_scaled` previously held the C1..C14 data).
d_cols = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15']
X_timedelta = df[d_cols].copy()

print("=== TIME DELTA FEATURES ANALYSIS ===\n")
print("Original shape:", X_timedelta.shape)
print("Missing values per feature:")
print(X_timedelta.isnull().sum())
print()

# Handle missing values - important for time delta features
# Each column's NaNs are replaced with that column's median.
imputer = SimpleImputer(strategy='median')  # median better for time features
X_timedelta_imputed = pd.DataFrame(
    imputer.fit_transform(X_timedelta), 
    columns=d_cols,
    index=X_timedelta.index
)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_timedelta_imputed)

print("Data after preprocessing:")
print("Shape:", X_scaled.shape)
# Sanity check that imputation left no NaN behind.
print("No missing values:", not np.isnan(X_scaled).any())

In [None]:
 

# Fit a PCA on the standardized time-delta matrix built in the previous cell.
# Without this fit, `pca`, `X_pca` and `components_df` would still be the ones
# computed for the C1..C14 counting features, and everything reported below
# (and plotted in the next cell) would describe the wrong feature set.
assert X_scaled.shape[1] == len(d_cols), "X_scaled does not hold the time-delta features"
pca = PCA(n_components=len(d_cols))
X_pca = pca.fit_transform(X_scaled)

# Loadings table: rows = D features, columns = PC1..PCn
components_df = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=d_cols
)

print("=== PCA RESULTS FOR TIME DELTA FEATURES ===\n")
print("Explained Variance Ratio:")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.4f} ({ratio*100:.2f}%)")

cumvar = np.cumsum(pca.explained_variance_ratio_)
print(f"\nCumulative variance:")
print(f"First 3 components: {cumvar[2]:.4f} ({cumvar[2]*100:.1f}%)")
print(f"First 5 components: {cumvar[4]:.4f} ({cumvar[4]*100:.1f}%)")

# Show top contributors for first 3 components (ranked by absolute loading,
# printed with their original sign)
print("\n=== TOP CONTRIBUTING FEATURES ===")
for i in range(3):
    pc_name = f'PC{i+1}'
    contributions = components_df[pc_name].abs().sort_values(ascending=False)
    print(f"\n{pc_name} (explains {pca.explained_variance_ratio_[i]*100:.2f}%):")
    for j, (feature, contribution) in enumerate(contributions.head(5).items()):
        original_value = components_df.loc[feature, pc_name]
        print(f"  {j+1}. {feature}: {original_value:+.4f}")

In [None]:
# 2. Visualize PCA Results
# Relies on `pca`, `components_df`, `X_pca` and `cumvar` left in the kernel by
# earlier cells; make sure they come from a PCA fitted on the time-delta
# features before trusting the "Time Delta" labels below.
plt.figure(figsize=(16, 12))

# Plot 1: Scree plot
plt.subplot(2, 3, 1)
plt.plot(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_, 'bo-')
plt.xlabel('Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot - Time Delta Features')
plt.grid(True, alpha=0.3)

# Plot 2: Cumulative variance
plt.subplot(2, 3, 2)
plt.plot(range(1, len(cumvar)+1), cumvar, 'ro-')
plt.axhline(y=0.8, color='g', linestyle='--', alpha=0.7, label='80%')
plt.axhline(y=0.95, color='b', linestyle='--', alpha=0.7, label='95%')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Variance')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 3: Heatmap of loadings (first 8 components as rows)
plt.subplot(2, 3, 3)
sns.heatmap(components_df.iloc[:, :8].T, annot=True, fmt='.2f', cmap='RdBu_r', center=0)
plt.title('PCA Loadings (First 8 Components)')
plt.xlabel('Time Delta Features')

# Plot 4: PC1 vs PC2 scatter, one point per row of X_pca, colored by isFraud
plt.subplot(2, 3, 4)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['isFraud'], cmap='viridis', alpha=0.6)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')
plt.title('PC1 vs PC2 (colored by Fraud)')
plt.colorbar(scatter)

# Plot 5: PC1 contributions, ordered by absolute loading (red = negative)
plt.subplot(2, 3, 5)
pc1_contrib = components_df['PC1'].sort_values(key=abs, ascending=False)
colors = ['red' if x < 0 else 'blue' for x in pc1_contrib]
plt.bar(range(len(pc1_contrib)), pc1_contrib.values, color=colors, alpha=0.7)
plt.xticks(range(len(pc1_contrib)), pc1_contrib.index, rotation=45)
plt.title('PC1 Feature Contributions')
plt.ylabel('Loading')

# Plot 6: PC2 contributions, same layout as PC1
plt.subplot(2, 3, 6)
pc2_contrib = components_df['PC2'].sort_values(key=abs, ascending=False)
colors = ['red' if x < 0 else 'blue' for x in pc2_contrib]
plt.bar(range(len(pc2_contrib)), pc2_contrib.values, color=colors, alpha=0.7)
plt.xticks(range(len(pc2_contrib)), pc2_contrib.index, rotation=45)
plt.title('PC2 Feature Contributions')
plt.ylabel('Loading')

plt.tight_layout()
plt.show()

In [None]:
# Preview the frame again before moving on to the matching features.
df.head()

## Matching Feature

In [None]:
# Cardinality and example values of the matching features M1..M9

matching_cols = [f'M{idx}' for idx in range(1, 10)]

for col in matching_cols:
    if col not in df.columns:
        print(f"{col} not in dataframe")
        continue
    print(f"{col}: {df[col].nunique()} unique values")
    print(f'sample values: {df[col].dropna().unique()[:5]}')

In [None]:
# Missing-value count for each matching (M) column
missing_per_matching_col = df[matching_cols].isna().sum()
print("\nMissing values in Matching Features:")
print(missing_per_matching_col)
print()


In [None]:
# Distribution of M1 to M9, split by fraud status (3x3 grid, one panel per column)
plt.figure(figsize=(30, 20))
for i, col in enumerate(matching_cols):
    if col in df.columns:
        plt.subplot(3, 3, i+1)
        # Categories are ordered by overall frequency, most common first.
        sns.countplot(data=df, x=col, order=df[col].value_counts().index, palette="Set3" , hue='isFraud')
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
# Finish the figure the same way as the D-feature grids: avoid overlapping
# panels and render the plot instead of echoing the last call's return value.
plt.tight_layout()
plt.show()

## V columns 

In [None]:
# Summary statistics for every column whose name starts with 'V'.
df[[col for col in df.columns if col.startswith('V')]].describe()

In [None]:
# Names of all columns that start with 'V', in frame order.
v_cols = [col for col in df.columns if col.startswith('V')]


In [None]:
# Cardinality and a few example values for each V column.
for col in v_cols:
    if col not in df.columns:
        print(f"{col} not in dataframe")
        continue
    print(f"{col}: {df[col].nunique()} unique values")
    print(f'sample values: {df[col].dropna().unique()[:5]}')

In [None]:
# NOTE(review): identical to the `v_cols` assignment two cells above; this cell looks redundant.
v_cols = [col for col in df.columns if col.startswith('V')] 


In [None]:

# NOTE(review): same describe() call as at the start of this section; this cell looks redundant.
df[[col for col in df.columns if col.startswith('V')]].describe()
    

In [None]:
# Correlation heatmap of all V columns plus the target.
# The columns are selected by prefix rather than a hard-coded V1..V337 range,
# so no V column present in the frame is silently left out.
v_feature_cols = [col for col in df.columns if col.startswith('V')]
plt.figure(figsize=(15, 12))
sns.heatmap(df[v_feature_cols + ['isFraud']].corr(), annot=False, cmap='coolwarm')
plt.title("Correlation Heatmap of V Features and isFraud")
plt.show()

In [None]:
def calculate_iv(data: pd.DataFrame, feature: str, target: str):
    """Compute the Information Value (IV) of ``feature`` with respect to a binary ``target``.

    Every distinct value of ``feature`` is treated as its own bin. For each bin
    the share of "good" rows (first target class in sorted order) and "bad"
    rows (second class) is computed; bins where either share is zero are
    skipped because their weight of evidence is undefined.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both columns.
    feature : str
        Name of the column to evaluate.
    target : str
        Name of the binary target column.

    Returns
    -------
    float
        Sum over the usable bins of ``(good% - bad%) * ln(good% / bad%)``.

    Raises
    ------
    ValueError
        If the cross-tabulation does not have exactly two target classes
        (for example when every value of ``feature`` is missing).
    """
    counts = pd.crosstab(data[feature], data[target], normalize=False)
    if counts.shape[1] != 2:
        raise ValueError(
            f"calculate_iv expects a binary target; got {counts.shape[1]} "
            f"class(es) for feature {feature!r}"
        )
    counts.columns = ['Good', 'Bad']

    good_share = counts['Good'] / counts['Good'].sum()
    bad_share = counts['Bad'] / counts['Bad'].sum()

    # Work on plain Series instead of assigning columns onto a filtered
    # DataFrame slice, which triggers pandas' SettingWithCopyWarning.
    usable = (good_share > 0) & (bad_share > 0)
    good_share = good_share[usable]
    bad_share = bad_share[usable]

    woe = np.log(good_share / bad_share)
    return ((good_share - bad_share) * woe).sum()

In [None]:
# NOTE(review): calculate_iv is also defined in the previous cell; this later
# definition is the one in effect after a top-to-bottom run.
def calculate_iv(data: pd.DataFrame, feature: str, target: str):
    """Compute the Information Value (IV) of ``feature`` with respect to a binary ``target``.

    Every distinct value of ``feature`` is treated as its own bin. For each bin
    the share of "good" rows (first target class in sorted order) and "bad"
    rows (second class) is computed; bins where either share is zero are
    skipped because their weight of evidence is undefined.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both columns.
    feature : str
        Name of the column to evaluate.
    target : str
        Name of the binary target column.

    Returns
    -------
    float
        Sum over the usable bins of ``(good% - bad%) * ln(good% / bad%)``.

    Raises
    ------
    ValueError
        If the cross-tabulation does not have exactly two target classes
        (for example when every value of ``feature`` is missing).
    """
    counts = pd.crosstab(data[feature], data[target], normalize=False)
    if counts.shape[1] != 2:
        raise ValueError(
            f"calculate_iv expects a binary target; got {counts.shape[1]} "
            f"class(es) for feature {feature!r}"
        )
    counts.columns = ['Good', 'Bad']

    good_share = counts['Good'] / counts['Good'].sum()
    bad_share = counts['Bad'] / counts['Bad'].sum()

    # Work on plain Series instead of assigning columns onto a filtered
    # DataFrame slice, which triggers pandas' SettingWithCopyWarning.
    usable = (good_share > 0) & (bad_share > 0)
    good_share = good_share[usable]
    bad_share = bad_share[usable]

    woe = np.log(good_share / bad_share)
    return ((good_share - bad_share) * woe).sum()

In [None]:
# Information Value of every column against isFraud; columns for which the
# computation raises are reported and left out.
iv_values = {}
candidate_cols = [name for name in df.columns if name != 'isFraud']
for col in candidate_cols:
    try:
        iv = calculate_iv(df, col, 'isFraud')
    except Exception as e:
        print(f"Could not calculate IV for {col}: {e}")
    else:
        iv_values[col] = iv


# strong predictive power if IV > 0.3 and < 0.5
iv_series = pd.Series(iv_values).sort_values(ascending=False)
in_strong_band = iv_series.gt(0.3) & iv_series.lt(0.5)
strong_predictive = iv_series[in_strong_band]
strong_predictive.head(10)

        

These are the top 10 features with strong predictive power (0.3 < IV < 0.5); they can be used to reduce dimensionality.

In [None]:
# Plot the first 10 entries of `strong_predictive`, i.e. the highest-IV
# features within the 0.3 < IV < 0.5 band selected in the previous cell.
plt.figure(figsize=(10,6))
sns.barplot(x=strong_predictive.head(10).values, y=strong_predictive.head(10).index, palette="viridis")
plt.title("Top 10 Features by Information Value (IV)")
plt.xlabel("Information Value (IV)")
plt.ylabel("Feature")
plt.show()


## Feature Identity 

### Device type

In [None]:
# Cardinality and a few example values for the two device columns
for device_col in ('DeviceType', 'DeviceInfo'):
    print(f"{device_col}: {df[device_col].nunique()} unique values")
    print(f'sample values: {df[device_col].dropna().unique()[:5]}')


In [None]:
# Stacked bar of transaction counts per device type, split by isFraud.
# Rows with a missing DeviceType are not included (groupby drops NaN keys by default).
device_type_stats = df.groupby(['DeviceType', 'isFraud']).size().unstack(fill_value=0)
# Positional rename: assumes exactly two isFraud columns, ordered 0 then 1 — TODO confirm.
device_type_stats.columns = ['Legit', 'Fraud']
ax = device_type_stats.plot(kind='bar', stacked=True, figsize=(8,6), color=['steelblue','tomato'])
# Write the count inside each stacked segment.
for container in ax.containers:
    ax.bar_label(container, label_type='center', color='white', fontsize=9, fontweight='bold')
plt.title("Transaction Distribution by Device Type")
plt.xlabel("Device Type")
plt.ylabel("Number of Transactions")
plt.legend(["Legit","Fraud"])
plt.xticks(rotation=0)
plt.show()


In [None]:
# DeviceInfo again, this time listing up to 50 distinct example values
device_info = df['DeviceInfo']
print(f"DeviceInfo: {device_info.nunique()} unique values")
print(f'sample values: {device_info.dropna().unique()[:50]}')

In [None]:

# Five distinct id_31 values, sampled reproducibly (unique() keeps NaN as a value).
pd.Series(df['id_31'].unique()).sample(5, random_state=42)


In [None]:
# Five distinct id_30 values, sampled reproducibly (unique() keeps NaN as a value).
pd.Series(df['id_30'].unique()).sample(5, random_state=42)
