MAIN PACKAGES AND LIBRARIES


In [None]:

# To install

%pip install kagglehub
    

# To import 

import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import pickle
import warnings
warnings.filterwarnings('ignore')

# Fetch the most recent copy of the Kaggle dataset; `path` is the local
# directory returned by kagglehub and is reused when loading the CSV files.
dataset_handle = "kartik2112/fraud-detection"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

ANALYSIS


In [None]:
# Load the dataset
from pathlib import Path  # local import so this cell stays self-contained

# Join with pathlib so the separator is right on every OS; the previous
# hard-coded "\\" only resolved on Windows.
dataset_dir = Path(path)
df_train = pd.read_csv(dataset_dir / "fraudTrain.csv")
df_test = pd.read_csv(dataset_dir / "fraudTest.csv")

# First analysis

print("test columns : \n \n",df_test.columns)
print("\n \n train columns : \n \n",df_train.columns)

# Remove unnecessary index.
# Each frame is handled independently: errors='ignore' makes the drop a no-op
# when the column is absent, so a missing 'Unnamed: 0' in one split can no
# longer raise KeyError or leave the column behind in the other split.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

print(f"\n TRAIN: {df_train.shape[0]:,} rows × {df_train.shape[1]} columns")
print(f" TEST: {df_test.shape[0]:,} rows × {df_test.shape[1]} columns")
print(f"\n Columns:\n{list(df_train.columns)}")

# Working copy used for the exploration below; df_train itself stays untouched
# by the analysis columns added to `df`.
df = df_train.copy()

# ============================================================================
# 2. DATA QUALITY
# ============================================================================

print(f"\n{'=' * 80}")
print("2. DATA QUALITY")
print("=" * 80)

# Total count of null cells in each split
missing_train, missing_test = (
    frame.isna().sum().sum() for frame in (df_train, df_test)
)
print(f"\n Missing values - TRAIN: {missing_train}, TEST: {missing_test}")

# Number of fully duplicated rows in each split
dup_train, dup_test = (
    frame.duplicated().sum() for frame in (df_train, df_test)
)
print(f" Duplicates - TRAIN: {dup_train}, TEST: {dup_test}")

# How many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("\n Data types:")
print(dtype_counts)

# ============================================================================
# 3. CLASS IMBALANCE (CRITICAL)
# ============================================================================

print(f"\n{'=' * 80}")
print("3. CLASS IMBALANCE - TARGET 'is_fraud'")
print("=" * 80)

target_col = 'is_fraud'

# Per-split class counts and percentages, ordered by label (0 first, then 1)
train_labels = df_train[target_col]
class_dist_train = train_labels.value_counts().sort_index()
class_pct_train = train_labels.value_counts(normalize=True).sort_index() * 100
# Legitimate-to-fraud ratio
imbalance_ratio_train = class_dist_train[0] / class_dist_train[1]

test_labels = df_test[target_col]
class_dist_test = test_labels.value_counts().sort_index()
class_pct_test = test_labels.value_counts(normalize=True).sort_index() * 100
imbalance_ratio_test = class_dist_test[0] / class_dist_test[1]

# Same three-line report for both splits
for split_name, counts, pcts, ratio in (
    ("TRAIN", class_dist_train, class_pct_train, imbalance_ratio_train),
    ("TEST", class_dist_test, class_pct_test, imbalance_ratio_test),
):
    print(f"\n {split_name}:")
    print(f"   Legitimate (0): {counts[0]:,} ({pcts[0]:.3f}%)")
    print(f"   Fraud (1): {counts[1]:,} ({pcts[1]:.3f}%)")
    print(f"   Imbalance ratio: {ratio:.0f}:1")

# Visualization: side-by-side bar charts of the class counts per split
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, title, counts, percentages, bar colours) for each panel
panels = (
    (axes[0], 'TRAIN Distribution', class_dist_train, class_pct_train,
     ['#2ecc71', '#e74c3c']),
    (axes[1], 'TEST Distribution', class_dist_test, class_pct_test,
     ['#3498db', '#e67e22']),
)

for ax, title, counts, pcts, palette in panels:
    ax.bar([0, 1], counts.values, color=palette,
           edgecolor='black', linewidth=1.5)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Class', fontsize=11)
    ax.set_ylabel('Count', fontsize=11)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Legitimate', 'Fraud'])
    # Annotate each bar with its count and share, anchored on top of the bar
    for x_pos, (count, pct) in enumerate(zip(counts.values, pcts.values)):
        ax.text(x_pos, count, f'{count:,}\n({pct:.3f}%)',
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('class_imbalance.png', dpi=300, bbox_inches='tight')
plt.show()

# ============================================================================
# 4. KEY FEATURES ANALYSIS
# ============================================================================

print(f"\n{'=' * 80}")
print("4. KEY FEATURES")
print("=" * 80)

# Split columns by dtype; the target is excluded from the numeric features
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != target_col
]
categorical_cols = list(df.select_dtypes(include=['object']).columns)

print("\n Feature types:")
print(f"   Numeric: {len(numeric_cols)}")
print(f"   Categorical: {len(categorical_cols)}")

# Overall summary of the transaction amount
amounts = df['amt']
print("\n Amount (amt) statistics:")
print(f"   Mean: ${amounts.mean():.2f}")
print(f"   Median: ${amounts.median():.2f}")
print(f"   Std: ${amounts.std():.2f}")
print(f"   Range: ${amounts.min():.2f} - ${amounts.max():.2f}")

# Mean amount for each class of the target
print("\n   By class:")
for class_value, class_name in ((0, "Legitimate"), (1, "Fraud")):
    class_mean = df.loc[df[target_col] == class_value, 'amt'].mean()
    print(f"   {class_name}: ${class_mean:.2f}")

# Temporal feature: parsed transaction timestamp added to every frame.
# Each raw column is parsed exactly once. `df` is an unmodified copy of
# df_train (same index, same raw timestamps), so it reuses the parsed train
# column instead of paying for a second full string-to-datetime conversion.
df_train['trans_datetime'] = pd.to_datetime(df_train['trans_date_trans_time'])
df_test['trans_datetime'] = pd.to_datetime(df_test['trans_date_trans_time'])
df['trans_datetime'] = df_train['trans_datetime']

# Compute the bounds once and reuse them for both the report and the duration
period_start = df['trans_datetime'].min()
period_end = df['trans_datetime'].max()

print(f"\n Time period:")
print(f"   From: {period_start}")
print(f"   To: {period_end}")
print(f"   Duration: {(period_end - period_start).days} days")


# ============================================================================
# 5. BASIC STATISTICS
# ============================================================================

print(f"\n{'=' * 80}")
print("5. DESCRIPTIVE STATISTICS")
print("=" * 80)

# Keep only the four headline statistics, one row per numeric feature
headline_stats = ['mean', 'std', 'min', 'max']
numeric_summary = df[numeric_cols].describe().loc[headline_stats].T

print("\n Numeric features summary:")
print(numeric_summary)

# ============================================================================
# 6. KEY INSIGHTS & NEXT STEPS
# ============================================================================

print("\n" + "="*80)
print("6. KEY INSIGHTS & NEXT STEPS")
print("="*80)

# Finding 5 is derived from the missing-value counts measured in section 2
# rather than hard-coded, so the summary cannot contradict the data.
total_missing = int(missing_train) + int(missing_test)
if total_missing == 0:
    missing_finding = "No missing values (excellent data quality)"
else:
    missing_finding = (
        f"{total_missing:,} missing values "
        f"(TRAIN: {int(missing_train):,}, TEST: {int(missing_test):,}) "
        "→ Need imputation strategy"
    )

print("\n KEY FINDINGS:")
print(f"   1. Severe class imbalance: ~{imbalance_ratio_train:.0f}:1 ratio")
print(f"   2. {len(numeric_cols)} numeric features, {len(categorical_cols)} categorical")
print("   3. High cardinality in merchant, job → Need encoding strategy")
print("   4. Temporal data available for feature engineering")
print(f"   5. {missing_finding}")

print("\n PREPROCESSING ROADMAP:")
print("   1. Drop PII: cc_num, first, last, street, trans_num")
print("   2. Temporal features: extract hour, day, day_of_week from trans_date_trans_time")
print("   3. Encode categoricals:")
print("      • Low cardinality (gender, state): One-Hot or Label Encoding")
print("      • High cardinality (category, merchant, job): Target Encoding")
print("   4. Scale numeric features: amt, lat, long, city_pop")
print("   5. Feature engineering: customer-merchant distance, age from dob")
print("   6. Handle imbalance: SMOTE, class_weight, or undersampling")

print("\n MODELING STRATEGY:")
print("   • Validation: StratifiedKFold (5-fold)")
print("   • Metrics: F1-Score (primary), PR-AUC, ROC-AUC")
print("   • Models: Random Forest, Gradient Boosting, XGBoost")
print("   • Optimize threshold based on business cost (FP vs FN)")

print("\n" + "="*80)
print(" EXPLORATION COMPLETED - Ready for preprocessing!")
print("="*80)


# Preview the first rows of the training frame. This is a bare expression,
# so it is rendered as the cell output when run in a notebook.
df_train.head()
