# Fraud Data Preprocessing Pipeline (from raw data to model-ready dataframe)


In [1]:
# --- 1. Imports ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [3]:
# --- 2. Simulate a raw dataset ---
# Seed NumPy's legacy global RNG so the simulated columns are reproducible on a
# fresh "Restart & Run All". The columns are drawn in dict order, so reordering
# the entries below changes the generated data.
np.random.seed(42)

n = 10000  # number of samples
raw_df = pd.DataFrame({
    'transaction_id': np.arange(n),
    'country': np.random.choice(['Brazil', 'USA', 'UK', 'Germany'], n),
    # city/district are drawn independently of country, so combinations such as
    # (Brazil, London) occur in the simulated data.
    'city': np.random.choice(['Rio', 'New York', 'London', 'Berlin'], n),
    'district': np.random.choice(['Center', 'North', 'South', 'West'], n),
    # NOTE: randint's upper bound is exclusive -> zip in [10000, 99998].
    'zip': np.random.randint(10000, 99999, n),
    # NOTE: exclusive upper bound again -> each simulated octet is in [0, 254].
    'ip': [f"192.168.{np.random.randint(0,255)}.{np.random.randint(0,255)}" for _ in range(n)],
    # One transaction per minute starting at 2024-01-01 00:00.
    'datetime': pd.date_range('2024-01-01', periods=n, freq='min'),
    'os': np.random.choice(['Windows', 'Android', 'iOS'], n),
    'value': np.random.exponential(scale=100, size=n).round(2),
    'background_checks': np.random.randint(0, 5, n),
    'complaints': np.random.randint(0, 10, n),
    'transactions': np.random.randint(1, 50, n),  # lower bound 1 -> never zero
    'credit_score': np.random.normal(650, 100, n).astype(int),
    'credit_limit': np.random.uniform(500, 20000, n).round(2),
    'device': np.random.choice(['mobile', 'desktop'], n),
    'browser': np.random.choice(['Chrome', 'Safari', 'Firefox', 'Edge'], n),
    'is_fraud': np.random.choice([0, 1], n, p=[0.985, 0.015])  # imbalanced target
})

# print() shows the caption as plain text; display() on a str would render its
# quoted repr. The bare last expression gives the rich DataFrame rendering.
print("Raw dataset sample:")
raw_df.head()

'Raw dataset sample:'

Unnamed: 0,transaction_id,country,city,district,zip,ip,datetime,os,value,background_checks,complaints,transactions,credit_score,credit_limit,device,browser,is_fraud
0,0,UK,London,South,35499,192.168.140.92,2024-01-01 00:00:00,iOS,121.49,0,3,19,768,6246.51,desktop,Chrome,0
1,1,Germany,Berlin,West,16421,192.168.106.108,2024-01-01 00:01:00,Windows,182.55,2,4,20,601,18677.37,desktop,Chrome,0
2,2,Brazil,London,West,62204,192.168.87.249,2024-01-01 00:02:00,Windows,36.29,0,9,11,655,14260.93,mobile,Chrome,0
3,3,UK,Berlin,North,34591,192.168.44.107,2024-01-01 00:03:00,Windows,1.48,2,1,24,603,2218.27,mobile,Chrome,0
4,4,UK,Berlin,Center,70940,192.168.45.76,2024-01-01 00:04:00,iOS,80.12,0,9,22,534,13281.11,desktop,Chrome,0


In [None]:
# --- 3. Feature extraction from datetime ---
# Guard on the column so the cell is idempotent: once 'datetime' has been
# dropped, re-running this cell is a no-op instead of raising on the missing
# column.
if 'datetime' in raw_df.columns:
    raw_df['day'] = raw_df['datetime'].dt.day        # day of month
    raw_df['hour'] = raw_df['datetime'].dt.hour
    raw_df['minute'] = raw_df['datetime'].dt.minute
    # The raw timestamp is removed after its components are extracted.
    raw_df = raw_df.drop(columns=['datetime'])
raw_df.head()

In [None]:

# --- 4. Handle missing values (for demonstration, inject some NaNs) ---
nan_demo_cols = ['city', 'district', 'os']

# Blank out a random 2% of rows in each column; rows are drawn independently
# per column.
for col in nan_demo_cols:
    missing_idx = raw_df.sample(frac=0.02).index
    raw_df.loc[missing_idx, col] = np.nan

# Replace every missing category with an explicit 'Unknown' label.
for col in nan_demo_cols:
    raw_df[col] = raw_df[col].fillna('Unknown')

# --- 5. Encoding categorical variables using One-Hot Encoding ---
cat_cols = ['country', 'city', 'district', 'os', 'device', 'browser']
# Dense output so the result can be wrapped in a DataFrame directly;
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(raw_df[cat_cols])
# Reuse raw_df's index: pd.concat(axis=1) aligns on index labels, so a default
# RangeIndex here would misalign rows (and introduce NaNs) whenever raw_df has
# been filtered or re-indexed.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(cat_cols),
    index=raw_df.index,
)

# Merge encoded columns back
df = pd.concat([raw_df.drop(columns=cat_cols), encoded_df], axis=1)
assert len(df) == len(raw_df), "row count changed while merging encoded columns"

# --- 6. Create engineered features ---
# security_index: weighted sum of background_checks (0.7) and complaints (0.3).
weighted_checks = df['background_checks'].mul(0.7)
weighted_complaints = df['complaints'].mul(0.3)
df['security_index'] = weighted_checks.add(weighted_complaints)

# avg_value_per_tx: transaction value divided by the transactions count.
df['avg_value_per_tx'] = df['value'].div(df['transactions'])

# --- 7. Scale numerical features ---
# Pick the hold-out rows BEFORE fitting any transformer. Fitting the scaler or
# PCA on all rows would leak test-set statistics into the training features.
# Positions (not labels) are split so they can be reused with .iloc below; the
# stratified split depends only on the target values and random_state.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, stratify=df['is_fraud'], random_state=42
)

# Every numeric column except the identifier and the target is scaled
# ('ip' is a string column and would be excluded by select_dtypes anyway).
num_cols = df.drop(columns=['transaction_id', 'ip', 'is_fraud']).select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaler.fit(df[num_cols].iloc[train_pos])              # statistics from training rows only
df_scaled = df.copy()
df_scaled[num_cols] = scaler.transform(df[num_cols])  # applied to every row

# --- 8. (Optional) PCA for dimensionality reduction ---
N_PCA_COMPONENTS = 20  # must not exceed len(num_cols); PCA raises otherwise
pca = PCA(n_components=N_PCA_COMPONENTS)
pca.fit(df_scaled[num_cols].iloc[train_pos])          # components from training rows only
pca_features = pca.transform(df_scaled[num_cols])

# Column names follow the number of components actually produced.
df_pca = pd.DataFrame(
    pca_features,
    columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])],
)

# Combine PCA features with the scaled 'value' column and the target.
# reset_index(drop=True) matches df_pca's positional RangeIndex.
final_df = pd.concat([df_pca, df_scaled[['value', 'is_fraud']].reset_index(drop=True)], axis=1)

print("\nPreprocessed dataframe sample:")
print(final_df.head())

# --- 9. Split into training and test sets ---
# Reuse the positions chosen in step 7 so the rows that fitted the scaler/PCA
# are exactly the training rows.
X = final_df.drop(columns=['is_fraud'])
y = final_df['is_fraud']
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]
assert len(X_train) + len(X_test) == len(final_df)

print(f"\nFinal dataset shape: {final_df.shape}")
print(f"Fraud proportion: {y.mean():.4f}")
