In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import warnings
# Globally ignore all warnings from here on (note: this also hides deprecation and numerical warnings).
warnings.filterwarnings("ignore")

### Load the data (time-based features are extracted further below)

In [11]:
# Load the raw synthetic fraud transactions; the path is relative to the notebook's working directory.
df = pd.read_csv('../assets/synthetic_fraud_dataset.csv')

In [12]:
# Count rows per class of the target column to see how balanced the labels are.
count_values = df['Fraud_Label'].value_counts()

print(count_values)

Fraud_Label
0    33933
1    16067
Name: count, dtype: int64


In [13]:
# List every column available in the raw dataset before selecting features.
df.columns

Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type',
       'Timestamp', 'Account_Balance', 'Device_Type', 'Location',
       'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity',
       'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d',
       'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age',
       'Transaction_Distance', 'Authentication_Method', 'Risk_Score',
       'Is_Weekend', 'Fraud_Label'],
      dtype='object')

## Feature Selection

In building an effective fraud detection model, choosing the right features is crucial to optimize performance while avoiding unnecessary noise. Below is a breakdown of the features selected for inclusion and those considered for removal based on their relevance, redundancy, or low predictive power.

### **Important Features for Our Objective**

These features are expected to have a strong correlation with fraudulent behavior and are retained for model training (the raw `Transaction_Amount` column is also kept, although it is not listed separately below):

1. **Account_Balance** – Reflects the available balance; useful when combined with transaction data.
2. **IP_Address_Flag** – Flags whether the IP address is suspicious or unusual, a strong fraud indicator.
3. **Previous_Fraudulent_Activity** – Historical behavior is often predictive of future actions.
4. **Daily_Transaction_Count** – Higher frequency may signal unusual activity.
5. **Avg_Transaction_Amount_7d** – Rolling average helps identify deviations from normal behavior.
6. **Failed_Transaction_Count_7d** – Repeated failures could indicate fraud attempts.
7. **Transaction_Distance** – Geographic distance between user and merchant; anomalies here are red flags.
8. **Risk_Score** – Composite score derived from various risk factors; likely very predictive.
9. **Is_Weekend** – Fraudulent activities might spike during weekends.
10. **Fraud_Label** – Target variable for supervised learning; it is kept in the dataset but is separated from the input features before training.
11. **Hour** – Time of day may correlate with fraudulent activity patterns.
12. **DayOfWeek** – Some days may have higher fraud rates (e.g., Mondays or Fridays).
13. **Is_Night** – Transactions at night (defined here as hour 22 or later, or hour 6 or earlier) may carry different fraud risk profiles.
14. **Amount_to_Balance_Ratio** – Indicates how much of the available balance was used; unusually high ratios are suspicious.
15. **Amount_Deviation** – Captures deviation from a user's normal transaction pattern, computed as `Transaction_Amount` minus `Avg_Transaction_Amount_7d`.

---

### **Features to Potentially Drop**

These features may be removed or engineered into more meaningful ones due to low predictive value, redundancy, or their role as identifiers:

- **Transaction_ID** – Purely an identifier; no predictive power.
- **User_ID** – Only useful for user-specific analysis; may introduce data leakage.
- **Timestamp** – Raw timestamp is not useful, but derivatives like `Hour`, `DayOfWeek`, and `Is_Night` are extracted instead.
- **Card_Type** – Generally shows low correlation with fraud unless enriched with more context.
- **Card_Age** – Could be redundant or only weakly predictive.
- **Transaction_Type** – May not significantly vary in terms of fraud likelihood.
- **Device_Type** – May add noise unless clearly tied to fraud patterns.
- **Location** – Raw location may be too granular; better used to calculate `Transaction_Distance`.
- **Merchant_Category** – Depending on encoding and granularity, may or may not add value.
- **Authentication_Method** – May be redundant or weakly correlated with fraud.
- **Is_Weekend** – Retained, not dropped: it is already included in the selected features listed above and is mentioned here only to avoid adding a duplicate weekend indicator.

In [14]:
# Derive the time-of-day and amount-based features in one chained step.
# `assign` evaluates the callables in order, so each lambda already sees the
# columns created by the ones before it (e.g. `Hour` is available to `Is_Night`).
df = df.assign(
    # Parse the raw timestamp strings into datetimes.
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
    Hour=lambda frame: frame['Timestamp'].dt.hour,
    # pandas numbering: Monday=0 ... Sunday=6.
    DayOfWeek=lambda frame: frame['Timestamp'].dt.dayofweek,
    # Night means hour 22 or later, or hour 6 or earlier.
    Is_Night=lambda frame: (frame['Hour'].ge(22) | frame['Hour'].le(6)).astype(int),
    # NOTE(review): a zero Account_Balance would produce inf/NaN here — confirm the data has none.
    Amount_to_Balance_Ratio=lambda frame: frame['Transaction_Amount'].div(frame['Account_Balance']),
    Amount_Deviation=lambda frame: frame['Transaction_Amount'].sub(frame['Avg_Transaction_Amount_7d']),
)

In [15]:
# Display the frame to confirm the engineered columns were appended (pandas truncates the view).
df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label,Hour,DayOfWeek,Is_Night,Amount_to_Balance_Ratio,Amount_Deviation
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,883.17,Biometric,0.8494,0,0,19,0,0,0.000427,-397.84
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,2203.36,Password,0.0959,0,1,4,2,1,0.000016,-477.57
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,1909.29,Biometric,0.8400,0,1,15,1,0,0.018226,-21.05
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.20,Tablet,New York,Clothing,0,...,1311.86,OTP,0.7935,0,1,0,3,1,0.003311,71.84
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,966.98,Password,0.3819,1,1,23,5,1,0.000339,-297.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,TXN_11284,USER_4796,45.05,Online,2023-01-29 18:38:00,76960.11,Mobile,Tokyo,Clothing,0,...,1537.54,PIN,0.1493,1,0,18,6,0,0.000585,-343.95
49996,TXN_44732,USER_1171,126.15,POS,2023-05-09 08:55:00,28791.75,Mobile,Tokyo,Clothing,0,...,2555.72,Biometric,0.3653,0,1,8,1,0,0.004381,-308.80
49997,TXN_38158,USER_2510,72.02,Online,2023-01-30 19:32:00,29916.41,Laptop,Mumbai,Clothing,0,...,4686.59,Biometric,0.5195,0,0,19,0,0,0.002407,-297.13
49998,TXN_860,USER_2248,64.89,Bank Transfer,2023-03-09 19:47:00,67895.67,Mobile,Tokyo,Electronics,0,...,4886.92,Biometric,0.7063,0,1,19,3,0,0.000956,-177.40


### Drop columns - including all categorical ones

In [16]:
# Columns removed before modelling: the row/user identifiers, the raw timestamp
# (already expanded into Hour / DayOfWeek / Is_Night), and the card, device,
# location, merchant and authentication descriptors.
columns_to_drop = [
    'Transaction_ID',
    'User_ID',
    'Timestamp',
    'Card_Type',
    'Card_Age',
    'Transaction_Type',
    'Device_Type',
    'Location',
    'Merchant_Category',
    'Authentication_Method',
]
df = df.drop(columns=columns_to_drop)

### Verify remaining columns

In [17]:
# Show what is left after the drop; note that the list still contains the target column 'Fraud_Label'.
print("Remaining numerical features:")
print(df.columns.tolist())

Remaining numerical features:
['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'Fraud_Label', 'Hour', 'DayOfWeek', 'Is_Night', 'Amount_to_Balance_Ratio', 'Amount_Deviation']


### Prepare X and y

In [18]:
# Target vector: the fraud label on its own.
y = df['Fraud_Label']
# Feature matrix: every remaining column except the target.
X = df.drop(columns=['Fraud_Label'])

#### Since we're only using numerical features now, we just need StandardScaler

In [19]:
# A single scaler is applied to every column of X, including the 0/1 indicator columns.
preprocessor = StandardScaler()

### Train-test split

In [20]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

### Scale the data

In [21]:
# Learn the scaling statistics from the training rows only, then apply those
# same statistics to both partitions so the test rows never influence the fit.
X_train_scaled = preprocessor.fit(X_train).transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

### Convert back to DataFrames for better readability

In [22]:
# Wrap the scaled arrays back into labelled DataFrames.
# Reuse the row index of the corresponding split: without it the new frames get
# a fresh 0..n-1 index that no longer lines up with y_train / y_test, whose
# index still holds the original (shuffled) row labels.
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X.columns)

### Save the preprocessed dataset

In [23]:
# Re-attach the labels to each scaled partition (positionally, via `.values`),
# then stack the training rows on top of the test rows and write a single CSV.
scaled_parts = [
    pd.DataFrame(features, columns=X.columns).assign(Fraud_Label=labels.values)
    for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test))
]
preprocessed_df = pd.concat(scaled_parts)
preprocessed_df.to_csv(
    '../assets/preprocessed_fraud_dataset_numerical_only.csv',
    index=False,
)

print("\nPreprocessed dataset with only numerical features saved successfully.")
print("Final features used:", X.columns.tolist())


Preprocessed dataset with only numerical features saved successfully.
Final features used: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'Hour', 'DayOfWeek', 'Is_Night', 'Amount_to_Balance_Ratio', 'Amount_Deviation']
