In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and data-quality
# warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Feature Selection

Important features for our objective:

### Essential Features:
1. **Transaction_Amount** - Fraud often involves unusual amounts
2. **Account_Balance** - Ratio of transaction to balance might be important
3. **Device_Type** - Certain devices may be more risky
4. **Location** - Unusual locations may indicate fraud
5. **Merchant_Category** - Some categories are higher risk
6. **IP_Address_Flag** - Direct indicator of suspicious activity
7. **Daily_Transaction_Count** - Unusually high activity may indicate fraud
8. **Avg_Transaction_Amount_7d** - Deviation from normal behavior
9. **Failed_Transaction_Count_7d** - Multiple failures may indicate fraud attempts
10. **Transaction_Distance** - Distance from user's usual location
11. **Authentication_Method** - Some methods are more secure
12. **Risk_Score** - Pre-calculated risk metric
13. **Is_Weekend** - Fraud patterns may differ on weekends

### Features to potentially drop:
- **Transaction_ID** - Just an identifier
- **User_ID** - May not be useful unless doing user-specific analysis
- **Timestamp** - Could extract features from it (hour, day of week)
- **Card_Type** - May not be strongly correlated with fraud
- **Card_Age** - Less likely to be predictive

### Load the data and extract time-based and derived features

In [2]:
# Read the synthetic fraud dataset CSV into `df`; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv('../assets/synthetic_fraud_dataset.csv')

In [3]:
# Parse the timestamp column so the `.dt` accessor can be used below.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Calendar features: hour of day and day of week (pandas convention: Monday=0 ... Sunday=6).
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek

# Night flag: 1 when the hour is 22 or later, or 6 or earlier; otherwise 0.
df['Is_Night'] = ((df['Hour'] >= 22) | (df['Hour'] <= 6)).astype(int)

# Transaction size relative to the account balance.
# A balance of exactly 0 is masked to NaN first, so the ratio becomes a missing value
# instead of +/-inf: StandardScaler rejects infinite input but tolerates NaN.
# Rows with a non-zero balance are computed exactly as before.
nonzero_balance = df['Account_Balance'].mask(df['Account_Balance'].eq(0))
df['Amount_to_Balance_Ratio'] = df['Transaction_Amount'] / nonzero_balance

# Signed difference between this transaction and the 7-day average amount column.
df['Amount_Deviation'] = df['Transaction_Amount'] - df['Avg_Transaction_Amount_7d']

# Binary flag for the merchant categories this analysis treats as higher risk.
high_risk_categories = ['Travel', 'Electronics', 'Clothing']
df['High_Risk_Category'] = df['Merchant_Category'].isin(high_risk_categories).astype(int)

In [4]:
# Display the frame to confirm the engineered columns were appended.
df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label,Hour,DayOfWeek,Is_Night,Amount_to_Balance_Ratio,Amount_Deviation,High_Risk_Category
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,Biometric,0.8494,0,0,19,0,0,0.000427,-397.84,1
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,Password,0.0959,0,1,4,2,1,0.000016,-477.57,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,Biometric,0.8400,0,1,15,1,0,0.018226,-21.05,0
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.20,Tablet,New York,Clothing,0,...,OTP,0.7935,0,1,0,3,1,0.003311,71.84,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,Password,0.3819,1,1,23,5,1,0.000339,-297.41,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,TXN_11284,USER_4796,45.05,Online,2023-01-29 18:38:00,76960.11,Mobile,Tokyo,Clothing,0,...,PIN,0.1493,1,0,18,6,0,0.000585,-343.95,1
49996,TXN_44732,USER_1171,126.15,POS,2023-05-09 08:55:00,28791.75,Mobile,Tokyo,Clothing,0,...,Biometric,0.3653,0,1,8,1,0,0.004381,-308.80,1
49997,TXN_38158,USER_2510,72.02,Online,2023-01-30 19:32:00,29916.41,Laptop,Mumbai,Clothing,0,...,Biometric,0.5195,0,0,19,0,0,0.002407,-297.13,1
49998,TXN_860,USER_2248,64.89,Bank Transfer,2023-03-09 19:47:00,67895.67,Mobile,Tokyo,Electronics,0,...,Biometric,0.7063,0,1,19,3,0,0.000956,-177.40,1


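### Drop columns - identifiers, card attributes, and all categorical ones (including the categorical features listed as essential above; `Merchant_Category` survives only through the derived `High_Risk_Category` flag)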

In [5]:
# Columns the feature-selection notes mark for removal (identifiers, raw timestamp, card info).
unneeded_columns = ['Transaction_ID', 'User_ID', 'Timestamp', 'Card_Type', 'Card_Age']

# Categorical columns, removed so that only numerical features remain.
categorical_columns = [
    'Transaction_Type',
    'Device_Type',
    'Location',
    'Merchant_Category',
    'Authentication_Method',
]

columns_to_drop = unneeded_columns + categorical_columns
df = df.drop(columns=columns_to_drop)

### Verify remaining columns

In [6]:
# Show the column names that survived the drop; the target column is still among them here.
print("Remaining numerical features:", df.columns.tolist(), sep="\n")

Remaining numerical features:
['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'Fraud_Label', 'Hour', 'DayOfWeek', 'Is_Night', 'Amount_to_Balance_Ratio', 'Amount_Deviation', 'High_Risk_Category']


### Prepare X and y

In [7]:
# Separate the target from the predictors; `df` itself is left untouched.
target_column = 'Fraud_Label'
y = df[target_column]
X = df.drop(columns=[target_column])

#### Since we're only using numerical features now, we just need StandardScaler

In [8]:
# StandardScaler with default settings: each column is centred on its mean and
# scaled to unit variance, using statistics learned when the scaler is fitted.
preprocessor = StandardScaler()

### Train-test split

In [9]:
# 80/20 hold-out split. `stratify=y` keeps the label proportions the same in both
# parts, and the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Scale the data

In [10]:
# Learn the scaling statistics from the training rows only, then apply that same
# fitted scaler to both splits so the test rows never influence the fit.
preprocessor.fit(X_train)
X_train_scaled = preprocessor.transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

### Convert back to DataFrames for better readability

In [11]:
# Wrap the scaled values in DataFrames again so the feature names are kept.
# The original row labels are passed as `index=`; without them the frames get a fresh
# 0..n-1 index, and any index-aligned operation with `y_train` / `y_test` (which still
# carry the shuffled original labels) would silently pair rows with the wrong targets.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

### Save the preprocessed dataset

In [12]:
# Re-attach the target to each scaled split. `.values` assigns the labels by position,
# so the result does not depend on how the scaled frames are indexed.
train_part = pd.DataFrame(X_train_scaled, columns=X.columns).assign(Fraud_Label=y_train.values)
test_part = pd.DataFrame(X_test_scaled, columns=X.columns).assign(Fraud_Label=y_test.values)

# Stack the training rows above the test rows. `ignore_index=True` gives the combined
# frame one unique 0..n-1 index instead of repeating the labels of both parts.
preprocessed_df = pd.concat([train_part, test_part], ignore_index=True)

# The index is not written, so the CSV holds only the feature columns plus Fraud_Label.
preprocessed_df.to_csv('../assets/preprocessed_fraud_dataset_numerical_only.csv', index=False)

print("\nPreprocessed dataset with only numerical features saved successfully.")
print("Final features used:", X.columns.tolist())


Preprocessed dataset with only numerical features saved successfully.
Final features used: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'Hour', 'DayOfWeek', 'Is_Night', 'Amount_to_Balance_Ratio', 'Amount_Deviation', 'High_Risk_Category']
