### Feature Engineering and Data Transformation

#### Fraud_Data.csv and Creditcard.csv

#### Import libraries

In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample

import os


## PART A — FRAUD DATASET

### Load fraud Data

In [37]:
# Cleaned fraud dataset. NOTE(review): the path is absolute and machine-specific.
fraud_path = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed\fraud_cleaned.csv"
fraud_df = pd.read_csv(fraud_path)

# Convert both timestamp columns to datetime so that timestamp arithmetic and
# the .dt accessor can be used on them in later cells.
for timestamp_col in ('signup_time', 'purchase_time'):
    fraud_df[timestamp_col] = pd.to_datetime(fraud_df[timestamp_col])

# Preview the first rows.
fraud_df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,country,time_since_signup,hour_of_day,day_of_week,user_transaction_count,time_diff_hours,avg_time_between_tx
0,2,2015-01-11 03:47:13,2015-02-21 10:03:37,54,FGBQNDNBETFJJ,SEO,Chrome,F,25,880217500.0,0,,Unknown,990.273333,10,5,1,,
1,4,2015-06-02 16:40:57,2015-09-26 21:32:16,41,MKFUIVOHLJBYN,Direct,Safari,F,38,2785906000.0,0,,Unknown,2788.855278,21,5,1,,
2,8,2015-05-28 07:53:06,2015-08-13 11:53:07,47,SCQGQALXBUQZJ,SEO,Chrome,M,25,356056700.0,0,,Unknown,1852.000278,11,3,1,,
3,9,2015-05-16 15:58:32,2015-05-20 23:06:42,62,IEZOHXPZBIRTE,SEO,FireFox,M,21,759104700.0,0,,Unknown,103.136111,23,2,1,,
4,12,2015-01-10 06:25:12,2015-03-04 20:56:37,35,MSNWCFEHKTIOY,Ads,Safari,M,19,2985180000.0,0,,Unknown,1286.523611,20,2,1,,


### Feature Engineering (Fraud)

#### Time-based features

In [38]:
# Derive three time-based features from the purchase and signup timestamps.
purchase_ts = fraud_df['purchase_time']
signup_to_purchase = purchase_ts - fraud_df['signup_time']

# Elapsed time between signup and purchase, expressed in hours.
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600

# Hour of the purchase (0-23) and weekday of the purchase (Monday=0 ... Sunday=6).
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek


#### Separate Features & Target

In [39]:
# Columns excluded from the feature matrix: the target itself, identifiers,
# the raw timestamps and the raw IP representations.
non_feature_columns = [
    'class',
    'user_id',
    'device_id',
    'signup_time',
    'purchase_time',
    'ip_address',
    'ip_int',
]

# Target vector (the `class` column) and feature matrix (everything not excluded).
y_fraud = fraud_df['class']
X_fraud = fraud_df.drop(columns=non_feature_columns)


#### Train/Test Split (Fraud)

In [40]:
# 70/30 split, stratified on the target so that both partitions keep the
# original class ratio; the fixed random_state makes the split repeatable.
fraud_split = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.3,
    random_state=42,
    stratify=y_fraud,
)
Xf_train, Xf_test, yf_train, yf_test = fraud_split


#### Scale Numerical Features (Fraud)

In [41]:
# Numeric columns that are standardised for the fraud dataset.
numerical_features_fraud = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'day_of_week',
]

# Learn mean/variance from the training rows only, then apply the same
# transformation to both partitions so no test statistics reach the scaler.
scaler_fraud = StandardScaler().fit(Xf_train[numerical_features_fraud])

Xf_train_num = scaler_fraud.transform(Xf_train[numerical_features_fraud])
Xf_test_num = scaler_fraud.transform(Xf_test[numerical_features_fraud])


#### Encode Categorical Features (Fraud)

In [42]:
# Categorical columns that are one-hot encoded for the fraud dataset.
categorical_features_fraud = ['source', 'browser', 'sex', 'country']

# Dense output so the result can be stacked with the scaled numeric block;
# categories that appear only in the test split are encoded as all zeros
# instead of raising.
ohe_fraud = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

train_categoricals = Xf_train[categorical_features_fraud]
test_categoricals = Xf_test[categorical_features_fraud]

# The category vocabulary is learned from the training rows only.
Xf_train_cat = ohe_fraud.fit_transform(train_categoricals)
Xf_test_cat = ohe_fraud.transform(test_categoricals)


### Combine Final Fraud Features

In [43]:
# Final design matrices: scaled numeric columns followed by the one-hot columns.
Xf_train_final = np.hstack([Xf_train_num, Xf_train_cat])
Xf_test_final = np.hstack([Xf_test_num, Xf_test_cat])

# Convert the label Series to plain NumPy arrays.
# np.asarray is used instead of `.values` so that this cell can be re-run
# safely: on a second run the names already hold ndarrays, which have no
# `.values` attribute, and np.asarray simply returns them unchanged.
yf_train = np.asarray(yf_train)
yf_test = np.asarray(yf_test)


#### Handle Class Imbalance (Fraud – Undersampling)

In [44]:
# Random undersampling of the majority class (training data only).
#
# Features and labels are stacked into one array so that every row keeps its
# own label while rows are filtered, sampled and shuffled. np.asarray makes
# this work whether yf_train is a pandas Series or already an ndarray.
train_data = np.hstack([Xf_train_final, np.asarray(yf_train).reshape(-1, 1)])

# The last column holds the label: 0 = majority class, 1 = minority class.
majority = train_data[train_data[:, -1] == 0]
minority = train_data[train_data[:, -1] == 1]

# Draw, without replacement, as many majority rows as there are minority rows.
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=len(minority),
    random_state=42
)

fraud_train_resampled = np.vstack([majority_downsampled, minority])

# Shuffle rows in place with a dedicated seeded generator. The global
# np.random state is unseeded in this notebook, so using it would make the
# row order differ on every run even though all other steps are seeded.
np.random.default_rng(42).shuffle(fraud_train_resampled)

Xf_train_resampled = fraud_train_resampled[:, :-1]
# Stacking with the float feature matrix turned the labels into floats;
# cast them back to integers so they have the same dtype as the test labels.
yf_train_resampled = fraud_train_resampled[:, -1].astype(int)


### Class distribution

In [45]:
# Per-class row counts of the training labels, before and after undersampling.
counts_before = np.bincount(yf_train.astype(int))
counts_after = np.bincount(yf_train_resampled.astype(int))

print("Before resampling:", counts_before)
print("After resampling:", counts_after)


Before resampling: [95872  9906]
After resampling: [9906 9906]


#### Save Fraud Feature-Engineered Data

In [48]:
# Make sure the output directory exists before writing.
os.makedirs(r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed", exist_ok=True)

# Wrap the feature matrices in DataFrames and append the label as a `class`
# column. No column names are passed, so the feature columns are labelled with
# their positional integer index (0, 1, 2, ...).
fraud_test_df = pd.DataFrame(Xf_test_final).assign(**{'class': yf_test})
fraud_train_df = pd.DataFrame(Xf_train_resampled).assign(**{'class': yf_train_resampled})

# Training file: the undersampled training data. Test file: the untouched test split.
fraud_train_df.to_csv(r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed/fraud_train_fe.csv", index=False)
fraud_test_df.to_csv(r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed/fraud_test_fe.csv", index=False)


## PART B — CREDIT CARD DATASET

### Load Credit Card Data

In [49]:
# Raw credit card dataset. NOTE(review): the path is absolute and machine-specific.
cc_path = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\raw\creditcard.csv"
cc_df = pd.read_csv(filepath_or_buffer=cc_path)

# Preview the first rows.
cc_df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Feature Engineering (Credit Card)

#### Time-based feature

In [50]:
# Convert `Time` to hours (divide by 3600) and wrap it onto a 24-hour cycle.
# The result is fractional, not truncated to a whole hour.
# NOTE(review): assumes `Time` is measured in seconds - confirm against the dataset description.
cc_df['hour'] = cc_df['Time'].div(3600).mod(24)


### Separate Features & Target

In [51]:
# Target vector is the `Class` column; every other column is used as a feature.
y_cc = cc_df['Class']
X_cc = cc_df.drop(columns='Class')


### Train/Test Split (Credit Card)

In [52]:
# 70/30 split, stratified on the target so that both partitions keep the
# original class ratio; the fixed random_state makes the split repeatable.
cc_split = train_test_split(
    X_cc,
    y_cc,
    test_size=0.3,
    random_state=42,
    stratify=y_cc,
)
Xc_train, Xc_test, yc_train, yc_test = cc_split


### Scale Features (Credit Card)

In [53]:
# Standardise every feature column. The scaler statistics come from the
# training rows only and are then reused unchanged for the test rows.
scaler_cc = StandardScaler().fit(Xc_train)

Xc_train_scaled = scaler_cc.transform(Xc_train)
Xc_test_scaled = scaler_cc.transform(Xc_test)


### Check Class Distribution (Before)

In [54]:
# Per-class row counts in the training labels before any resampling.
train_class_counts = np.bincount(yc_train.astype(int))

print("Original class distribution (training set):")
print(train_class_counts)


Original class distribution (training set):
[199020    344]


### Apply Undersampling (Training Data Only)

In [56]:
# Random undersampling of the majority class (training data only).

# Combine X and y for resampling so that every row keeps its own label while
# rows are filtered, sampled and shuffled.
cc_train_data = np.hstack([
    Xc_train_scaled,
    np.asarray(yc_train).reshape(-1, 1)
])

# Separate majority and minority classes (the last column holds the label)
cc_majority = cc_train_data[cc_train_data[:, -1] == 0]
cc_minority = cc_train_data[cc_train_data[:, -1] == 1]

# Downsample majority class: draw, without replacement, as many majority rows
# as there are minority rows
cc_majority_downsampled = resample(
    cc_majority,
    replace=False,
    n_samples=len(cc_minority),
    random_state=42
)

# Combine and shuffle. A dedicated seeded generator is used because the global
# np.random state is unseeded in this notebook, which would make the row order
# differ on every run even though all other steps are seeded.
cc_train_resampled = np.vstack([cc_majority_downsampled, cc_minority])
np.random.default_rng(42).shuffle(cc_train_resampled)

# Split features and target. Stacking with the float feature matrix turned the
# labels into floats; cast them back to integers so they have the same dtype
# as the test labels.
Xc_train_resampled = cc_train_resampled[:, :-1]
yc_train_resampled = cc_train_resampled[:, -1].astype(int)


### Class Distribution (After)

In [57]:
# Per-class row counts in the training labels after undersampling.
resampled_class_counts = np.bincount(yc_train_resampled.astype(int))

print("Resampled class distribution:")
print(resampled_class_counts)


Resampled class distribution:
[344 344]


### Save Feature-Engineered Credit Card Data

In [59]:
# Rebuild labelled DataFrames from the NumPy arrays, reusing the feature column
# names of the pre-scaling frames, and append the label as a `Class` column.
cc_test_fe = pd.DataFrame(Xc_test_scaled, columns=Xc_test.columns).assign(
    Class=yc_test.values
)
cc_train_fe = pd.DataFrame(Xc_train_resampled, columns=Xc_train.columns).assign(
    Class=yc_train_resampled
)

# Training file: the undersampled training data. Test file: the untouched test split.
cc_train_fe.to_csv(r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed/creditcard_train_fe.csv", index=False)
cc_test_fe.to_csv(r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed/creditcard_test_fe.csv", index=False)


### Justification for Using Undersampling:

- The original dataset is heavily imbalanced (Class 0 >> Class 1).
- Undersampling reduces the number of majority class samples to match the minority class.
- This prevents the model from being biased toward the majority class.
- Alternative methods like SMOTE can create synthetic samples, but:
    - SMOTE may introduce noise in high-dimensional data.
    - Undersampling is simpler and ensures all training data points are real.
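- Therefore, undersampling is chosen for this task. It is applied to the training dataset only; the test set keeps its original class distribution so that evaluation reflects the real-world imbalance.
- Trade-off: undersampling discards most majority-class samples (the credit card training set shrinks from 199,364 to 688 rows), so some information about legitimate transactions is lost.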

