### Feature Engineering and Data Transformation

####  Import libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample


### Load Cleaned Data

In [23]:
# Read the cleaned fraud transactions CSV into a DataFrame.
cleaned_csv_path = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed\fraud_cleaned.csv"
fraud_df = pd.read_csv(cleaned_csv_path)

In [24]:
# Parse both timestamp columns so datetime arithmetic and the .dt accessor work.
for ts_col in ('signup_time', 'purchase_time'):
    fraud_df[ts_col] = pd.to_datetime(fraud_df[ts_col])

# Derive time-based features from the parsed timestamps.
purchase_ts = fraud_df['purchase_time']
signup_to_purchase = purchase_ts - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600  # seconds -> hours
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek  # Monday=0

In [25]:
# Persist the frame, now including the engineered time features.
# NOTE: this writes to the same path the data was loaded from, so the cleaned
# CSV on disk is overwritten with the extra columns.
cleaned_csv_out = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed\fraud_cleaned.csv"
fraud_df.to_csv(cleaned_csv_out, index=False)


In [26]:
# Quick look at the first rows of the newly created time features.
time_feature_cols = ['time_since_signup', 'hour_of_day', 'day_of_week']
fraud_df[time_feature_cols].head()

Unnamed: 0,time_since_signup,hour_of_day,day_of_week
0,1251.856111,2,5
1,4.984444,1,0
2,0.000278,18,3
3,136.690278,13,0
4,1211.516944,18,2


### Separate features and target

In [27]:
# The label column, plus identifier / raw-timestamp / IP columns that are
# excluded from the model inputs.
target_col = 'class'
excluded_cols = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address', 'ip_int']

y = fraud_df[target_col]
X = fraud_df.drop(columns=[target_col, *excluded_cols])

### Train/Test Split

In [28]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

### Scaling Numerical Features

In [29]:
# Columns that are standardised (zero mean, unit variance).
numerical_features = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day', 'day_of_week']

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

X_train_num = scaler.transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])


### Encoding Categorical Features

In [30]:
# Categorical columns to one-hot encode. Defined here, before first use, so the
# notebook runs top-to-bottom on a fresh kernel (OneHotEncoder itself is
# imported in the imports cell at the top of the notebook).
categorical_features = ['source', 'browser', 'sex', 'country']

# `sparse_output` is the scikit-learn >= 1.2 name of the older `sparse` argument;
# a dense array is requested so the result can be stacked with the scaled numerics.
# handle_unknown='ignore' encodes categories not seen during fit as all-zero
# columns instead of raising at transform time.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only, then apply the learned categories to the test split.
X_train_cat = ohe.fit_transform(X_train[categorical_features])
X_test_cat = ohe.transform(X_test[categorical_features])


In [31]:
# Column-wise concatenation: scaled numerical features first, then the one-hot
# encoded categorical features.
X_train_final = np.hstack([X_train_num, X_train_cat])
X_test_final = np.hstack([X_test_num, X_test_cat])

# Targets as plain NumPy arrays. np.asarray accepts both a pandas Series and an
# ndarray, so re-running this cell keeps working; `.values` would raise
# AttributeError once the names already hold arrays.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

### Handle class imbalance using undersampling

In [32]:
# Append the target as the last column so features and labels are resampled together.
train_data = np.hstack([X_train_final, y_train.reshape(-1, 1)])

# Separate majority (label 0) and minority (label 1) rows.
majority = train_data[train_data[:, -1] == 0]
minority = train_data[train_data[:, -1] == 1]

# Downsample the majority class to the size of the minority class.
majority_downsampled = resample(majority,
                                replace=False,  # sample without replacement
                                n_samples=len(minority),  # match minority class
                                random_state=42)

# Combine minority and downsampled majority.
train_resampled = np.vstack([majority_downsampled, minority])

# Shuffle rows in place with a seeded generator so the row order (and therefore
# the saved CSV) is identical on every run; the global np.random state is unseeded.
np.random.default_rng(42).shuffle(train_resampled)

### Split back into features and target

In [33]:
# The last column holds the label; everything before it is the feature matrix.
X_train_final_resampled, y_train_resampled = train_resampled[:, :-1], train_resampled[:, -1]

# Compare per-class row counts before and after undersampling.
original_counts = np.bincount(y_train.astype(int))
resampled_counts = np.bincount(y_train_resampled.astype(int))
print("Original class distribution:", original_counts)
print("Resampled class distribution:", resampled_counts)

Original class distribution: [95872  9906]
Resampled class distribution: [9906 9906]


### Save undersampled dataset

In [34]:
# Column names for categorical features. Must list the same columns, in the
# same order, that `ohe` was fitted on.
categorical_features = ['source', 'browser', 'sex', 'country']  # make sure these match your dataset
cat_cols = ohe.get_feature_names_out(categorical_features)

# Numerical columns come first, then the one-hot columns, matching the order
# in which the final feature matrices were stacked.
feature_columns = [*numerical_features, *cat_cols]

train_csv_path = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed\fraud_train_undersampled.csv"
test_csv_path = r"C:\Users\Administrator\Downloads\Week 5and6\Improved_detection_of_fraud_cases\data\processed\fraud_test.csv"

# Undersampled training data. The resampled matrix is already laid out as
# numerical-then-categorical, so it is used directly.
X_train_undersampled_df = pd.DataFrame(X_train_final_resampled, columns=feature_columns)
# The labels travelled through a float array during resampling (they were
# stacked next to the features); cast back to int so the `class` column is
# written as 0/1, consistent with the test file.
X_train_undersampled_df['class'] = y_train_resampled.astype(int)
X_train_undersampled_df.to_csv(train_csv_path, index=False)

# Test set: same preprocessing, original (un-resampled) class distribution.
X_test_df = pd.DataFrame(X_test_final, columns=feature_columns)
X_test_df['class'] = y_test
X_test_df.to_csv(test_csv_path, index=False)

print("Undersampled train and test datasets saved successfully.")


Undersampled train and test datasets saved successfully.


### Justification for Using Undersampling:

- The original dataset is heavily imbalanced (Class 0 >> Class 1).
- Undersampling reduces the number of majority class samples to match the minority class (here 95,872 → 9,906 training rows), at the cost of discarding most majority-class examples.
- This prevents the model from being biased toward the majority class.
- Alternative methods like SMOTE can create synthetic samples, but:
    - SMOTE may introduce noise in high-dimensional data.
    - Undersampling is simpler and ensures all training data points are real.
- Therefore, undersampling is chosen for this task to balance the classes in the training dataset only; the test set keeps its original class distribution.

