# Feature Engineering

In [1]:
# Load the processed fraud dataset
import pandas as pd

source_csv = '../data/processed/fraud_with_country.csv'
fraud_df = pd.read_csv(source_csv)


In [3]:
# Preview the first five rows of the loaded frame
fraud_df.head(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0,Unknown
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,United States


🔹 1. Extract hour_of_day & day_of_week

In [4]:
# Parse purchase_time strings ("YYYY-MM-DD HH:MM:SS") into datetime values
parsed_purchase = pd.to_datetime(
    fraud_df['purchase_time'],
    format='%Y-%m-%d %H:%M:%S',
)
fraud_df['purchase_time'] = parsed_purchase

In [7]:
# Parse signup_time strings ("YYYY-MM-DD HH:MM:SS") into datetime values
parsed_signup = pd.to_datetime(
    fraud_df['signup_time'],
    format='%Y-%m-%d %H:%M:%S',
)
fraud_df['signup_time'] = parsed_signup

In [5]:
# Derive calendar features from the (already parsed) purchase timestamp
purchase_dt = fraud_df['purchase_time'].dt
fraud_df['hour_of_day'] = purchase_dt.hour
fraud_df['day_of_week'] = purchase_dt.dayofweek  # Monday = 0

🔹 2. Calculate time_since_signup

In [8]:
# Elapsed time between signup and purchase, expressed in seconds
signup_to_purchase = fraud_df['purchase_time'] - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()

🔹 3. Transaction count per user

In [9]:
# Number of rows (transactions) recorded for each user_id, broadcast back to every row
rows_per_user = fraud_df.groupby('user_id')['user_id'].transform('count')
fraud_df['user_txn_count'] = rows_per_user

🔹 4. Unique devices per user

In [10]:
# Number of distinct device_id values seen for each user_id, broadcast back to
# every row of that user.
# A groupby-transform is used instead of groupby + merge so the cell can be
# re-executed safely: a second merge would collide with the existing column and
# produce `user_device_count_x` / `user_device_count_y`. It also mirrors how
# `user_txn_count` is computed.
fraud_df['user_device_count'] = (
    fraud_df.groupby('user_id')['device_id'].transform('nunique')
)

💾 Save new version

In [11]:
# Persist the engineered feature table (row index is not written)
features_csv = '../data/processed/fraud_features.csv'
fraud_df.to_csv(features_csv, index=False)

In [12]:
# Preview the first five rows, now including the engineered columns
fraud_df.head(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,hour_of_day,day_of_week,time_since_signup,user_txn_count,user_device_count
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0,Japan,2,5,4506682.0,1,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0,United States,1,0,17944.0,1,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,United States,18,3,1.0,1,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0,Unknown,13,0,492085.0,1,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,United States,18,2,4361461.0,1,1


### Data Transformation

In [1]:
# 📌 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

In [2]:
# 📌 2. Load data
# Read back the engineered feature table written by the feature-engineering step
features_csv = '../data/processed/fraud_features.csv'
fraud_df = pd.read_csv(features_csv)

In [3]:
# 📌 3. Separate features and target
# `class` is the label column; everything else stays in X for now
target_col = 'class'
y = fraud_df[target_col]
X = fraud_df.drop(columns=[target_col])

In [32]:
# 📌 4. Train-test split (Important: stratify by class)
# 80/20 split; stratifying on y keeps the class ratio the same in both parts,
# and the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [33]:
# Check class balance before SMOTE (relative frequency of each label in y_train)
train_class_share = y_train.value_counts(normalize=True)
print("Before SMOTE:", train_class_share)

Before SMOTE: class
0    0.906352
1    0.093648
Name: proportion, dtype: float64


In [None]:
# 📌 5. Apply SMOTE to training data
# SMOTENC (SMOTE for Nominal and Continuous features) is used instead of plain
# SMOTE. The previous approach oversampled the numeric columns only and then
# attached categorical values drawn at random from the whole training set, so
# each synthetic row combined minority-class numerics with categoricals that
# were unrelated to them. SMOTENC generates numeric and categorical values
# together from the same minority-class neighbours.
from imblearn.over_sampling import SMOTENC

# Separate columns
categorical_cols = ['source', 'browser', 'sex', 'country', 'hour_of_day', 'day_of_week']
numeric_cols = ['purchase_value', 'age', 'time_since_signup', 'user_txn_count', 'user_device_count']
feature_cols = numeric_cols + categorical_cols

# Reset indices to avoid mismatch
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Only the modelling features are resampled (numeric first, then categorical,
# which is the column order the later scaling/encoding cells work with).
X_train_features = X_train[feature_cols]

# SMOTENC is told which columns are categorical via their positional indices.
categorical_idx = [feature_cols.index(col) for col in categorical_cols]

smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_features, y_train)

# Features and labels must stay row-aligned after oversampling.
assert len(X_train_resampled) == len(y_train_resampled)


In [37]:
# 📌 6. Scale numeric columns
# The scaler is fitted on the (resampled) training data only and then applied
# to the test data, so no test statistics leak into the fit.
# NOTE: this cell overwrites the numeric columns in place; re-running it without
# re-running the split/SMOTE cells first would scale already-scaled values.
numeric_cols = ['purchase_value', 'age', 'time_since_signup', 'user_txn_count', 'user_device_count']
scaler = StandardScaler()

# X_test comes from train_test_split's row selection; take an explicit copy so
# the column assignment below does not trigger pandas' SettingWithCopyWarning.
X_test = X_test.copy()

X_train_resampled[numeric_cols] = scaler.fit_transform(X_train_resampled[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [38]:
# 📌 7. Encode categorical features
categorical_cols = ['source', 'browser', 'sex', 'country', 'hour_of_day', 'day_of_week']

# Dense one-hot output, first level of each feature dropped; categories that
# were not seen during fit are ignored at transform time rather than raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Learn the category levels from the training data only ...
encoder.fit(X_train_resampled[categorical_cols])

# ... then apply the same encoding to both train and test
encoded_train = encoder.transform(X_train_resampled[categorical_cols])
encoded_test = encoder.transform(X_test[categorical_cols])



In [39]:
# 📌 8. Combine encoded features with numeric ones
# Both frames are built from exactly `numeric_cols` + the one-hot columns.
# Selecting the numeric columns explicitly (rather than dropping the categorical
# ones) matters for the test set: X_test still carries every non-target column
# of the source table (e.g. user_id, signup_time, device_id, ip_address), and
# dropping only the categoricals would leave those in X_test_final, giving it a
# different schema from X_train_final.
encoded_col_names = encoder.get_feature_names_out(categorical_cols)

X_train_final = pd.concat([
    X_train_resampled[numeric_cols].reset_index(drop=True),
    pd.DataFrame(encoded_train, columns=encoded_col_names)
], axis=1)

X_test_final = pd.concat([
    X_test[numeric_cols].reset_index(drop=True),
    pd.DataFrame(encoded_test, columns=encoded_col_names)
], axis=1)

# Train and test must expose identical columns in identical order.
assert list(X_train_final.columns) == list(X_test_final.columns), \
    "train/test feature columns differ"

In [40]:
# 📌 9. Save final datasets
# Each table is written without its row index.
csv_outputs = [
    (X_train_final, '../data/processed/X_train_final.csv'),
    (X_test_final, '../data/processed/X_test_final.csv'),
    (y_train_resampled, '../data/processed/y_train_final.csv'),
    (y_test, '../data/processed/y_test.csv'),
]
for table, csv_path in csv_outputs:
    table.to_csv(csv_path, index=False)

# Optional: Save the scaler and encoder for reuse
fitted_outputs = [
    (scaler, '../data/processed/scaler.pkl'),
    (encoder, '../data/processed/encoder.pkl'),
]
for fitted_obj, pkl_path in fitted_outputs:
    joblib.dump(fitted_obj, pkl_path)

print("✅ Feature engineering and data transformation complete!")


✅ Feature engineering and data transformation complete!
