# Task 1: Feature Engineering and Transformation
This notebook demonstrates feature engineering, feature scaling and encoding, and the handling of class imbalance.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
# Append the absolute path of the parent of the current working directory to the import path.
# NOTE(review): presumably this is the project root, so that the `src` package imported
# below resolves when the notebook is run from its own folder — confirm.
sys.path.append(os.path.abspath('..'))
# Project-local preprocessing helpers used by the cells below.
from src.data_preprocessing import load_data, clean_data, feature_engineer_fraud, transform_data, handle_imbalance

In [None]:
# Raw input files, given relative to the notebook's working directory.
fraud_path, ip_path, credit_path = (
    '../data/raw/Fraud_Data.csv',
    '../data/raw/IpAddress_to_Country.csv',
    '../data/raw/creditcard.csv',
)

# Read all three raw tables in a single call.
fraud_df, ip_df, credit_df = load_data(fraud_path, ip_path, credit_path)

# Run `clean_data` on the fraud table, then `feature_engineer_fraud` on the
# result together with the table loaded from `ip_path`.
fraud_df = feature_engineer_fraud(clean_data(fraud_df), ip_df)

## 1. Feature Engineering Distribution
Check the distributions of the newly engineered features, such as `device_id_count` and `ip_address_count`.

In [None]:
# The section text names both engineered count features, so plot each one in
# its own panel instead of only `device_id_count`.
count_feature_titles = {
    'device_id_count': 'Distribution of Device ID Counts',
    'ip_address_count': 'Distribution of IP Address Counts',
}

# squeeze=False always yields a 2-D array of axes, so the loop below works
# unchanged if features are added to or removed from the mapping above.
fig, axes = plt.subplots(1, len(count_feature_titles), figsize=(16, 6), squeeze=False)
for ax, (column, title) in zip(axes.ravel(), count_feature_titles.items()):
    sns.histplot(fraud_df[column], bins=20, ax=ax)
    ax.set_title(title)
fig.tight_layout()
plt.show()

## 2. Data Transformation
Scaling numerical and encoding categorical features.

In [None]:
# Columns handed to `transform_data` as the numerical feature set.
num_cols = [
    'purchase_value',
    'age',
    'hour_of_day',
    'time_since_signup',
    'device_id_count',
    'ip_address_count',
]
# Columns handed to `transform_data` as the categorical feature set.
cat_cols = [
    'source',
    'browser',
    'sex',
]

# `transform_data` returns a pair: the transformed frame and a second object
# kept here as `preprocessor`.
transformed_df, preprocessor = transform_data(fraud_df, cat_cols, num_cols)
print(f"Transformed shape: {transformed_df.shape}")

# Last expression of the cell: rendered as a preview of the first rows.
transformed_df.head()

## 3. Handling Class Imbalance
Comparing class distribution before and after SMOTE.

In [None]:
# Columns excluded from the feature matrix: identifiers, raw timestamps,
# country, and the target itself.
non_feature_cols = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address', 'class', 'country']

# errors='ignore' keeps this working whether or not `transform_data` retained
# these columns in its output.
X = transformed_df.drop(columns=non_feature_cols, errors='ignore')

y = fraud_df['class']

# X and y are taken from different frames; fail early with a clear message if
# their rows no longer line up one-to-one.
assert len(X) == len(y), f"Feature/target row mismatch: {len(X)} vs {len(y)}"

print("Before SMOTE:")
print(y.value_counts())

# NOTE(review): resampling the whole dataset is fine for illustrating the
# class balance, but for model evaluation only the training split should be
# resampled — confirm how X_res / y_res are used downstream.
X_res, y_res = handle_imbalance(X, y, strategy='smote')

print("\nAfter SMOTE:")
print(pd.Series(y_res).value_counts())