In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [5]:
# Read the transaction-level table from the processed-data folder
df = pd.read_csv("../data/processed/eda_data.csv")

# Feature engineering: collapse the transactions into one row of summary
# statistics per customer. Mixing list and string specs in the dict yields
# two-level column labels of the form (source column, statistic).
agg_df = (
    df.groupby('CustomerId')
    .agg({
        'Amount': ['sum', 'mean', 'std', 'max', 'min', 'count'],
        'Value': ['sum', 'mean', 'std', 'max', 'min'],
        'TransactionHour': 'nunique',
        'TransactionDay': 'nunique',
        'TransactionMonth': 'nunique',
    })
    .reset_index()
)

# Flatten the two-level labels into "<column>_<statistic>" names; the first
# column is the restored group key and keeps its plain name.
agg_df.columns = [
    'CustomerId',
    *(f"{source}_{stat}".strip() for source, stat in agg_df.columns[1:]),
]

# Keep the identifiers on the side so rows can be matched back to customers
customer_ids = agg_df['CustomerId']

# Names of the int64/float64 feature columns (the identifier is excluded)
num_features = [
    name
    for name, dtype in agg_df.dtypes.items()
    if name != 'CustomerId' and dtype in ['int64', 'float64']
]

# Feature matrix: every aggregated column except the identifier
X = agg_df.drop('CustomerId', axis=1)
# Placeholder target: one zero per customer row
y = np.zeros(len(X))

print("X columns:", list(X.columns))

# Median-impute missing values, then standardise each feature.
# A customer with a single transaction has NaN for the `std` aggregates
# (pandas uses the sample standard deviation), so the imputer fills those.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Fit the pipeline on X and transform it in one pass
X_scaled = numeric_pipeline.fit_transform(X)

print("Transformed X shape:", X_scaled.shape)

print("Feature extraction complete. Processed data ready for modeling.")

X columns: ['Amount_sum', 'Amount_mean', 'Amount_std', 'Amount_max', 'Amount_min', 'Amount_count', 'Value_sum', 'Value_mean', 'Value_std', 'Value_max', 'Value_min', 'TransactionHour_nunique', 'TransactionDay_nunique', 'TransactionMonth_nunique']
Transformed X shape: (3742, 14)
Feature extraction complete. Processed data ready for modeling.
