In [1]:
# Make the project's source directory importable from this notebook.
# '../src' is a relative entry, so it is resolved against the kernel's current
# working directory when an import is attempted.
import sys

# Guard the append so re-running this cell does not stack duplicate entries.
if '../src' not in sys.path:
    sys.path.append('../src')

In [2]:
# 1. Load necessary packages
import pandas as pd
from feature_engineering import get_feature_engineering_pipeline

In [3]:
# 2. Load the input table into df.
# NOTE(review): despite the earlier "raw data" wording, this reads from
# data/processed. Presumably raw_plus_risk.csv is the raw transaction table
# with a risk label already attached — confirm with the step that writes it.
# (A previous version of this cell read '../data/raw/data.csv' instead.)
# The path is relative to the kernel's working directory.
df = pd.read_csv("../data/processed/raw_plus_risk.csv")


In [5]:
# 3. Settings handed to get_feature_engineering_pipeline.
# Key order is kept as-is in case a consumer iterates over the mapping.
config = dict(
    # Names of the columns that play a special role in the input frame.
    datetime_col='TransactionStartTime',
    customer_id_col='CustomerId',
    amount_col='Amount',
    # Feature groups: categorical columns, then numeric columns.
    cat_cols=['ProductCategory', 'ProviderId', 'ChannelId'],
    num_cols=['Amount', 'Value', 'PricingStrategy'],
    # Label column.
    target_col='is_high_risk',
    # Preprocessing choices. The original notes mention 'median' and 'minmax'
    # as alternatives — confirm the accepted values in feature_engineering.
    imputation_strategy='mean',
    scaling_method='standard',
)


In [6]:
# 4. Build the feature-engineering pipeline from the settings in `config`.
# The returned object is used below through fit_transform / transform and
# named_steps, i.e. it is treated like a scikit-learn Pipeline.
pipeline = get_feature_engineering_pipeline(config)




In [7]:
# 5. Fit the pipeline on df and transform it in the same call.
# No separate target is passed to fit_transform. config['target_col'] is
# 'is_high_risk', so presumably the pipeline reads the label from that column
# of df rather than from `FraudResult` — confirm in feature_engineering.
# NOTE(review): fitting uses the full frame. If a train/test split is made
# later, target-derived features (the output contains *_woe columns) may have
# seen the held-out labels — verify.
X_transformed = pipeline.fit_transform(df)

In [8]:
# Look up the output column labels for the fitted pipeline.
# NOTE(review): a later cell repeats this import and this call and overwrites
# column_names, so this cell appears to be redundant.
from feature_engineering import get_pipeline_feature_names
column_names = get_pipeline_feature_names(pipeline, config)



In [10]:
from feature_engineering import get_pipeline_feature_names

# Run the already-fitted pipeline over df again and fetch its column labels.
X_array = pipeline.transform(df)
column_names = get_pipeline_feature_names(pipeline, config)

# Label the matrix and carry the original customer identifier alongside the
# features. `.values` is attached by position, so the transformed rows are
# assumed to be in the same order (and number) as df.
X_df = pd.DataFrame(data=X_array, columns=column_names).assign(
    CustomerId=df['CustomerId'].values
)

# Persist the labelled features; the index is not written.
X_df.to_csv("../data/processed/X_transformed_named.csv", index=False)


In [11]:
# Take column labels from the 'preprocessor' step and wrap the fit_transform
# output in a DataFrame.
# NOTE(review): this assumes the preprocessor's output columns line up one to
# one with X_transformed (e.g. it is the last step) — confirm.
feature_names = (
    pipeline
    .named_steps['preprocessor']
    .get_feature_names_out()
)
X_transformed_df = pd.DataFrame(data=X_transformed, columns=feature_names)

In [12]:
# 6. Inspect the transformed output: dimensions first, then the first rows.
# X_transformed_df is already a DataFrame, so it is displayed directly; the
# former re-wrap in pd.DataFrame(...) was a no-op and has been dropped.
print(X_transformed_df.shape)
X_transformed_df.head()

(95662, 48)


Unnamed: 0,num__Amount,num__Value,num__PricingStrategy,cat__ProductCategory_airtime,cat__ProductCategory_data_bundles,cat__ProductCategory_financial_services,cat__ProductCategory_movies,cat__ProductCategory_other,cat__ProductCategory_ticket,cat__ProductCategory_transport,...,remainder__transaction_is_weekend,remainder__total_amount,remainder__avg_amount,remainder__transaction_count,remainder__amount_std,remainder__amount_min,remainder__amount_max,remainder__ProductCategory_woe,remainder__ProviderId_woe,remainder__ChannelId_woe
0,-0.046371,-0.072291,-0.349252,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,109921.75,923.712185,119,3042.294251,-5000.0,20000.0,-0.15115,-0.16638,0.021516
1,-0.054643,-0.080251,-0.349252,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,109921.75,923.712185,119,3042.294251,-5000.0,20000.0,0.113799,-0.061042,-0.054063
2,-0.050426,-0.076352,-0.349252,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1000.0,500.0,2,0.0,500.0,500.0,-0.15115,-0.16638,0.021516
3,0.107717,0.096648,-0.349252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,228727.2,6019.136842,38,17169.24161,-10000.0,100000.0,-0.253605,0.561762,0.021516
4,-0.059704,-0.075183,-0.349252,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,228727.2,6019.136842,38,17169.24161,-10000.0,100000.0,0.113799,-0.061042,-0.054063


In [13]:
from feature_engineering import save_pipeline, load_pipeline

# One location for the serialized pipeline, shared by the save and the load.
PIPELINE_FILE = '../data/processed/feature_pipeline.pkl'

# Persist the fitted pipeline, then read it straight back from the same file.
# NOTE(review): the .pkl suffix suggests pickle-style serialization — only
# load files from a trusted source. loaded_pipeline is not used in the
# cells visible here.
save_pipeline(pipeline, PIPELINE_FILE)
loaded_pipeline = load_pipeline(PIPELINE_FILE)


In [14]:
# Write the labelled feature table to disk and report where it went.
# The path lives in one variable so the confirmation message always names the
# file that was actually written. The previous message pointed at
# 'transformed_features_2.csv', which is not the file to_csv created.
output_path = '../data/processed/transformed_features2.csv'
X_transformed_df.to_csv(output_path, index=False)
print(f"✅ Transformed features saved to: {output_path}")
print(f"Shape: {X_transformed_df.shape}")


✅ Transformed features saved to: ../data/processed/transformed_features_2.csv
Shape: (95662, 48)
