In [1]:
# Add the parent of the current working directory to the import path so that
# the `src` package imported in the next cell can be resolved (presumably the
# project root when the notebook is run from its own folder -- confirm).
import sys, os
sys.path.append(os.path.abspath(".."))

In [2]:
import pandas as pd
from src.data_processing import process_data

In [7]:
# Load the raw transactions; the path is relative to the working directory.
df_raw = pd.read_csv("../Data/data.csv")
# Preview the first rows to sanity-check the columns that were read.
df_raw.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [2]:
class DateTimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Expand a timestamp column into hour/day/month/year feature columns.

    Parameters
    ----------
    datetime_col : str, default "TransactionStartTime"
        Name of the column that is parsed with ``pd.to_datetime`` and then
        removed from the output.
    """

    def __init__(self, datetime_col="TransactionStartTime"):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with the calendar parts added and the source column dropped.

        The input frame is not modified.
        """
        parsed = pd.to_datetime(X[self.datetime_col])
        parts = parsed.dt

        # `assign` builds a new frame, so the caller's data stays untouched.
        with_parts = X.assign(
            transaction_hour=parts.hour,
            transaction_day=parts.day,
            transaction_month=parts.month,
            transaction_year=parts.year,
        )

        return with_parts.drop(columns=[self.datetime_col])

In [3]:
class CustomerAggregator(BaseEstimator, TransformerMixin):
    """Aggregates transaction-level data to customer-level.

    Parameters
    ----------
    customer_id : str, default "CustomerId"
        Column to group by; it is returned as a regular column in the output.

    Notes
    -----
    ``transform`` reads the columns ``Amount``, ``transaction_hour``,
    ``transaction_day`` and ``transaction_month``. ``TransactionId`` is used
    for the transaction count when it is present, but is not required.
    """

    def __init__(self, customer_id="CustomerId"):
        self.customer_id = customer_id

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit, returns ``self``."""
        return self

    def transform(self, X):
        """Return one row per customer with amount and time-of-transaction statistics.

        Output columns: the customer id, ``total_amount``, ``avg_amount``,
        ``transaction_count``, ``std_amount`` and the per-customer means of
        ``transaction_hour``, ``transaction_day`` and ``transaction_month``.
        """
        # Prefer counting non-null TransactionId values (the original
        # behaviour). Callers may already have dropped that identifier
        # column, in which case fall back to the number of rows per customer
        # instead of raising a KeyError.
        if "TransactionId" in X.columns:
            count_spec = ("TransactionId", "count")
        else:
            count_spec = ("Amount", "size")

        agg_df = (
            X.groupby(self.customer_id)
            .agg(
                total_amount=("Amount", "sum"),
                avg_amount=("Amount", "mean"),
                transaction_count=count_spec,
                std_amount=("Amount", "std"),
                transaction_hour=("transaction_hour", "mean"),
                transaction_day=("transaction_day", "mean"),
                transaction_month=("transaction_month", "mean"),
            )
            .reset_index()
        )
        # The sample standard deviation is NaN for customers with a single
        # transaction; treat that as zero spread.
        agg_df["std_amount"] = agg_df["std_amount"].fillna(0)

        return agg_df


In [4]:
def build_feature_pipeline():
    """Build the preprocessing transformer for the customer-level features.

    Returns
    -------
    ColumnTransformer
        Applies median imputation followed by standard scaling to the
        aggregated numeric columns and drops every other column.
    """
    # Columns produced by the customer-level aggregation that get scaled.
    scaled_columns = [
        "total_amount",
        "avg_amount",
        "transaction_count",
        "std_amount",
        "transaction_hour",
        "transaction_day",
        "transaction_month",
    ]

    impute_step = ("imputer", SimpleImputer(strategy="median"))
    scale_step = ("scaler", StandardScaler())
    impute_then_scale = Pipeline(steps=[impute_step, scale_step])

    return ColumnTransformer(
        transformers=[("num", impute_then_scale, scaled_columns)],
        remainder="drop",
    )


In [5]:
def process_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Full feature engineering pipeline.

    Steps: drop unused identifier/label columns, expand the transaction
    timestamp into calendar features, aggregate to one row per customer, then
    impute and scale the aggregated numeric features.

    Note: this definition shadows the ``process_data`` imported from
    ``src.data_processing`` earlier in the notebook.

    Parameters
    ----------
    df : pd.DataFrame
        Raw transaction-level data. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per customer: the scaled feature columns followed by the
        customer id column.
    """
    # TransactionId is intentionally NOT dropped here: CustomerAggregator
    # counts it to compute `transaction_count`, and dropping it beforehand
    # makes the aggregation fail with a KeyError. It disappears naturally
    # once the data is aggregated to customer level.
    drop_cols = [
        "BatchId",
        "SubscriptionId",
        "AccountId",
        "ProductId",
        "FraudResult",
    ]
    # Non-mutating drop; `errors="ignore"` skips columns that are absent.
    df = df.drop(columns=drop_cols, errors="ignore")

    datetime_extractor = DateTimeFeaturesExtractor()
    df = datetime_extractor.fit_transform(df)

    aggregator = CustomerAggregator()
    df_agg = aggregator.fit_transform(df)

    pipeline = build_feature_pipeline()
    features = pipeline.fit_transform(df_agg)

    # Names are taken as reported by the ColumnTransformer (with its default
    # settings these carry the transformer-name prefix, e.g. "num__...").
    feature_names = pipeline.get_feature_names_out()

    processed_df = pd.DataFrame(features, columns=feature_names)
    # Re-attach the id positionally: `features` keeps the row order of df_agg.
    id_col = aggregator.customer_id
    processed_df[id_col] = df_agg[id_col].values

    return processed_df

In [9]:
# Run the full feature-engineering pipeline on the raw transactions.
processed_df = process_data(df_raw)
# Preview the customer-level feature table.
processed_df.head()

Unnamed: 0,total_amount,avg_amount,transaction_count,std_amount,avg_transaction_hour,avg_transaction_day,avg_transaction_month,CustomerId
0,-0.066891,-0.153364,-0.253459,-0.140432,0.883284,0.770545,1.038381,CustomerId_1
1,-0.066891,-0.153364,-0.253459,-0.140432,0.883284,0.770545,1.038381,CustomerId_10
2,-0.055849,-0.06987,-0.212186,-0.072731,-1.222654,0.050136,1.038381,CustomerId_1001
3,-0.061655,-0.091435,-0.150278,-0.134647,0.229556,-0.133241,0.523448,CustomerId_1002
4,-0.055849,-0.073846,-0.201868,-0.078186,0.455248,-2.111093,-0.922326,CustomerId_1003


In [10]:
# Persist the engineered features; index=False keeps the row index out of the file.
processed_df.to_csv("../Data/processed_features.csv", index=False)