In [1]:
import numpy as np
import pandas as pd

## Load Data

In [3]:
train_data = pd.read_csv("../data/train_data.csv")
train_data.head()

Unnamed: 0,MonthlyHour,Type,Amount,SenderBalance,ReceiverBalance,isFraud
0,18,TRANSFER,185179.51,0.0,666160.16,0
1,185,DEBIT,18996.03,534.0,218943.42,0
2,306,PAYMENT,1249.91,101240.78,0.0,0
3,227,CASH_OUT,80870.17,0.0,98974.08,0
4,34,CASH_OUT,30917.39,30917.39,339926.42,1


In [53]:
X,y = train_data.drop("isFraud",axis=1),train_data.isFraud

## Clean Data

Data is already cleaned, but **Cleaning Pipeline** is necessary. 

In [5]:
from sklearn.impute import SimpleImputer

In [104]:
# Fill Monthly Hour and Type with Mode
mode_imputer = SimpleImputer(strategy="most_frequent")
#mode_imputer.fit(X[["MonthlyHour","Type"]].values.reshape((-1,1)))

In [105]:
# Fill Amount, Sender Balance and Receiver Balance with Mean
money_imputer = SimpleImputer(strategy="mean")
#money_imputer.fit(X[["Amount","SenderBalance","ReceiverBalance"]])

## Feature Engineering

In [153]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        eps = 1e-8
        if isinstance(X, pd.DataFrame):
            data = X.copy()
        else:
            data = pd.DataFrame(X)
            data.columns = ['MonthlyHour', 'Type', 'Amount', 'SenderBalance', 'ReceiverBalance']
        
        data['Hour'] = data['MonthlyHour'] % 24
        data['Day'] = (data['MonthlyHour'] // 24) % 30
        data['InWeekend'] = data['Day'].isin([5, 6, 12, 13, 19, 20, 26, 27])
        data['AmountCategory'] = pd.cut(data['Amount'], bins=[-1, 100, 1000, 10_000, 100_000, np.inf], labels=['Small', 'Medium', 'Large', 'Very Large', 'Extreme Large'])
        data['IsHighAmount'] = data['Amount'] > data['Amount'].quantile(0.99)
        data['BalanceDiff'] = data['SenderBalance'] - data['ReceiverBalance']
        
        return data

X_engineered = feature_engineering(X)

In [84]:
X_engineered.head()

Unnamed: 0,MonthlyHour,Type,Amount,SenderBalance,ReceiverBalance,Hour,Day,InWeekend,AmountCategory,IsHighAmount,NormalizedAmount,BalanceDiff
0,18,TRANSFER,185179.51,0.0,666160.16,18,0,False,Extreme Large,False,0.0,-666160.16
1,185,DEBIT,18996.03,534.0,218943.42,17,7,False,Very Large,False,35.57309,-218409.42
2,306,PAYMENT,1249.91,101240.78,0.0,18,12,True,Large,False,0.012346,101240.78
3,227,CASH_OUT,80870.17,0.0,98974.08,11,9,False,Very Large,False,0.0,-98974.08
4,34,CASH_OUT,30917.39,30917.39,339926.42,10,1,False,Very Large,False,1.0,-309009.03


## Feature Scaling

In [50]:
from sklearn.preprocessing import StandardScaler

In [131]:
scaler = StandardScaler()
#scaler.fit(X_engineered.select_dtypes(np.number))

## Transform Categorical Features

In [61]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [133]:
one_hot_enc = OneHotEncoder()
#one_hot_enc.fit(X_engineered["Type"].values.reshape((-1,1)))

In [134]:
custom_categories = [
    [False, True],  # For InWeekend
    [False, True],  # For IsHighAmount
    ['Small', 'Medium', 'Large', 'Very Large', 'Extreme Large']  # For AmountCategory
]
ord_enc = OrdinalEncoder(categories=custom_categories)
#ord_enc.fit(X_engineered[["InWeekend","IsHighAmount","AmountCategory"]])

## Build Full Pipeline

In [147]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [106]:
cleaning_pipeline = ColumnTransformer(
    transformers=[
        ("mode_imputing", mode_imputer, ["MonthlyHour", "Type"]),
        ("mean_imputing", money_imputer, ["Amount", "SenderBalance", "ReceiverBalance"]),
    ],
    remainder="passthrough"
)
cleaning_pipeline

In [113]:
cleaning_pipeline.fit_transform(X)

array([[18, 'TRANSFER', 185179.51, 0.0, 666160.16],
       [185, 'DEBIT', 18996.03, 534.0, 218943.42],
       [306, 'PAYMENT', 1249.91, 101240.78, 0.0],
       ...,
       [490, 'CASH_OUT', 1048388.34, 1048388.34, 0.0],
       [281, 'CASH_OUT', 54235.7, 54235.7, 0.0],
       [605, 'CASH_OUT', 1555182.41, 1555182.41, 76405.76]], dtype=object)

In [136]:
categorical_transforming_pipeline = ColumnTransformer(
    transformers=[
        ("one_hot_enc",one_hot_enc,["Type"]),
        ("ordinal_enc",ord_enc,["InWeekend","IsHighAmount","AmountCategory"])
    ],
    remainder="passthrough"
)
categorical_transforming_pipeline

In [140]:
transforming_pipeline = ColumnTransformer(
    transformers=[
        ("scaling",scaler,["MonthlyHour","Amount", "SenderBalance", "ReceiverBalance","Hour","Day","BalanceDiff"]),
        ("encoding",categorical_transforming_pipeline,["Type","InWeekend","AmountCategory","IsHighAmount"])
    ],
    remainder="passthrough"
)

In [150]:
pipeline = Pipeline(
    steps=[
        ("cleaning",cleaning_pipeline),
        ("feature_engineering",feature_engineering),
        ("transforming",transforming_pipeline)
    ]
)
pipeline

In [143]:
import pickle

# Save the pipeline to a file
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(transforming_pipeline, f)

In [152]:
pipeline.transform(pd.read_csv("../data/test_data.csv"))

AttributeError: 'function' object has no attribute 'transform'