In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
# Read the labelled training transactions and preview the first five rows.
train_data = pd.read_csv(filepath_or_buffer="../data/train_data.csv")
train_data.head(5)

Unnamed: 0,MonthlyHour,Type,Amount,SenderBalance,ReceiverBalance,isFraud
0,18,TRANSFER,185179.51,0.0,666160.16,0
1,185,DEBIT,18996.03,534.0,218943.42,0
2,306,PAYMENT,1249.91,101240.78,0.0,0
3,227,CASH_OUT,80870.17,0.0,98974.08,0
4,34,CASH_OUT,30917.39,30917.39,339926.42,1


In [3]:
# Separate the fraud label from the predictor columns.
y = train_data["isFraud"]
X = train_data.drop(columns=["isFraud"])

## Clean Data

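The training data is already clean, but a **cleaning pipeline** is still necessary so that any missing values in data seen later (e.g. the test set) are imputed the same way.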

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Imputer that fills missing values with the most frequent value (mode).
# It is applied to MonthlyHour and Type by `cleaning_pipeline`.
mode_imputer = SimpleImputer(strategy="most_frequent")
# Not fitted here: fitting happens when `cleaning_pipeline` is fitted.

In [6]:
# Imputer that fills missing values with the column mean.
# It is applied to Amount, SenderBalance and ReceiverBalance by `cleaning_pipeline`.
money_imputer = SimpleImputer(strategy="mean")
# Not fitted here: fitting happens when `cleaning_pipeline` is fitted.

## Feature Engineering

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Add calendar, amount-size and balance features to raw transaction rows.

    The transformer appends six derived columns to its input: ``Hour``,
    ``Day``, ``InWeekend``, ``AmountCategory``, ``IsHighAmount`` and
    ``BalanceDiff``.

    Parameters
    ----------
    high_amount_quantile : float, default=0.99
        Quantile of ``Amount`` above which a row is flagged in ``IsHighAmount``.

    Attributes
    ----------
    high_amount_threshold_ : float
        ``Amount`` value at ``high_amount_quantile``, learned by ``fit``.
        When the transformer has not been fitted, ``transform`` falls back to
        the quantile of the batch it is given.
    """

    # Column order assumed when the input is a bare array (no column labels).
    _INPUT_COLUMNS = ['MonthlyHour', 'Type', 'Amount', 'SenderBalance', 'ReceiverBalance']
    # Columns used in arithmetic below; coerced to a numeric dtype up front.
    _NUMERIC_COLUMNS = ['MonthlyHour', 'Amount', 'SenderBalance', 'ReceiverBalance']

    def __init__(self, high_amount_quantile=0.99):
        self.high_amount_quantile = high_amount_quantile

    def _to_frame(self, X):
        """Return a labelled, numerically-typed copy of ``X`` as a DataFrame."""
        if isinstance(X, pd.DataFrame):
            data = X.copy()
        else:
            data = pd.DataFrame(X)
            # Raises ValueError if the array does not have exactly these columns.
            data.columns = list(self._INPUT_COLUMNS)

        # Array input that mixes strings and numbers arrives as object dtype;
        # convert the numeric columns so quantile / cut / arithmetic behave.
        for column in self._NUMERIC_COLUMNS:
            data[column] = pd.to_numeric(data[column])
        return data

    def fit(self, X, y=None):
        """Learn the high-amount threshold from the training data.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Rows containing the columns listed in ``_INPUT_COLUMNS``.
        y : ignored

        Returns
        -------
        self
        """
        data = self._to_frame(X)
        self.high_amount_threshold_ = data['Amount'].quantile(self.high_amount_quantile)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns appended.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Rows containing the columns listed in ``_INPUT_COLUMNS``.

        Returns
        -------
        pandas.DataFrame
            The input columns followed by the derived columns.
        """
        data = self._to_frame(X)

        # Use the threshold learned in `fit` so every batch is flagged against
        # the same cut-off; only an unfitted transformer uses the batch itself.
        threshold = getattr(self, 'high_amount_threshold_', None)
        if threshold is None:
            threshold = data['Amount'].quantile(self.high_amount_quantile)

        data['Hour'] = data['MonthlyHour'] % 24
        data['Day'] = (data['MonthlyHour'] // 24) % 30
        # NOTE(review): presumably assumes day 0 of the month is a Monday - confirm.
        data['InWeekend'] = data['Day'].isin([5, 6, 12, 13, 19, 20, 26, 27])
        data['AmountCategory'] = pd.cut(
            data['Amount'],
            bins=[-1, 100, 1000, 10_000, 100_000, np.inf],
            labels=['Small', 'Medium', 'Large', 'Very Large', 'Extreme Large'],
        )
        data['IsHighAmount'] = data['Amount'] > threshold
        data['BalanceDiff'] = data['SenderBalance'] - data['ReceiverBalance']

        return data

# `dump` is otherwise only imported in a later cell; import it here so this
# cell also works on a fresh kernel (Restart & Run All).
from joblib import dump

# Create the feature-engineering step, persist it, and build the engineered
# training frame used to inspect the new columns.
feature_engineering = FeatureEngineeringTransformer()
# NOTE(review): the class is defined in this notebook, so loading this file
# elsewhere presumably needs the class definition to be available - confirm.
dump(feature_engineering,"../pipelines/feature_engineering.joblib")
X_engineered = feature_engineering.transform(X)

In [8]:
# Preview the original columns together with the engineered ones.
X_engineered.head()

Unnamed: 0,MonthlyHour,Type,Amount,SenderBalance,ReceiverBalance,Hour,Day,InWeekend,AmountCategory,IsHighAmount,BalanceDiff
0,18,TRANSFER,185179.51,0.0,666160.16,18,0,False,Extreme Large,False,-666160.16
1,185,DEBIT,18996.03,534.0,218943.42,17,7,False,Very Large,False,-218409.42
2,306,PAYMENT,1249.91,101240.78,0.0,18,12,True,Large,False,101240.78
3,227,CASH_OUT,80870.17,0.0,98974.08,11,9,False,Very Large,False,-98974.08
4,34,CASH_OUT,30917.39,30917.39,339926.42,10,1,False,Very Large,False,-309009.03


## Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Standard scaler for the numeric columns.
# Not fitted here: fitting happens when `transforming_pipeline` is fitted.
scaler = StandardScaler()

## Transform Categorical Features

In [11]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [12]:
# One-hot encoder for the nominal `Type` column (wired up in
# `categorical_transforming_pipeline`); fitted as part of that pipeline.
# NOTE(review): default settings are used - confirm how `Type` values not
# seen during fitting should be handled at inference time.
one_hot_enc = OneHotEncoder()

In [13]:
# Explicit level ordering for each ordinally-encoded column. The insertion
# order of this mapping must match the column order handed to the encoder.
ordinal_levels = {
    "InWeekend": [False, True],
    "IsHighAmount": [False, True],
    "AmountCategory": ['Small', 'Medium', 'Large', 'Very Large', 'Extreme Large'],
}
custom_categories = list(ordinal_levels.values())

# Fitted later as part of `categorical_transforming_pipeline`.
ord_enc = OrdinalEncoder(categories=custom_categories)

## Build Full Pipeline

In [58]:
from joblib import dump
import cloudpickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [31]:
# Column-wise imputation: mode for MonthlyHour / Type, mean for the money
# columns. The transformer order fixes the output column order
# (MonthlyHour, Type, Amount, SenderBalance, ReceiverBalance), which is the
# order FeatureEngineeringTransformer assumes for array input.
cleaning_pipeline = ColumnTransformer(
    transformers=[
        ("mode_imputing", mode_imputer, ["MonthlyHour", "Type"]),
        ("mean_imputing", money_imputer, ["Amount", "SenderBalance", "ReceiverBalance"]),
    ],
    remainder="passthrough"
)
# Persist the cleaning step itself under its own file name.
dump(cleaning_pipeline, '../pipelines/cleaning_pipeline.joblib')

cleaning_pipeline

In [16]:
# Fit the imputers on the training features and display the cleaned array.
cleaning_pipeline.fit_transform(X)

array([[18, 'TRANSFER', 185179.51, 0.0, 666160.16],
       [185, 'DEBIT', 18996.03, 534.0, 218943.42],
       [306, 'PAYMENT', 1249.91, 101240.78, 0.0],
       ...,
       [490, 'CASH_OUT', 1048388.34, 1048388.34, 0.0],
       [281, 'CASH_OUT', 54235.7, 54235.7, 0.0],
       [605, 'CASH_OUT', 1555182.41, 1555182.41, 76405.76]], dtype=object)

In [32]:
# Categorical encoding stage: one-hot for `Type`, ordinal for the three
# engineered categorical columns; any other column is passed through.
categorical_encoders = [
    ("one_hot_enc", one_hot_enc, ["Type"]),
    ("ordinal_enc", ord_enc, ["InWeekend", "IsHighAmount", "AmountCategory"]),
]
categorical_transforming_pipeline = ColumnTransformer(
    categorical_encoders,
    remainder="passthrough",
)

# Persist the stage, then display it.
dump(categorical_transforming_pipeline, "../pipelines/categorical_transforming_pipeline.joblib")
categorical_transforming_pipeline

In [34]:
# Final preprocessing stage: scale the numeric columns and encode the
# categorical ones; any other column is passed through.
numeric_columns = ["MonthlyHour", "Amount", "SenderBalance", "ReceiverBalance", "Hour", "Day", "BalanceDiff"]
categorical_columns = ["Type", "InWeekend", "AmountCategory", "IsHighAmount"]

transforming_pipeline = ColumnTransformer(
    [
        ("scaling", scaler, numeric_columns),
        ("encoding", categorical_transforming_pipeline, categorical_columns),
    ],
    remainder="passthrough",
)

# Persist the stage, then display it.
dump(transforming_pipeline, "../pipelines/transforming_pipeline.joblib")
transforming_pipeline

In [36]:
# Chain the three stages end to end: impute -> engineer features -> scale/encode.
pipeline_steps = [
    ("cleaning", cleaning_pipeline),
    ("feature_engineering", feature_engineering),
    ("transforming", transforming_pipeline),
]
pipeline = Pipeline(pipeline_steps)

# Fit every stage on the training features and display the transformed result.
pipeline.fit_transform(X)

['../pipelines/pipeline.joblib']

In [57]:
# Serialise the full pipeline object to disk with cloudpickle.
with open("../pipelines/pipeline.pkl", mode="wb") as pickle_file:
    cloudpickle.dump(pipeline, pickle_file)

In [53]:
# Sanity check: run the pipeline on the test file and display only the
# shape of the transformed output.
pipeline.transform(pd.read_csv("../data/test_data.csv")).shape

(8400, 15)

In [50]:
# Build the processed training frame: transformed features plus the label,
# attached as a trailing `target` column.
train_data_processed = pd.DataFrame(pipeline.transform(X)).assign(target=y)
train_data_processed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,target
0,-1.481411,-0.262831,-0.335514,-0.123297,0.683058,-1.492419,-0.094636,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0
1,-0.536103,-0.369026,-0.335335,-0.230177,0.505376,-0.522977,-0.006963,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0
2,0.148821,-0.380366,-0.301520,-0.282502,0.683058,0.169482,0.055627,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0
3,-0.298361,-0.329487,-0.335514,-0.258848,-0.560711,-0.245993,0.016423,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0
4,-1.390843,-0.361408,-0.325133,-0.201263,-0.738393,-1.353927,-0.024703,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75595,-0.173829,-0.381017,-0.243724,-0.282502,-0.916074,-0.107501,0.089331,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
75596,-0.247416,0.512344,-0.335514,0.215003,1.038420,-0.245993,-0.371810,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0
75597,1.190358,0.288776,0.016508,-0.282502,-0.738393,1.277416,0.241084,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1
75598,0.007307,-0.346507,-0.317303,-0.282502,0.505376,0.030990,0.046423,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1


In [51]:
# Write the processed training frame to disk.
# NOTE(review): no `index` argument is passed, so the row index is presumably
# written as an extra unnamed first column - confirm downstream readers expect it.
train_data_processed.to_csv("../data/train_processed_data.csv")