<a href="https://colab.research.google.com/github/Enam88/Fraud_detection/blob/main/Fraud_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
# Mount Google Drive into the Colab runtime at /content/drive/ (Colab-only import).
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd /content/drive/My Drive/Colab Notebooks

In [None]:
# Most of the raw features we had don't seem to correlate so well with fraud vs not fraud

# [Feature Engineering]
# The time difference could be useful
# The country information could also be useful

In [60]:
# prepare_data and cost_func come from a local `utils` module that is not shown in this notebook.
from utils import prepare_data, cost_func
# The result is unpacked into four objects; the names suggest train/test features
# and labels — TODO confirm the split logic inside utils.prepare_data.
X_train, X_test, y_train, y_test = prepare_data()

In [61]:
# Baseline: a classifier that ignores the features and always predicts class 1.
from sklearn.dummy import DummyClassifier

model_naive = DummyClassifier(
    strategy="constant",
    constant=1,
).fit(X_train, y_train)


In [None]:
# What is our problem?
# Well, we want to detect fraud in commercial transactions
# So we can either say something is Fraud or not Fraud

In [None]:
# We can say that it's really important that we identify all instances of fraud
# So in general, it's OK if we occasionally flag a legitimate transaction as fraudulent
# If we treat a fraudulent transaction as the positive class (+)
# Then what do we care about more? Precision or Recall?
# Given our problem statement, we want high recall

In [62]:
from utils import evaluate_model

In [63]:
# Baseline metrics on the training split (recorded output: recall 1, F-1 ≈ 0.17).
evaluate_model(model_naive, X_train, y_train, "Training")

Training recall: 1
Training F-1 score: 0.172046


In [64]:
# Baseline metrics on the test split (recorded output: recall 1, F-1 ≈ 0.17).
evaluate_model(model_naive, X_test, y_test, "Testing")

Testing recall: 1
Testing F-1 score: 0.168081


In [65]:
# Total baseline cost divided by the number of training rows (cost per transaction).
# NOTE(review): a model that flags every transaction has no false negatives, so if
# utils.cost_func follows the "7 * FP + purchase value of each FN" formula the
# per-row figure cannot exceed 7. The recorded ~43.3 suggests purchase_value is
# being summed over every row — verify utils.cost_func.
cost_func(model_naive, X_train, y_train) / X_train.shape[0]

43.309110010009185

In [66]:
# Total baseline cost divided by the number of test rows (cost per transaction).
# NOTE(review): a model that flags every transaction has no false negatives, so if
# utils.cost_func follows the "7 * FP + purchase value of each FN" formula the
# per-row figure cannot exceed 7. The recorded ~43.2 suggests purchase_value is
# being summed over every row — verify utils.cost_func.
cost_func(model_naive, X_test, y_test) / X_test.shape[0]

43.16282301558416

In [None]:
# our first model will be built using the features:
# purchase_value, age, source, browser, and sex

## First Serious Model

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# We have numerical and categorical data

In [None]:
# Numerical features are standardised (zero mean, unit variance).
pipe_num = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical features are one-hot encoded.
pip_cat = Pipeline([
    ("encoder", OneHotEncoder()),
])

# First serious model: logistic regression on purchase_value, age, source,
# browser and sex. It is defined here so that the fit/evaluate cells that follow
# run on a fresh kernel instead of relying on a `model` created further down the
# notebook. class_weight="balanced" mirrors the later pipelines in this notebook.
model = Pipeline([
    ("selector", ColumnTransformer([
        ("numerical", pipe_num, ["age", "purchase_value"]),
        ("categorical", pip_cat, ["sex", "browser", "source"]),
    ])),
    ("classifier", LogisticRegression(class_weight="balanced")),
])

In [None]:
model.fit(X_train, y_train)

In [None]:
evaluate_model(model, X_train, y_train, "Training")

In [None]:
evaluate_model(model, X_test, y_test, "Testing")

In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

In [None]:
cost_func(model, X_test, y_test) / X_test.shape[0]

## Feature Engineering

In [None]:
# Repeat customer/device
# Time diff
# Country

## Time Difference

In [None]:
X_train.head()

In [None]:
import pandas as pd

# Seconds between signup and purchase, one value per training row.
(
    pd.to_datetime(X_train['purchase_time'])
    .sub(pd.to_datetime(X_train['signup_time']))
    .dt.total_seconds()
)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TimeDiff(BaseEstimator, TransformerMixin):
    """Stateless transformer: seconds between two timestamp columns.

    Parameters
    ----------
    col_1 : name of the timestamp column that is subtracted.
    col_2 : name of the timestamp column it is subtracted from.
    fmt : optional format string forwarded to ``pd.to_datetime``.
    """

    def __init__(self, col_1, col_2, fmt=None):
        self.col_1, self.col_2, self.fmt = col_1, col_2, fmt

    def _convert_to_datetime(self, series):
        """Parse ``series`` into datetimes using the configured format."""
        return pd.to_datetime(series, format=self.fmt)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``col_2 - col_1`` in seconds as an ``(n_rows, 1)`` array."""
        start, end = (
            self._convert_to_datetime(X[name])
            for name in (self.col_1, self.col_2)
        )
        elapsed = end - start
        return elapsed.dt.total_seconds().values.reshape(-1, 1)

In [None]:
TimeDiff('signup_time', 'purchase_time').fit_transform(X_train)

In [None]:
from custom_estimator import IdentifyRepeats

In [None]:
# Column-wise preprocessing: scaled numerics, one-hot categoricals, the
# repeated-device flag, and the signup-to-purchase time difference.
selector = ColumnTransformer(
    transformers=[
        ("numerical", pipe_num, ["age", "purchase_value"]),
        ("categorical", pip_cat, ["sex", "browser", "source"]),
        ("repeated_devices", IdentifyRepeats(), "device_id"),
        (
            "time_diff",
            TimeDiff("signup_time", "purchase_time"),
            ["signup_time", "purchase_time"],
        ),
    ]
)

# The assembled feature matrix is standardised once more before the classifier.
# NOTE(review): the step name "Sclaer" looks like a misspelling of "Scaler"; it is
# kept unchanged because renaming a step changes the pipeline's parameter keys.
model = Pipeline(
    steps=[
        ("selector", selector),
        ("Sclaer", StandardScaler()),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
model.fit(X_train, y_train)

In [None]:
evaluate_model(model, X_train, y_train, "Training")

In [None]:
evaluate_model(model, X_test, y_test, "Testing")

In [None]:
cost_func(model,X_train, y_train) / X_train.shape[0]

In [None]:
cost_func(model,X_test, y_test) / X_test.shape[0]

In [None]:
X_train['device_id'].nunique()

In [None]:
X_train.shape

In [None]:
# Device ids that occur more than once in the training data.
# The counts are filtered directly rather than via .to_frame().query("device_id == True"),
# which depends on how value_counts() names its result: since pandas 2.0 the result
# is named "count" and "device_id" is the index name, so that query would compare
# the ids themselves with True and return an empty set.
repeated_device = set(
    X_train['device_id'].value_counts().loc[lambda counts: counts > 1].index
)

In [None]:
X_train['device_id'].isin(repeated_device).sum()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class IdentifyRepeats(BaseEstimator, TransformerMixin):
    """Flag values that occurred more than once in the data passed to ``fit``.

    Works on a single column supplied as a pandas Series (e.g. ``device_id``).
    The set of repeated values is learned in ``fit`` and reused in ``transform``,
    so unseen data is flagged against the repeats found in the fitting data.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Store the set of values that appear more than once in ``X``.

        Parameters
        ----------
        X : pandas Series of identifiers.
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Filter the counts directly. Going through .to_frame().query("device_id == True")
        # only works when the Series is named "device_id" and value_counts() keeps that
        # name for its result, which is no longer the case from pandas 2.0 onwards.
        counts = X.value_counts()
        self.repeated_devices = set(counts[counts > 1].index)

        return self

    def transform(self, X):
        """Return an ``(n_rows, 1)`` boolean array: True where the value is a known repeat.

        ``X`` is expected to be a pandas Series.
        """
        return X.isin(self.repeated_devices).values.reshape(-1, 1)

In [None]:
IdentifyRepeats().fit_transform(X_train['device_id']).sum()

In [None]:
IdentifyRepeats().fit(X_train['device_id']).transform(X_test['device_id'])

In [None]:
# Logistic regression on scaled numerics, one-hot categoricals and the
# repeated-device flag (no time-difference feature in this variant).
model = Pipeline(
    steps=[
        (
            "selector",
            ColumnTransformer(
                transformers=[
                    ("numerical", pipe_num, ["age", "purchase_value"]),
                    ("categorical", pip_cat, ["sex", "browser", "source"]),
                    ("repeated_devices", IdentifyRepeats(), "device_id"),
                ]
            ),
        ),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
model.fit(X_train, y_train)

In [None]:
evaluate_model(model, X_train, y_train, "Training")

In [None]:
evaluate_model(model, X_test, y_test, "Testing")

In [None]:
y_pred = model.predict(X_train)

In [None]:
y_pred & ~y_train

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
# Purchase values of the false negatives only (predicted non-fraud, actually fraud).
# The boolean mask itself is passed to .loc: taking `.index` of the mask would
# return the labels of every row, whatever the mask values are, and select them all.
# astype(bool) keeps the mask valid when the labels are stored as 0/1 integers.
X_train.loc[(~y_pred & y_train).astype(bool), "purchase_value"]

In [None]:
def cost_func(model, X, y_true, fp_cost=7):
    """
    Return the total cost of a model's mistakes on ``X``.

    Cost = fp_cost * (number of false positives)
           + sum of ``purchase_value`` over the false negatives

    Parameters
    ----------
    model : fitted estimator whose ``predict(X)`` returns 0/1 or boolean labels.
    X : pandas DataFrame with a ``purchase_value`` column, one row per label.
    y_true : true labels (1/True = fraud), in the same row order as ``X``.
    fp_cost : cost charged for each false positive (default 7).

    Returns
    -------
    Numeric total cost over all rows of ``X``.
    """
    # Work on plain boolean arrays: `~` applied to 0/1 integers is a bitwise NOT
    # (giving -1/-2), which is not a usable row mask.
    y_pred = np.asarray(model.predict(X)).astype(bool)
    y_actual = np.asarray(y_true).astype(bool)

    # FP: predicted fraudulent while truly NOT fraudulent.
    false_positives = (y_pred & ~y_actual).sum()

    # FN: predicted NOT fraudulent while truly fraudulent. Index with the mask
    # itself; `mask.index` would select every row and add up all purchases.
    missed_fraud = ~y_pred & y_actual
    fn_cost = X.loc[missed_fraud, "purchase_value"].sum()

    return fp_cost * false_positives + fn_cost



In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

In [None]:
cost_func(model_naive, X_train, y_train) / X_train.shape[0]

In [None]:
cost_func(model, X_test, y_test) / X_test.shape[0]

In [None]:
cost_func(model_naive, X_test, y_test) / X_test.shape[0]